"src/sdk/vscode:/vscode.git/clone" did not exist on "5623dbf32952bd7842d3d9c5cf12a8e97e2b1fab"
Commit 2f268bc2 authored by Paul's avatar Paul
Browse files

Merge branch 'develop' into mlir-c

parents f75c5a38 aa7ff911
...@@ -176,7 +176,13 @@ template <index_int N, class T, class... Ts> ...@@ -176,7 +176,13 @@ template <index_int N, class T, class... Ts>
auto hip_vec_visit_all(T&& x, Ts&&... xs) auto hip_vec_visit_all(T&& x, Ts&&... xs)
{ {
return [&](auto f) { return [&](auto f) {
hip_visit_all_impl(get_shape(x), auto sx = get_shape(x);
auto lens = sx.lens();
assert(lens.back() % N == 0);
assert(sx.strides().back() == 1);
lens.back() /= N;
shape vec_sx{sx.type(), lens};
hip_visit_all_impl(vec_sx,
make_hip_convert([](auto* p) { return as_vec<N>(device_cast(p)); }), make_hip_convert([](auto* p) { return as_vec<N>(device_cast(p)); }),
f, f,
x, x,
......
...@@ -17,8 +17,10 @@ struct run_op : action<run_op> ...@@ -17,8 +17,10 @@ struct run_op : action<run_op>
auto name = v.at("name").to<std::string>(); auto name = v.at("name").to<std::string>();
if(not contains(name, "::")) if(not contains(name, "::"))
name = "gpu::" + name; name = "gpu::" + name;
auto op = make_op(name); auto op = make_op(name);
double t = time_op(ctx, op, inputs); if(v.contains("fields"))
op.from_value(v.at("fields"));
double t = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
std::cout << op << ": " << t << "ms" << std::endl; std::cout << op << ": " << t << "ms" << std::endl;
} }
}; };
......
...@@ -11,11 +11,11 @@ namespace migraphx { ...@@ -11,11 +11,11 @@ namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS { inline namespace MIGRAPHX_INLINE_NS {
namespace gpu { namespace gpu {
void eliminate_workspace::apply(module& p) const void eliminate_workspace::apply(module& m) const
{ {
std::size_t n = 0; std::size_t n = 0;
std::vector<instruction_ref> allocs; std::vector<instruction_ref> allocs;
for(auto ins : iterator_for(p)) for(auto ins : iterator_for(m))
{ {
if(ins->outputs().size() != 1) if(ins->outputs().size() != 1)
continue; continue;
...@@ -30,11 +30,11 @@ void eliminate_workspace::apply(module& p) const ...@@ -30,11 +30,11 @@ void eliminate_workspace::apply(module& p) const
} }
if(n > 0) if(n > 0)
{ {
auto ws = p.add_parameter("workspace", shape{shape::int8_type, {n}}); auto ws = m.add_parameter("workspace", shape{shape::int8_type, {n}});
for(auto&& a : allocs) for(auto&& a : allocs)
{ {
p.replace_instruction(a, ws); m.replace_instruction(a, ws);
p.remove_instruction(a); m.remove_instruction(a);
} }
} }
} }
......
...@@ -317,7 +317,7 @@ struct find_layernorm ...@@ -317,7 +317,7 @@ struct find_layernorm
{ {
auto matcher() const { return match::layernorm(&gpu_name); } auto matcher() const { return match::layernorm(&gpu_name); }
void apply(module& p, match::matcher_result r) const void apply(module& m, const match::matcher_result& r) const
{ {
auto ins = r.result; auto ins = r.result;
auto x_ins = r.instructions["x"]; auto x_ins = r.instructions["x"];
...@@ -332,7 +332,7 @@ struct find_layernorm ...@@ -332,7 +332,7 @@ struct find_layernorm
if(relements > 1024 or (relements % 4 != 0 and relements > 256)) if(relements > 1024 or (relements % 4 != 0 and relements > 256))
return; return;
p.replace_instruction(ins, hip_layernorm{}, x_ins, args.back()); m.replace_instruction(ins, hip_layernorm{}, x_ins, args.back());
} }
}; };
...@@ -344,11 +344,11 @@ struct find_triadd_layernorm ...@@ -344,11 +344,11 @@ struct find_triadd_layernorm
match::used_once(), match::all_of[match::inputs()](match::standard_shape())))); match::used_once(), match::all_of[match::inputs()](match::standard_shape()))));
} }
void apply(module& p, const match::matcher_result& r) const void apply(module& m, const match::matcher_result& r) const
{ {
auto ins = r.result; auto ins = r.result;
auto triadd = ins->inputs().front(); auto triadd = ins->inputs().front();
p.replace_instruction(ins, hip_triadd_layernorm{}, triadd->inputs()); m.replace_instruction(ins, hip_triadd_layernorm{}, triadd->inputs());
} }
}; };
...@@ -356,13 +356,13 @@ struct find_gelu ...@@ -356,13 +356,13 @@ struct find_gelu
{ {
auto matcher() const { return match::gelu_erf(&gpu_name); } auto matcher() const { return match::gelu_erf(&gpu_name); }
void apply(module& p, match::matcher_result r) const void apply(module& m, const match::matcher_result& r) const
{ {
auto ins = r.result; auto ins = r.result;
auto x_ins = r.instructions["x"]; auto x_ins = r.instructions["x"];
auto args = ins->inputs(); auto args = ins->inputs();
p.replace_instruction(ins, hip_gelu{}, x_ins, args.back()); m.replace_instruction(ins, hip_gelu{}, x_ins, args.back());
} }
}; };
...@@ -373,7 +373,7 @@ struct find_add_gelu ...@@ -373,7 +373,7 @@ struct find_add_gelu
return match::name("gpu::gelu")(match::arg(0)(match::name("gpu::add").bind("add"))); return match::name("gpu::gelu")(match::arg(0)(match::name("gpu::add").bind("add")));
} }
void apply(module& p, match::matcher_result r) const void apply(module& m, const match::matcher_result& r) const
{ {
auto add_ins = r.instructions["add"]; auto add_ins = r.instructions["add"];
auto ins = r.result; auto ins = r.result;
...@@ -382,7 +382,7 @@ struct find_add_gelu ...@@ -382,7 +382,7 @@ struct find_add_gelu
move_broadcasted_back(args); move_broadcasted_back(args);
args.back() = ins->inputs().back(); args.back() = ins->inputs().back();
p.replace_instruction(ins, hip_add_gelu{}, args); m.replace_instruction(ins, hip_add_gelu{}, args);
} }
}; };
...@@ -392,16 +392,16 @@ struct find_gelu_new ...@@ -392,16 +392,16 @@ struct find_gelu_new
auto matcher() const { return match::gelu_tanh(&gpu_name); } auto matcher() const { return match::gelu_tanh(&gpu_name); }
void apply(module& p, match::matcher_result r) const void apply(module& m, const match::matcher_result& r) const
{ {
auto ins = r.result; auto ins = r.result;
auto x_ins = r.instructions["x"]; auto x_ins = r.instructions["x"];
auto args = ins->inputs(); auto args = ins->inputs();
if(fast_math) if(fast_math)
p.replace_instruction(ins, hip_gelu{}, x_ins, args.back()); m.replace_instruction(ins, hip_gelu{}, x_ins, args.back());
else else
p.replace_instruction(ins, hip_gelu_new{}, x_ins, args.back()); m.replace_instruction(ins, hip_gelu_new{}, x_ins, args.back());
} }
}; };
...@@ -412,7 +412,7 @@ struct find_add_gelu_new ...@@ -412,7 +412,7 @@ struct find_add_gelu_new
return match::name("gpu::gelu_new")(match::arg(0)(match::name("gpu::add").bind("add"))); return match::name("gpu::gelu_new")(match::arg(0)(match::name("gpu::add").bind("add")));
} }
void apply(module& p, match::matcher_result r) const void apply(module& m, const match::matcher_result& r) const
{ {
auto add_ins = r.instructions["add"]; auto add_ins = r.instructions["add"];
auto ins = r.result; auto ins = r.result;
...@@ -421,7 +421,7 @@ struct find_add_gelu_new ...@@ -421,7 +421,7 @@ struct find_add_gelu_new
move_broadcasted_back(args); move_broadcasted_back(args);
args.back() = ins->inputs().back(); args.back() = ins->inputs().back();
p.replace_instruction(ins, hip_add_gelu_new{}, args); m.replace_instruction(ins, hip_add_gelu_new{}, args);
} }
}; };
...@@ -436,7 +436,7 @@ struct find_add_clip ...@@ -436,7 +436,7 @@ struct find_add_clip
.bind("add"))); .bind("add")));
} }
void apply(module& p, match::matcher_result r) const void apply(module& m, const match::matcher_result& r) const
{ {
auto add_ins = r.instructions["add"]; auto add_ins = r.instructions["add"];
auto ins = r.result; auto ins = r.result;
...@@ -449,9 +449,9 @@ struct find_add_clip ...@@ -449,9 +449,9 @@ struct find_add_clip
add_args.pop_back(); add_args.pop_back();
add_args.insert(add_args.end(), std::next(ins_args.begin()), ins_args.end()); add_args.insert(add_args.end(), std::next(ins_args.begin()), ins_args.end());
if(add_ins->name() == "gpu::add") if(add_ins->name() == "gpu::add")
p.replace_instruction(ins, hip_add_clip{}, add_args); m.replace_instruction(ins, hip_add_clip{}, add_args);
else if(add_ins->name() == "gpu::triadd") else if(add_ins->name() == "gpu::triadd")
p.replace_instruction(ins, hip_triadd_clip{}, add_args); m.replace_instruction(ins, hip_triadd_clip{}, add_args);
} }
}; };
...@@ -471,7 +471,7 @@ struct find_add_unary ...@@ -471,7 +471,7 @@ struct find_add_unary
.bind("add"))); .bind("add")));
} }
void apply(module& p, match::matcher_result r) const void apply(module& m, const match::matcher_result& r) const
{ {
auto add_ins = r.instructions["add"]; auto add_ins = r.instructions["add"];
auto ins = r.result; auto ins = r.result;
...@@ -482,9 +482,9 @@ struct find_add_unary ...@@ -482,9 +482,9 @@ struct find_add_unary
// Use the allocation from the relu operator // Use the allocation from the relu operator
args.back() = ins->inputs().back(); args.back() = ins->inputs().back();
if(add_ins->name() == "gpu::add") if(add_ins->name() == "gpu::add")
p.replace_instruction(ins, binary_add_op, args); m.replace_instruction(ins, binary_add_op, args);
else if(add_ins->name() == "gpu::triadd") else if(add_ins->name() == "gpu::triadd")
p.replace_instruction(ins, ternary_add_op, args); m.replace_instruction(ins, ternary_add_op, args);
} }
}; };
...@@ -499,7 +499,7 @@ struct find_triadd ...@@ -499,7 +499,7 @@ struct find_triadd
.bind("input"))); .bind("input")));
} }
void apply(module& p, match::matcher_result r) const void apply(module& m, const match::matcher_result& r) const
{ {
auto add_ins = r.instructions["add"]; auto add_ins = r.instructions["add"];
auto input_ins = r.instructions["input"]; auto input_ins = r.instructions["input"];
...@@ -514,7 +514,7 @@ struct find_triadd ...@@ -514,7 +514,7 @@ struct find_triadd
move_broadcasted_back(args); move_broadcasted_back(args);
args.back() = ins->inputs().back(); args.back() = ins->inputs().back();
p.replace_instruction(ins, hip_triadd{}, args); m.replace_instruction(ins, hip_triadd{}, args);
} }
}; };
...@@ -526,7 +526,7 @@ struct find_mul_add ...@@ -526,7 +526,7 @@ struct find_mul_add
match::name("gpu::mul")(match::used_once()).bind("mul"), match::any().bind("b"))); match::name("gpu::mul")(match::used_once()).bind("mul"), match::any().bind("b")));
} }
void apply(module& p, match::matcher_result r) const void apply(module& m, const match::matcher_result& r) const
{ {
auto mul_ins = r.instructions["mul"]; auto mul_ins = r.instructions["mul"];
auto b_ins = r.instructions["b"]; auto b_ins = r.instructions["b"];
...@@ -539,7 +539,7 @@ struct find_mul_add ...@@ -539,7 +539,7 @@ struct find_mul_add
args.insert(std::prev(args.end()), b_ins); args.insert(std::prev(args.end()), b_ins);
args.back() = ins->inputs().back(); args.back() = ins->inputs().back();
p.replace_instruction(ins, hip_mul_add{}, args); m.replace_instruction(ins, hip_mul_add{}, args);
} }
}; };
...@@ -551,7 +551,7 @@ struct find_mul_add_relu ...@@ -551,7 +551,7 @@ struct find_mul_add_relu
match::arg(0)(match::name("gpu::mul_add")(match::used_once()).bind("mul_add"))); match::arg(0)(match::name("gpu::mul_add")(match::used_once()).bind("mul_add")));
} }
void apply(module& p, match::matcher_result r) const void apply(module& m, const match::matcher_result& r) const
{ {
auto mul_add_ins = r.instructions["mul_add"]; auto mul_add_ins = r.instructions["mul_add"];
auto ins = r.result; auto ins = r.result;
...@@ -559,7 +559,7 @@ struct find_mul_add_relu ...@@ -559,7 +559,7 @@ struct find_mul_add_relu
// Use the allocation from the relu operator // Use the allocation from the relu operator
args.back() = ins->inputs().back(); args.back() = ins->inputs().back();
p.replace_instruction(ins, hip_mul_add_relu{}, args); m.replace_instruction(ins, hip_mul_add_relu{}, args);
} }
}; };
...@@ -682,7 +682,7 @@ struct miopen_fusion ...@@ -682,7 +682,7 @@ struct miopen_fusion
struct miopen_conv_bias struct miopen_conv_bias
{ {
op::convolution op; op::convolution op;
fusion f = {}; fusion fp = {};
fusion::op_t conv = {}; fusion::op_t conv = {};
fusion::op_t bias = {}; fusion::op_t bias = {};
...@@ -706,19 +706,19 @@ struct miopen_conv_bias ...@@ -706,19 +706,19 @@ struct miopen_conv_bias
float beta = 0; float beta = 0;
miopenSetOpArgsConvForward(fargs.get(), conv, &alpha, &beta, args[1].implicit()); miopenSetOpArgsConvForward(fargs.get(), conv, &alpha, &beta, args[1].implicit());
miopenSetOpArgsBiasForward(fargs.get(), bias, &alpha, &beta, args[3].implicit()); miopenSetOpArgsBiasForward(fargs.get(), bias, &alpha, &beta, args[3].implicit());
return f.execute(ctx, fargs, args[0], args[4]); return fp.execute(ctx, fargs, args[0], args[4]);
} }
void finalize(context& ctx, const shape&, const std::vector<shape>& inputs) void finalize(context& ctx, const shape&, const std::vector<shape>& inputs)
{ {
f = fusion(inputs[0]); fp = fusion(inputs[0]);
conv = f.create_conv(op, inputs[1]); conv = fp.create_conv(op, inputs[1]);
bias = f.create_bias(inputs[3]); bias = fp.create_bias(inputs[3]);
if(not f.compile(ctx)) if(not fp.compile(ctx))
MIGRAPHX_THROW("Failed to compile fusion plan"); MIGRAPHX_THROW("Failed to compile fusion plan");
} }
shape get_workspace(context& ctx) { return f.get_workspace(ctx); } shape get_workspace(context& ctx) { return fp.get_workspace(ctx); }
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{ {
return shapes.size() - 1; return shapes.size() - 1;
...@@ -729,7 +729,7 @@ MIGRAPHX_REGISTER_OP(miopen_conv_bias) ...@@ -729,7 +729,7 @@ MIGRAPHX_REGISTER_OP(miopen_conv_bias)
struct miopen_conv_bias_relu struct miopen_conv_bias_relu
{ {
op::convolution op; op::convolution op;
fusion f = {}; fusion fp = {};
fusion::op_t conv = {}; fusion::op_t conv = {};
fusion::op_t bias = {}; fusion::op_t bias = {};
fusion::op_t relu = {}; fusion::op_t relu = {};
...@@ -755,18 +755,18 @@ struct miopen_conv_bias_relu ...@@ -755,18 +755,18 @@ struct miopen_conv_bias_relu
miopenSetOpArgsConvForward(fargs.get(), conv, &alpha, &beta, args[1].implicit()); miopenSetOpArgsConvForward(fargs.get(), conv, &alpha, &beta, args[1].implicit());
miopenSetOpArgsBiasForward(fargs.get(), bias, &alpha, &beta, args[3].implicit()); miopenSetOpArgsBiasForward(fargs.get(), bias, &alpha, &beta, args[3].implicit());
miopenSetOpArgsActivForward(fargs.get(), relu, &alpha, &beta, 0, 0, 0); miopenSetOpArgsActivForward(fargs.get(), relu, &alpha, &beta, 0, 0, 0);
return f.execute(ctx, fargs, args[0], args[4]); return fp.execute(ctx, fargs, args[0], args[4]);
} }
void finalize(context& ctx, const shape&, const std::vector<shape>& inputs) void finalize(context& ctx, const shape&, const std::vector<shape>& inputs)
{ {
f = fusion(inputs[0]); fp = fusion(inputs[0]);
conv = f.create_conv(op, inputs[1]); conv = fp.create_conv(op, inputs[1]);
bias = f.create_bias(inputs[3]); bias = fp.create_bias(inputs[3]);
relu = f.create_relu(); relu = fp.create_relu();
f.compile(ctx); fp.compile(ctx);
} }
shape get_workspace(context& ctx) { return f.get_workspace(ctx); } shape get_workspace(context& ctx) { return fp.get_workspace(ctx); }
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{ {
return shapes.size() - 1; return shapes.size() - 1;
...@@ -784,7 +784,7 @@ auto conv_bias(Ms... ms) ...@@ -784,7 +784,7 @@ auto conv_bias(Ms... ms)
} }
template <class Op> template <class Op>
void apply_conv_bias(context& ctx, module& p, match::matcher_result r) void apply_conv_bias(context& ctx, module& m, const match::matcher_result& r)
{ {
auto conv_ins = r.instructions["conv"]; auto conv_ins = r.instructions["conv"];
auto bias_ins = r.instructions["bias"]; auto bias_ins = r.instructions["bias"];
...@@ -799,7 +799,7 @@ void apply_conv_bias(context& ctx, module& p, match::matcher_result r) ...@@ -799,7 +799,7 @@ void apply_conv_bias(context& ctx, module& p, match::matcher_result r)
// TODO: Insert ws allocation // TODO: Insert ws allocation
auto ws = cb.get_workspace(ctx); auto ws = cb.get_workspace(ctx);
(void)ws; (void)ws;
p.replace_instruction(ins, cb, input_ins, weights_ins, old_ws_ins, bias_ins, alloc_ins); m.replace_instruction(ins, cb, input_ins, weights_ins, old_ws_ins, bias_ins, alloc_ins);
} }
inline auto precompile_name(std::string s) // NOLINT inline auto precompile_name(std::string s) // NOLINT
...@@ -830,9 +830,9 @@ struct find_conv_bias ...@@ -830,9 +830,9 @@ struct find_conv_bias
match::output(match::name(std::unordered_set<std::string>{"gpu::relu"})))); match::output(match::name(std::unordered_set<std::string>{"gpu::relu"}))));
} }
void apply(module& p, match::matcher_result r) const void apply(module& m, const match::matcher_result& r) const
{ {
apply_conv_bias<miopen_conv_bias>(*ctx, p, std::move(r)); apply_conv_bias<miopen_conv_bias>(*ctx, m, r);
} }
}; };
...@@ -841,9 +841,9 @@ struct find_conv_bias_relu ...@@ -841,9 +841,9 @@ struct find_conv_bias_relu
context* ctx = nullptr; context* ctx = nullptr;
auto matcher() const { return match::name("gpu::relu")(match::arg(0)(conv_bias())); } auto matcher() const { return match::name("gpu::relu")(match::arg(0)(conv_bias())); }
void apply(module& p, match::matcher_result r) const void apply(module& m, const match::matcher_result& r) const
{ {
apply_conv_bias<miopen_conv_bias_relu>(*ctx, p, std::move(r)); apply_conv_bias<miopen_conv_bias_relu>(*ctx, m, r);
} }
}; };
...@@ -858,7 +858,7 @@ struct find_conv_pointwise ...@@ -858,7 +858,7 @@ struct find_conv_pointwise
fusable_conv(match::used_once()).bind("conv"))); fusable_conv(match::used_once()).bind("conv")));
} }
void apply(module& m, match::matcher_result r) const void apply(module& m, const match::matcher_result& r) const
{ {
auto conv_ins = r.instructions["conv"]; auto conv_ins = r.instructions["conv"];
auto bias_ins = r.instructions["bias"]; auto bias_ins = r.instructions["bias"];
...@@ -876,7 +876,6 @@ struct find_conv_pointwise ...@@ -876,7 +876,6 @@ struct find_conv_pointwise
{ {
if(i.name()[0] == '@') if(i.name()[0] == '@')
continue; continue;
auto inputs = to_shapes(i.inputs());
op.ops.push_back({{i.get_operator()}}); op.ops.push_back({{i.get_operator()}});
} }
std::vector<instruction_ref> inputs = {input_ins, weights_ins, bias_ins, alloc_ins}; std::vector<instruction_ref> inputs = {input_ins, weights_ins, bias_ins, alloc_ins};
...@@ -897,7 +896,7 @@ struct find_gemm_add ...@@ -897,7 +896,7 @@ struct find_gemm_add
match::name("gpu::gemm")(match::nargs(3)).bind("gemm"))); match::name("gpu::gemm")(match::nargs(3)).bind("gemm")));
} }
void apply(module& p, match::matcher_result r) const void apply(module& m, const match::matcher_result& r) const
{ {
auto ins = r.result; auto ins = r.result;
auto gemm_ins = r.instructions["gemm"]; auto gemm_ins = r.instructions["gemm"];
...@@ -909,26 +908,68 @@ struct find_gemm_add ...@@ -909,26 +908,68 @@ struct find_gemm_add
if(not float_equal(gemm.beta, 0)) if(not float_equal(gemm.beta, 0))
return; return;
if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [](auto i) {
return not i->get_shape().standard();
}))
return;
auto inputs = gemm_ins->inputs(); auto inputs = gemm_ins->inputs();
inputs.pop_back(); inputs.pop_back();
auto copy_ins = c_ins; auto copy_ins = c_ins;
// Insert copy // Insert copy
if(ins == p.end() or c_ins->outputs().size() > 1 or c_ins->inputs().empty()) if(ins == m.end() or c_ins->outputs().size() > 1 or c_ins->inputs().empty())
{ {
copy_ins = p.insert_instruction(ins, hip_copy{}, c_ins, ins->inputs().back()); copy_ins = m.insert_instruction(ins, hip_copy{}, c_ins, ins->inputs().back());
} }
inputs.push_back(copy_ins); inputs.push_back(copy_ins);
inputs.push_back(copy_ins); inputs.push_back(copy_ins);
gemm.beta = 1; gemm.beta = 1;
p.replace_instruction(ins, gemm, inputs); m.replace_instruction(ins, gemm, inputs);
}
};
auto pointwise_name(const std::string& s)
{
return precompile_name("pointwise")(match::make_basic_pred_matcher([=](auto ins) {
module_ref pm = ins->module_inputs().front();
auto n = std::count_if(pm->begin(), pm->end(), [&](auto& i) { return i.name() == s; });
if(n != 1)
return false;
return std::all_of(pm->begin(), pm->end(), [&](auto& i) {
return starts_with(i.name(), "@") or i.name() == s;
});
}));
}
struct find_gemm_pointwise
{
// Matcher for an "add" pointwise instruction (see pointwise_name) that:
//  - has 3 arguments,
//  - has all inputs satisfying match::standard_shape(),
//  - has, in either order among arguments 0 and 1, one input that is used once
//    (bound as "c") and one "gpu::gemm" with 3 arguments (bound as "gemm").
// NOTE(review): the third argument is presumably the output allocation — confirm
// against how apply() uses ins->inputs().back().
auto matcher() const
{
return pointwise_name("add")(
match::nargs(3),
match::all_of[match::inputs()](match::standard_shape()),
match::either_arg(0, 1)(match::used_once().bind("c"),
match::name("gpu::gemm")(match::nargs(3)).bind("gemm")));
}
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto gemm_ins = r.instructions["gemm"];
auto c_ins = r.instructions["c"];
auto gemm = any_cast<rocblas_gemm<op::dot>>(gemm_ins->get_operator());
// Already fused gemm
if(not float_equal(gemm.beta, 0))
return;
auto inputs = gemm_ins->inputs();
inputs.pop_back();
inputs.push_back(c_ins);
inputs.push_back(ins->inputs().back());
gemm.beta = 1;
m.replace_instruction(ins, gemm, inputs);
} }
}; };
...@@ -939,23 +980,23 @@ struct find_commutative_broadcast ...@@ -939,23 +980,23 @@ struct find_commutative_broadcast
return match::name("gpu::add", "gpu::mul")(match::arg(1)(match::broadcast_shape())); return match::name("gpu::add", "gpu::mul")(match::arg(1)(match::broadcast_shape()));
} }
void apply(module& p, const match::matcher_result& r) const void apply(module& m, const match::matcher_result& r) const
{ {
auto ins = r.result; auto ins = r.result;
auto args = ins->inputs(); auto args = ins->inputs();
move_broadcasted_back(args); move_broadcasted_back(args);
p.replace_instruction(ins, ins->get_operator(), args); m.replace_instruction(ins, ins->get_operator(), args);
} }
}; };
} // namespace } // namespace
void fuse_ops::apply(module& p) const void fuse_ops::apply(module& m) const
{ {
match::find_matches(p, find_gelu{}, find_gelu_new{fast_math}); match::find_matches(m, find_gelu{}, find_gelu_new{fast_math});
run_passes(p, {dead_code_elimination{}}); run_passes(m, {dead_code_elimination{}});
match::find_matches(p, find_triadd{}); match::find_matches(m, find_triadd{});
match::find_matches(p, match::find_matches(m,
find_layernorm{}, find_layernorm{},
find_conv_pointwise{ctx}, find_conv_pointwise{ctx},
find_conv_bias_relu{ctx}, find_conv_bias_relu{ctx},
...@@ -968,8 +1009,12 @@ void fuse_ops::apply(module& p) const ...@@ -968,8 +1009,12 @@ void fuse_ops::apply(module& p) const
find_add_unary{"gpu::sigmoid", hip_add_sigmoid{}, hip_triadd_sigmoid{}}, find_add_unary{"gpu::sigmoid", hip_add_sigmoid{}, hip_triadd_sigmoid{}},
find_add_unary{"gpu::tanh", hip_add_tanh{}, hip_triadd_tanh{}}, find_add_unary{"gpu::tanh", hip_add_tanh{}, hip_triadd_tanh{}},
find_add_clip{}); find_add_clip{});
run_passes(p, {dead_code_elimination{}}); run_passes(m, {dead_code_elimination{}});
match::find_matches(p, find_triadd_layernorm{}, find_gemm_add{}, find_commutative_broadcast{}); match::find_matches(m,
find_triadd_layernorm{},
find_gemm_add{},
find_gemm_pointwise{},
find_commutative_broadcast{});
} }
} // namespace gpu } // namespace gpu
......
#include <rocblas.h> #include <rocblas.h>
#include <migraphx/gpu/gemm_impl.hpp> #include <migraphx/gpu/gemm_impl.hpp>
#include <migraphx/reduce_dims.hpp>
namespace migraphx { namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS { inline namespace MIGRAPHX_INLINE_NS {
...@@ -27,6 +28,22 @@ rocblas_datatype get_type(shape::type_t type) ...@@ -27,6 +28,22 @@ rocblas_datatype get_type(shape::type_t type)
MIGRAPHX_THROW("ROCBLAS_GEMM: data type not supported!"); MIGRAPHX_THROW("ROCBLAS_GEMM: data type not supported!");
} }
void blas_shape(const shape& s)
{
if(s.lens().size() < 2)
return;
if(std::none_of(s.strides().end() - 2, s.strides().end(), [&](auto i) { return i == 1; }))
MIGRAPHX_THROW("GPU_GEMM: needs to have one matrix stride as 1");
if(s.lens().size() < 3)
return;
shape batch_shape{s.type(),
{s.lens().begin(), s.lens().end() - 2},
{s.strides().begin(), s.strides().end() - 2}};
auto batch_shapes = reduce_dims({batch_shape});
if(batch_shapes.front().lens().size() != 1)
MIGRAPHX_THROW("GPU_GEMM: Batch dimension is not collapsible");
}
template <class R, class... Ts, class... Us> template <class R, class... Ts, class... Us>
R rocblas_invoke(R (*f)(Ts...), Us... xs) R rocblas_invoke(R (*f)(Ts...), Us... xs)
{ {
...@@ -36,6 +53,18 @@ R rocblas_invoke(R (*f)(Ts...), Us... xs) ...@@ -36,6 +53,18 @@ R rocblas_invoke(R (*f)(Ts...), Us... xs)
return f(xs..., nullptr, nullptr); return f(xs..., nullptr, nullptr);
} }
// True only when the shape reports itself as transposed AND its last stride is
// not 1; a transposed shape whose last stride is 1 is treated as not transposed.
static bool is_transposed(const shape& s)
{
    return s.transposed() and s.strides().back() != 1;
}
// Returns the stride of the third-from-last dimension of the argument's shape.
// NOTE(review): assumes the shape has at least 3 dimensions — confirm callers
// only reach this on the batched path.
static rocblas_int get_batch_stride(const argument& a)
{
    const auto& arg_shape = a.get_shape();
    const auto& strides   = arg_shape.strides();
    return strides[strides.size() - 3];
}
template <class T> template <class T>
void gemm_impl(context& ctx, void gemm_impl(context& ctx,
const shape& output_shape, const shape& output_shape,
...@@ -45,8 +74,8 @@ void gemm_impl(context& ctx, ...@@ -45,8 +74,8 @@ void gemm_impl(context& ctx,
bool int8_x4_format, bool int8_x4_format,
bool compute_fp32) bool compute_fp32)
{ {
bool transa = args[0].get_shape().transposed(); bool transa = is_transposed(args[0].get_shape());
bool transb = args[1].get_shape().transposed(); bool transb = is_transposed(args[1].get_shape());
auto n_dim = output_shape.lens().size(); auto n_dim = output_shape.lens().size();
auto dim_1 = n_dim - 1; auto dim_1 = n_dim - 1;
auto dim_0 = n_dim - 2; auto dim_0 = n_dim - 2;
...@@ -142,6 +171,9 @@ void gemm_impl(context& ctx, ...@@ -142,6 +171,9 @@ void gemm_impl(context& ctx,
} }
else else
{ {
auto a_stride = get_batch_stride(args[0]);
auto b_stride = get_batch_stride(args[1]);
auto c_stride = get_batch_stride(args[2]);
rocblas_invoke(&rocblas_gemm_strided_batched_ex, rocblas_invoke(&rocblas_gemm_strided_batched_ex,
ctx.get_stream().get_rocblas(), ctx.get_stream().get_rocblas(),
transb ? rocblas_operation_transpose : rocblas_operation_none, transb ? rocblas_operation_transpose : rocblas_operation_none,
...@@ -153,20 +185,20 @@ void gemm_impl(context& ctx, ...@@ -153,20 +185,20 @@ void gemm_impl(context& ctx,
to_pointer(args.at(1)), to_pointer(args.at(1)),
arg_type, arg_type,
ldb, ldb,
k * n, b_stride,
to_pointer(args.at(0)), to_pointer(args.at(0)),
arg_type, arg_type,
lda, lda,
m * k, a_stride,
beta_v, beta_v,
to_pointer(args[2]), to_pointer(args[2]),
output_type, output_type,
ldc, ldc,
m * n, c_stride,
is_3inputs ? to_pointer(args[3]) : to_pointer(args[2]), is_3inputs ? to_pointer(args[3]) : to_pointer(args[2]),
output_type, output_type,
ldc, ldc,
m * n, c_stride,
num_matrices, num_matrices,
compute_type, compute_type,
rocblas_gemm_algo_standard, rocblas_gemm_algo_standard,
......
...@@ -27,6 +27,15 @@ using hip_host_ptr = MIGRAPHX_MANAGE_PTR(void, hipHostUnregister); ...@@ -27,6 +27,15 @@ using hip_host_ptr = MIGRAPHX_MANAGE_PTR(void, hipHostUnregister);
std::string hip_error(int error) { return hipGetErrorString(static_cast<hipError_t>(error)); } std::string hip_error(int error) { return hipGetErrorString(static_cast<hipError_t>(error)); }
// Reports whether `ptr` is device memory according to hipPointerGetAttributes.
// A failed attribute query is treated as "not a device pointer" rather than an
// error.
bool is_device_ptr(const void* ptr)
{
    hipPointerAttribute_t attr;
    const bool queried = hipPointerGetAttributes(&attr, ptr) == hipSuccess;
    // `attr` is only read when the query succeeded (short-circuit).
    return queried and attr.memoryType == hipMemoryTypeDevice;
}
std::size_t get_available_gpu_memory() std::size_t get_available_gpu_memory()
{ {
size_t free; size_t free;
...@@ -50,8 +59,8 @@ hip_ptr allocate_gpu(std::size_t sz, bool host = false) ...@@ -50,8 +59,8 @@ hip_ptr allocate_gpu(std::size_t sz, bool host = false)
{ {
if(sz > get_available_gpu_memory()) if(sz > get_available_gpu_memory())
MIGRAPHX_THROW("Memory not available to allocate buffer: " + std::to_string(sz)); MIGRAPHX_THROW("Memory not available to allocate buffer: " + std::to_string(sz));
void* result; void* result = nullptr;
auto status = host ? hipHostMalloc(&result, sz) : hipMalloc(&result, sz); auto status = host ? hipHostMalloc(&result, sz) : hipMalloc(&result, sz);
if(status != hipSuccess) if(status != hipSuccess)
{ {
if(host) if(host)
...@@ -59,6 +68,7 @@ hip_ptr allocate_gpu(std::size_t sz, bool host = false) ...@@ -59,6 +68,7 @@ hip_ptr allocate_gpu(std::size_t sz, bool host = false)
else else
return allocate_gpu(sz, true); return allocate_gpu(sz, true);
} }
assert(result != nullptr);
return hip_ptr{result}; return hip_ptr{result};
} }
...@@ -75,6 +85,8 @@ std::vector<T> read_from_gpu(const void* x, std::size_t sz) ...@@ -75,6 +85,8 @@ std::vector<T> read_from_gpu(const void* x, std::size_t sz)
{ {
gpu_sync(); gpu_sync();
std::vector<T> result(sz); std::vector<T> result(sz);
assert(not is_device_ptr(result.data()));
assert(is_device_ptr(x));
auto status = hipMemcpy(result.data(), x, sz * sizeof(T), hipMemcpyDeviceToHost); auto status = hipMemcpy(result.data(), x, sz * sizeof(T), hipMemcpyDeviceToHost);
if(status != hipSuccess) if(status != hipSuccess)
MIGRAPHX_THROW("Copy from gpu failed: " + hip_error(status)); // NOLINT MIGRAPHX_THROW("Copy from gpu failed: " + hip_error(status)); // NOLINT
...@@ -85,6 +97,8 @@ hip_ptr write_to_gpu(const void* x, std::size_t sz, bool host = false) ...@@ -85,6 +97,8 @@ hip_ptr write_to_gpu(const void* x, std::size_t sz, bool host = false)
{ {
gpu_sync(); gpu_sync();
auto result = allocate_gpu(sz, host); auto result = allocate_gpu(sz, host);
assert(is_device_ptr(result.get()));
assert(not is_device_ptr(x));
auto status = hipMemcpy(result.get(), x, sz, hipMemcpyHostToDevice); auto status = hipMemcpy(result.get(), x, sz, hipMemcpyHostToDevice);
if(status != hipSuccess) if(status != hipSuccess)
MIGRAPHX_THROW("Copy to gpu failed: " + hip_error(status)); MIGRAPHX_THROW("Copy to gpu failed: " + hip_error(status));
......
...@@ -11,7 +11,7 @@ struct module; ...@@ -11,7 +11,7 @@ struct module;
namespace gpu { namespace gpu {
std::vector<stream_race> analyze_streams(const module& p); std::vector<stream_race> analyze_streams(const module& m);
} // namespace gpu } // namespace gpu
} // namespace MIGRAPHX_INLINE_NS } // namespace MIGRAPHX_INLINE_NS
......
...@@ -35,6 +35,10 @@ struct code_object_op ...@@ -35,6 +35,10 @@ struct code_object_op
f(self.output, "output")); f(self.output, "output"));
} }
// Returns a value holding a single "group" entry whose content is group().
value attributes() const { return {{"group", group()}}; }
// Group identifier: the fixed prefix "gpu::code_object::" followed by symbol_name.
std::string group() const { return "gpu::code_object::" + symbol_name; }
std::string name() const { return "gpu::code_object"; } std::string name() const { return "gpu::code_object"; }
shape compute_shape(std::vector<shape> inputs) const; shape compute_shape(std::vector<shape> inputs) const;
argument argument
......
#ifndef MIGRAPHX_GUARD_GPU_COMPILE_GEN_HPP
#define MIGRAPHX_GUARD_GPU_COMPILE_GEN_HPP
#include <migraphx/config.hpp>
#include <string>
#include <unordered_map>
#include <vector>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct shape;
namespace gpu {
namespace gen {
// One of the objects that can be passed to make_transformer_args, which calls
// str() on it.
// NOTE(review): presumably describes vectorizing kernel arguments by `size`
// elements along `axis` — confirm against the implementation file.
struct vectorize
{
// Defaults to 1.
std::size_t size = 1;
// Defaults to 0.
std::size_t axis = 0;
// Factory: builds a vectorize from an axis and the input shapes (defined elsewhere).
static vectorize elements(std::size_t axis, const std::vector<shape>& inputs);
// String form of this object; consumed by the variadic make_transformer_args.
std::string str() const;
};
// One of the objects that can be passed to make_transformer_args, which calls
// str() on it.
// NOTE(review): presumably `args[i]` flags whether input i should be preloaded
// — confirm against the implementation file.
struct preload
{
// Empty by default.
std::vector<bool> args = {};
// Factory: builds a preload from an axis and the input shapes (defined elsewhere).
static preload broadcasts(std::size_t axis, const std::vector<shape>& inputs);
// Defined elsewhere. NOTE(review): presumably true when any entry of `args` is set — confirm.
bool is_preloading() const;
// String form of this object; consumed by the variadic make_transformer_args.
std::string str() const;
};
std::size_t find_fast_axis(const std::vector<shape>& inputs);
std::string make_transformer_args(std::vector<std::string> transformers);
/// Convenience overload: calls `str()` on every transformer object and passes
/// the resulting strings, in argument order, to the
/// `std::vector<std::string>` overload of make_transformer_args.
///
/// The transformers are taken by const reference: they are only read (via their
/// const `str()` member), so copying them -- e.g. the `std::vector<bool>` held
/// by `preload` -- would be wasted work.
template <class... Ts>
std::string make_transformer_args(const Ts&... xs)
{
    return make_transformer_args({xs.str()...});
}
} // namespace gen
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILE_GEN_HPP
...@@ -31,6 +31,13 @@ struct hip_compile_options ...@@ -31,6 +31,13 @@ struct hip_compile_options
void set_launch_params(const value& v, void set_launch_params(const value& v,
const std::function<std::size_t(std::size_t local)>& compute_global, const std::function<std::size_t(std::size_t local)>& compute_global,
std::size_t default_local = 1024); std::size_t default_local = 1024);
// Overload for a fixed global size: wraps `default_global` in a callback that
// ignores its argument and always returns that value, then delegates to the
// callback-based set_launch_params.
void
set_launch_params(const value& v, std::size_t default_global, std::size_t default_local = 1024)
{
    // Captured by copy so the callback does not refer to this function's parameters.
    auto constant_global = [=](auto) { return default_global; };
    set_launch_params(v, constant_global, default_local);
}
}; };
/// Compute global for n elements, but max out on target-specific upper limit /// Compute global for n elements, but max out on target-specific upper limit
...@@ -39,6 +46,8 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over = 1); ...@@ -39,6 +46,8 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over = 1);
operation compile_hip_code_object(const std::string& content, hip_compile_options options); operation compile_hip_code_object(const std::string& content, hip_compile_options options);
// Returns a block size for `n`; `max_block_size` defaults to 1024.
// NOTE(review): presumably the result never exceeds max_block_size - confirm in the definition.
std::size_t compute_block_size(std::size_t n, std::size_t max_block_size = 1024);
} // namespace gpu } // namespace gpu
} // namespace MIGRAPHX_INLINE_NS } // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx } // namespace migraphx
......
...@@ -14,7 +14,7 @@ namespace gpu { ...@@ -14,7 +14,7 @@ namespace gpu {
struct eliminate_workspace struct eliminate_workspace
{ {
std::string name() const { return "eliminate_workspace"; } std::string name() const { return "eliminate_workspace"; }
void apply(module& p) const; void apply(module& m) const;
}; };
} // namespace gpu } // namespace gpu
} // namespace MIGRAPHX_INLINE_NS } // namespace MIGRAPHX_INLINE_NS
......
...@@ -16,7 +16,7 @@ struct fuse_ops ...@@ -16,7 +16,7 @@ struct fuse_ops
context* ctx = nullptr; context* ctx = nullptr;
bool fast_math = true; bool fast_math = true;
std::string name() const { return "gpu::fuse_ops"; } std::string name() const { return "gpu::fuse_ops"; }
void apply(module& p) const; void apply(module& m) const;
}; };
} // namespace gpu } // namespace gpu
......
...@@ -18,6 +18,8 @@ namespace gpu { ...@@ -18,6 +18,8 @@ namespace gpu {
struct context; struct context;
// Shape check applied by rocblas_gemm::compute_shape to its matrix inputs.
// NOTE(review): presumably throws when the layout is not usable by rocBLAS - confirm in the definition.
void blas_shape(const shape& s);
template <class Op> template <class Op>
struct rocblas_gemm struct rocblas_gemm
{ {
...@@ -50,13 +52,14 @@ struct rocblas_gemm ...@@ -50,13 +52,14 @@ struct rocblas_gemm
std::vector<shape> in_shapes(inputs); std::vector<shape> in_shapes(inputs);
in_shapes.pop_back(); in_shapes.pop_back();
check_shapes{in_shapes, *this}.not_broadcasted(); check_shapes{in_shapes, *this}.not_broadcasted();
batch_not_transposed(inputs[0].strides()); blas_shape(inputs[0]);
batch_not_transposed(inputs[1].strides()); blas_shape(inputs[1]);
// if gemm and add are fused // if gemm and add are fused
if(not float_equal(beta, 0)) if(in_shapes.size() > 2)
{ {
auto cmat_shape = in_shapes.back(); auto cmat_shape = in_shapes.back();
in_shapes.pop_back(); in_shapes.pop_back();
blas_shape(cmat_shape);
auto op_out_shape = op.compute_shape(in_shapes); auto op_out_shape = op.compute_shape(in_shapes);
if(cmat_shape.lens() != op_out_shape.lens()) if(cmat_shape.lens() != op_out_shape.lens())
{ {
...@@ -71,6 +74,7 @@ struct rocblas_gemm ...@@ -71,6 +74,7 @@ struct rocblas_gemm
to_string(cmat_shape.type()) + to_string(cmat_shape.type()) +
", it must be: " + to_string(op_out_shape.type())); ", it must be: " + to_string(op_out_shape.type()));
} }
return op_out_shape;
} }
return op.compute_shape(in_shapes); return op.compute_shape(in_shapes);
...@@ -96,28 +100,6 @@ struct rocblas_gemm ...@@ -96,28 +100,6 @@ struct rocblas_gemm
return args.back(); return args.back();
} }
/// Validates the stride layout of a (possibly batched) matrix.
/// Shapes of rank 2 or less are accepted unconditionally. For higher ranks the
/// leading (batch) strides are compared against the larger of the two trailing
/// (matrix) strides, and an error is raised when they look transposed.
void batch_not_transposed(const std::vector<std::size_t>& strides) const
{
    const auto rank = strides.size();
    if(rank > 2)
    {
        const auto batch_first      = strides.begin();
        const auto batch_last       = strides.begin() + (rank - 2);
        const std::size_t mat_extent = std::max(strides[rank - 2], strides[rank - 1]);

        // Every batch stride is smaller than the matrix extent.
        const bool all_batch_below = std::none_of(
            batch_first, batch_last, [&](std::size_t s) { return s >= mat_extent; });
        if(all_batch_below)
        {
            MIGRAPHX_THROW("GPU_GEMM: matrix size and batch size {" + to_string_range(strides) +
                           "} are transposed!");
        }

        // Neighbouring batch strides must be non-increasing and not below the matrix extent.
        const auto offending = std::adjacent_find(
            batch_first, batch_last, [&](std::size_t outer, std::size_t inner) {
                return (outer < inner or outer < mat_extent or inner < mat_extent);
            });
        if(offending != batch_last)
        {
            MIGRAPHX_THROW("GPU_GEMM: batch size {" + to_string_range(strides) +
                           "} is transposed!");
        }
    }
}
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{ {
return shapes.size() - 1; return shapes.size() - 1;
......
#ifndef MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP
#define MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP
#include <migraphx/config.hpp>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct module;
namespace gpu {
/// Pass object registered under the name "gpu::prefuse_ops".
/// apply() is only declared here; its definition lives in another file.
struct prefuse_ops
{
// Identifier of this pass.
std::string name() const { return "gpu::prefuse_ops"; }
// Runs the pass on module `m`.
void apply(module& m) const;
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
#define MIGRAPHX_GUARD_RTGLIB_QUANT_CONVOLUTION_HPP #define MIGRAPHX_GUARD_RTGLIB_QUANT_CONVOLUTION_HPP
#include <migraphx/shape.hpp> #include <migraphx/shape.hpp>
#include <migraphx/reflect.hpp>
#include <migraphx/op/quant_convolution.hpp> #include <migraphx/op/quant_convolution.hpp>
#include <migraphx/gpu/miopen.hpp> #include <migraphx/gpu/miopen.hpp>
...@@ -14,6 +15,7 @@ struct context; ...@@ -14,6 +15,7 @@ struct context;
struct miopen_quant_convolution struct miopen_quant_convolution
{ {
op::quant_convolution op; op::quant_convolution op;
bool int8_x4_format = false;
shared<convolution_descriptor> cd; shared<convolution_descriptor> cd;
miopenConvFwdAlgorithm_t algo{}; miopenConvFwdAlgorithm_t algo{};
miopenHandle_t handle = nullptr; miopenHandle_t handle = nullptr;
...@@ -22,7 +24,8 @@ struct miopen_quant_convolution ...@@ -22,7 +24,8 @@ struct miopen_quant_convolution
static auto reflect(Self& self, F f) static auto reflect(Self& self, F f)
{ {
// TODO: Add algo // TODO: Add algo
return op::quant_convolution::reflect(self.op, f); return pack_join(migraphx::reflect(self.op, f),
pack(f(self.int8_x4_format, "int8_x4_format")));
} }
std::string name() const { return "gpu::quant_convolution"; } std::string name() const { return "gpu::quant_convolution"; }
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include <migraphx/argument.hpp> #include <migraphx/argument.hpp>
#include <migraphx/reflect.hpp> #include <migraphx/reflect.hpp>
#include <migraphx/op/scatter.hpp> #include <migraphx/op/scatter_none.hpp>
#include <migraphx/gpu/miopen.hpp> #include <migraphx/gpu/miopen.hpp>
namespace migraphx { namespace migraphx {
...@@ -14,7 +14,9 @@ struct context; ...@@ -14,7 +14,9 @@ struct context;
struct hip_scatter struct hip_scatter
{ {
op::scatter op; // scatter_none is an exact replacement for previous op::scatter,
// renamed to match an Onnx option. Don't use base class op::scatter
op::scatter_none op;
template <class Self, class F> template <class Self, class F>
static auto reflect(Self& self, F f) static auto reflect(Self& self, F f)
...@@ -22,7 +24,7 @@ struct hip_scatter ...@@ -22,7 +24,7 @@ struct hip_scatter
return migraphx::reflect(self.op, f); return migraphx::reflect(self.op, f);
} }
std::string name() const { return "gpu::scatter"; } std::string name() const { return "gpu::scatter_none"; }
shape compute_shape(std::vector<shape> inputs) const; shape compute_shape(std::vector<shape> inputs) const;
argument argument
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const; compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
......
...@@ -17,9 +17,9 @@ struct schedule_model ...@@ -17,9 +17,9 @@ struct schedule_model
{ {
std::size_t streams = 0; std::size_t streams = 0;
std::size_t concurrency() const; std::size_t concurrency() const;
void sched(module& p, instruction_ref ins, std::size_t n) const; void sched(module& m, instruction_ref ins, std::size_t n) const;
void wait(module& p, instruction_ref ins, std::size_t wait_id) const; void wait(module& m, instruction_ref ins, std::size_t wait_id) const;
void record(module& p, instruction_ref ins, std::size_t wait_id) const; void record(module& m, instruction_ref ins, std::size_t wait_id) const;
std::size_t weight(const operation& op) const; std::size_t weight(const operation& op) const;
}; };
......
...@@ -15,7 +15,7 @@ namespace gpu { ...@@ -15,7 +15,7 @@ namespace gpu {
struct sync_device struct sync_device
{ {
std::string name() const { return "sync_device"; } std::string name() const { return "sync_device"; }
void apply(module& p) const; void apply(module& m) const;
}; };
} // namespace gpu } // namespace gpu
} // namespace MIGRAPHX_INLINE_NS } // namespace MIGRAPHX_INLINE_NS
......
...@@ -14,7 +14,7 @@ struct write_literals ...@@ -14,7 +14,7 @@ struct write_literals
context* ctx = nullptr; context* ctx = nullptr;
std::string name() const { return "gpu::write_literals"; } std::string name() const { return "gpu::write_literals"; }
void apply(module& p) const; void apply(module& m) const;
}; };
} // namespace gpu } // namespace gpu
......
#include <migraphx/gpu/compiler.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/reduce_dims.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/eliminate_common_subexpression.hpp>
#include <migraphx/module.hpp>
#include <migraphx/pass_manager.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
// HIP source text for the gathernd kernel. It is passed to
// compile_hip_code_object() by gathernd_compiler::compile_op, which also supplies
// the BATCH_DIMS macro used inside the kernel through a -DBATCH_DIMS=... option.
// NOLINTNEXTLINE
static const char* const gathernd_kernel = R"__migraphx__(
#include <migraphx/kernels/gathernd.hpp>
#include <migraphx/kernels/ops.hpp>
#include <migraphx/kernels/integral_constant.hpp>
#include <migraphx/kernels/generic_constant.hpp>
#include <args.hpp>
namespace migraphx {
extern "C" {
__global__ void gathernd_kernel(void* in_data, void* in_indices, void* output)
{
make_tensors()(in_data, in_indices, output)([](auto&&... xs) {
auto settings = make_gathernd_settings(MIGRAPHX_MAKE_CONSTANT(int64_t{BATCH_DIMS}));
gathernd(xs..., settings);
});
}
}
} // namespace migraphx
)__migraphx__";
/// Compiler for the "gathernd" operator: builds a HIP code object from the
/// gathernd_kernel source with the operator's batch_dims baked in as a macro.
struct gathernd_compiler : compiler<gathernd_compiler>
{
    /// Operator names handled by this compiler.
    std::vector<std::string> names() const { return {"gathernd"}; }

    /// Compiles the kernel for the given shapes.
    /// `inputs` holds the argument shapes with the output shape last;
    /// `v` must contain a "batch_dims" entry.
    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
    {
        const auto& output_shape = inputs.back();

        hip_compile_options opts;
        // Launch size is derived from the number of output elements.
        opts.set_launch_params(v, compute_global_for(ctx, output_shape.elements()));
        opts.kernel_name    = "gathernd_kernel";
        opts.inputs         = inputs;
        opts.virtual_inputs = inputs;
        opts.output         = output_shape;

        // batch_dims is forwarded to the kernel as a preprocessor definition
        assert(v.contains("batch_dims"));
        opts.params += " -DBATCH_DIMS=" + std::to_string(v.at("batch_dims").to<int64_t>());

        return compile_hip_code_object(gathernd_kernel, opts);
    }

    /// Compiles the kernel for instruction `ins` using the attributes of `op`.
    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
    {
        const auto input_shapes = to_shapes(ins->inputs());
        return replace(compile_op(ctx, input_shapes, op.to_value()));
    }
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment