Commit 7e297b13 authored by Paul's avatar Paul
Browse files

Merge

parents 86ea5e91 aa7ff911
#include <migraphx/gpu/driver/parser.hpp>
#include <migraphx/gpu/driver/action.hpp>
#include <iostream>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......
......@@ -17,8 +17,10 @@ struct run_op : action<run_op>
auto name = v.at("name").to<std::string>();
if(not contains(name, "::"))
name = "gpu::" + name;
auto op = make_op(name);
double t = time_op(ctx, op, inputs);
auto op = make_op(name);
if(v.contains("fields"))
op.from_value(v.at("fields"));
double t = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
std::cout << op << ": " << t << "ms" << std::endl;
}
};
......
......@@ -11,11 +11,11 @@ namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
void eliminate_workspace::apply(module& p) const
void eliminate_workspace::apply(module& m) const
{
std::size_t n = 0;
std::vector<instruction_ref> allocs;
for(auto ins : iterator_for(p))
for(auto ins : iterator_for(m))
{
if(ins->outputs().size() != 1)
continue;
......@@ -30,11 +30,11 @@ void eliminate_workspace::apply(module& p) const
}
if(n > 0)
{
auto ws = p.add_parameter("workspace", shape{shape::int8_type, {n}});
auto ws = m.add_parameter("workspace", shape{shape::int8_type, {n}});
for(auto&& a : allocs)
{
p.replace_instruction(a, ws);
p.remove_instruction(a);
m.replace_instruction(a, ws);
m.remove_instruction(a);
}
}
}
......
......@@ -5,6 +5,7 @@
#include <migraphx/gpu/miopen.hpp>
#include <migraphx/gpu/clip.hpp>
#include <migraphx/gpu/convolution.hpp>
#include <migraphx/gpu/device_name.hpp>
#include <migraphx/gpu/oper.hpp>
#include <migraphx/gpu/add.hpp>
#include <migraphx/gpu/mul.hpp>
......@@ -26,6 +27,7 @@
#include <migraphx/array.hpp>
#include <migraphx/op/clip.hpp>
#include <cmath>
#include <set>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -60,6 +62,8 @@ struct fusion
keep_alive(std::move(t));
}
bool empty() const { return fp == nullptr; }
op_t operator[](std::size_t i) const
{
assert(fp);
......@@ -123,12 +127,11 @@ struct fusion
return shape{shape::int8_type, {ws_size}};
}
void compile(context& ctx)
bool compile(context& ctx)
{
assert(fp);
auto status = miopenCompileFusionPlan(ctx.get_stream().get_miopen(), fp.get());
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("Compiling fusion plan failed");
return miopenCompileFusionPlan(ctx.get_stream().get_miopen(), fp.get()) ==
miopenStatusSuccess;
}
argument execute(context& ctx,
......@@ -152,6 +155,12 @@ struct fusion
}
};
const std::unordered_set<std::string>& get_supported_archs()
{
static std::unordered_set<std::string> supported_archs{"gfx900", "gfx906", "gfx908", "gfx1030"};
return supported_archs;
}
MIGRAPHX_PRED_MATCHER(bias_shape, instruction_ref ins)
{
auto&& s = ins->get_shape();
......@@ -161,6 +170,9 @@ MIGRAPHX_PRED_MATCHER(bias_shape, instruction_ref ins)
MIGRAPHX_PRED_MATCHER(fusable_conv, instruction_ref ins)
{
const auto device_name = trim(split_string(get_device_name(), ':').front());
if(not contains(get_supported_archs(), device_name))
return false;
if(enabled(MIGRAPHX_DISABLE_MIOPEN_FUSION{}))
return false;
if(ins->name() != "gpu::convolution")
......@@ -304,7 +316,7 @@ struct find_layernorm
{
auto matcher() const { return match::layernorm(&gpu_name); }
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto x_ins = r.instructions["x"];
......@@ -319,7 +331,7 @@ struct find_layernorm
if(relements > 1024 or (relements % 4 != 0 and relements > 256))
return;
p.replace_instruction(ins, hip_layernorm{}, x_ins, args.back());
m.replace_instruction(ins, hip_layernorm{}, x_ins, args.back());
}
};
......@@ -331,11 +343,11 @@ struct find_triadd_layernorm
match::used_once(), match::all_of[match::inputs()](match::standard_shape()))));
}
void apply(module& p, const match::matcher_result& r) const
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto triadd = ins->inputs().front();
p.replace_instruction(ins, hip_triadd_layernorm{}, triadd->inputs());
m.replace_instruction(ins, hip_triadd_layernorm{}, triadd->inputs());
}
};
......@@ -343,13 +355,13 @@ struct find_gelu
{
auto matcher() const { return match::gelu_erf(&gpu_name); }
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto x_ins = r.instructions["x"];
auto args = ins->inputs();
p.replace_instruction(ins, hip_gelu{}, x_ins, args.back());
m.replace_instruction(ins, hip_gelu{}, x_ins, args.back());
}
};
......@@ -360,7 +372,7 @@ struct find_add_gelu
return match::name("gpu::gelu")(match::arg(0)(match::name("gpu::add").bind("add")));
}
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto add_ins = r.instructions["add"];
auto ins = r.result;
......@@ -369,7 +381,7 @@ struct find_add_gelu
move_broadcasted_back(args);
args.back() = ins->inputs().back();
p.replace_instruction(ins, hip_add_gelu{}, args);
m.replace_instruction(ins, hip_add_gelu{}, args);
}
};
......@@ -379,16 +391,16 @@ struct find_gelu_new
auto matcher() const { return match::gelu_tanh(&gpu_name); }
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto x_ins = r.instructions["x"];
auto args = ins->inputs();
if(fast_math)
p.replace_instruction(ins, hip_gelu{}, x_ins, args.back());
m.replace_instruction(ins, hip_gelu{}, x_ins, args.back());
else
p.replace_instruction(ins, hip_gelu_new{}, x_ins, args.back());
m.replace_instruction(ins, hip_gelu_new{}, x_ins, args.back());
}
};
......@@ -399,7 +411,7 @@ struct find_add_gelu_new
return match::name("gpu::gelu_new")(match::arg(0)(match::name("gpu::add").bind("add")));
}
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto add_ins = r.instructions["add"];
auto ins = r.result;
......@@ -408,7 +420,7 @@ struct find_add_gelu_new
move_broadcasted_back(args);
args.back() = ins->inputs().back();
p.replace_instruction(ins, hip_add_gelu_new{}, args);
m.replace_instruction(ins, hip_add_gelu_new{}, args);
}
};
......@@ -423,7 +435,7 @@ struct find_add_clip
.bind("add")));
}
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto add_ins = r.instructions["add"];
auto ins = r.result;
......@@ -436,9 +448,9 @@ struct find_add_clip
add_args.pop_back();
add_args.insert(add_args.end(), std::next(ins_args.begin()), ins_args.end());
if(add_ins->name() == "gpu::add")
p.replace_instruction(ins, hip_add_clip{}, add_args);
m.replace_instruction(ins, hip_add_clip{}, add_args);
else if(add_ins->name() == "gpu::triadd")
p.replace_instruction(ins, hip_triadd_clip{}, add_args);
m.replace_instruction(ins, hip_triadd_clip{}, add_args);
}
};
......@@ -458,7 +470,7 @@ struct find_add_unary
.bind("add")));
}
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto add_ins = r.instructions["add"];
auto ins = r.result;
......@@ -469,9 +481,9 @@ struct find_add_unary
// Use the allocation from the relu operator
args.back() = ins->inputs().back();
if(add_ins->name() == "gpu::add")
p.replace_instruction(ins, binary_add_op, args);
m.replace_instruction(ins, binary_add_op, args);
else if(add_ins->name() == "gpu::triadd")
p.replace_instruction(ins, ternary_add_op, args);
m.replace_instruction(ins, ternary_add_op, args);
}
};
......@@ -486,7 +498,7 @@ struct find_triadd
.bind("input")));
}
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto add_ins = r.instructions["add"];
auto input_ins = r.instructions["input"];
......@@ -501,7 +513,7 @@ struct find_triadd
move_broadcasted_back(args);
args.back() = ins->inputs().back();
p.replace_instruction(ins, hip_triadd{}, args);
m.replace_instruction(ins, hip_triadd{}, args);
}
};
......@@ -513,7 +525,7 @@ struct find_mul_add
match::name("gpu::mul")(match::used_once()).bind("mul"), match::any().bind("b")));
}
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto mul_ins = r.instructions["mul"];
auto b_ins = r.instructions["b"];
......@@ -526,7 +538,7 @@ struct find_mul_add
args.insert(std::prev(args.end()), b_ins);
args.back() = ins->inputs().back();
p.replace_instruction(ins, hip_mul_add{}, args);
m.replace_instruction(ins, hip_mul_add{}, args);
}
};
......@@ -538,7 +550,7 @@ struct find_mul_add_relu
match::arg(0)(match::name("gpu::mul_add")(match::used_once()).bind("mul_add")));
}
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto mul_add_ins = r.instructions["mul_add"];
auto ins = r.result;
......@@ -546,14 +558,130 @@ struct find_mul_add_relu
// Use the allocation from the relu operator
args.back() = ins->inputs().back();
p.replace_instruction(ins, hip_mul_add_relu{}, args);
m.replace_instruction(ins, hip_mul_add_relu{}, args);
}
};
struct miopen_fusion
{
struct fuse_op_data
{
operation op;
float alpha = 1;
float beta = 0;
};
struct fuse_op : fuse_op_data, reflect_equality<fuse_op>, reflect_stream<fuse_op>
{
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.op, "op"), f(self.alpha, "alpha"), f(self.beta, "beta"));
}
};
std::vector<fuse_op> ops = {};
fusion f = {};
std::function<void(context&, const fusion&, const std::vector<argument>&)> execute;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.ops, "ops"));
}
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
value compile(context& ctx, const shape&, std::vector<shape> inputs)
{
// Compensate for allocation
inputs.pop_back();
std::size_t i = 0;
f = fusion(inputs[i]);
i++;
std::vector<std::function<void(const fused_operator_args&, const std::vector<argument>&)>>
invokers;
for(auto&& fop : ops)
{
if(i > inputs.size())
{
f = {};
return {};
}
if(fop.op.name() == "convolution")
{
auto* mop = f.create_conv(any_cast<op::convolution>(fop.op), inputs[i]);
invokers.push_back(
[=](const fused_operator_args& fargs, const std::vector<argument>& args) {
miopenSetOpArgsConvForward(
fargs.get(), mop, &fop.alpha, &fop.beta, args[i].implicit());
});
i++;
}
else if(fop.op.name() == "add")
{
auto* mop = f.create_bias(inputs[i]);
invokers.push_back(
[=](const fused_operator_args& fargs, const std::vector<argument>& args) {
miopenSetOpArgsBiasForward(
fargs.get(), mop, &fop.alpha, &fop.beta, args[i].implicit());
});
i++;
}
else if(fop.op.name() == "relu")
{
auto* mop = f.create_relu();
invokers.push_back([=](const fused_operator_args& fargs,
const std::vector<argument>&) {
miopenSetOpArgsActivForward(fargs.get(), mop, &fop.alpha, &fop.beta, 0, 0, 0);
});
}
else
{
f = {};
return {};
}
}
if(not f.compile(ctx))
{
f = {};
return {};
}
execute = [invokers](context& c, const fusion& ff, const std::vector<argument>& args) {
auto fargs = make_fused_args();
for(auto&& invoker : invokers)
invoker(fargs, args);
ff.execute(c, fargs, args.front(), args.back());
};
return {{"workspace", f.get_workspace(ctx).bytes()}};
}
void finalize(context& ctx, const shape& output_shape, const std::vector<shape>& inputs)
{
if(not f.empty())
return;
auto v = compile(ctx, output_shape, inputs);
if(not v.is_object())
MIGRAPHX_THROW("Failed to compile fusion plan");
}
std::string name() const { return "gpu::miopen_fusion"; }
shape compute_shape(const std::vector<shape>& inputs) const
{
if(ops.empty())
return {};
// TODO: Check number of arguments
return ops.front().op.compute_shape({inputs[0], inputs[1]});
}
argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
execute(ctx, f, args);
return args.back();
}
};
struct miopen_conv_bias
{
op::convolution op;
fusion f = {};
fusion fp = {};
fusion::op_t conv = {};
fusion::op_t bias = {};
......@@ -577,18 +705,19 @@ struct miopen_conv_bias
float beta = 0;
miopenSetOpArgsConvForward(fargs.get(), conv, &alpha, &beta, args[1].implicit());
miopenSetOpArgsBiasForward(fargs.get(), bias, &alpha, &beta, args[3].implicit());
return f.execute(ctx, fargs, args[0], args[4]);
return fp.execute(ctx, fargs, args[0], args[4]);
}
void finalize(context& ctx, const shape&, const std::vector<shape>& inputs)
{
f = fusion(inputs[0]);
conv = f.create_conv(op, inputs[1]);
bias = f.create_bias(inputs[3]);
f.compile(ctx);
fp = fusion(inputs[0]);
conv = fp.create_conv(op, inputs[1]);
bias = fp.create_bias(inputs[3]);
if(not fp.compile(ctx))
MIGRAPHX_THROW("Failed to compile fusion plan");
}
shape get_workspace(context& ctx) { return f.get_workspace(ctx); }
shape get_workspace(context& ctx) { return fp.get_workspace(ctx); }
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
......@@ -599,7 +728,7 @@ MIGRAPHX_REGISTER_OP(miopen_conv_bias)
struct miopen_conv_bias_relu
{
op::convolution op;
fusion f = {};
fusion fp = {};
fusion::op_t conv = {};
fusion::op_t bias = {};
fusion::op_t relu = {};
......@@ -625,18 +754,18 @@ struct miopen_conv_bias_relu
miopenSetOpArgsConvForward(fargs.get(), conv, &alpha, &beta, args[1].implicit());
miopenSetOpArgsBiasForward(fargs.get(), bias, &alpha, &beta, args[3].implicit());
miopenSetOpArgsActivForward(fargs.get(), relu, &alpha, &beta, 0, 0, 0);
return f.execute(ctx, fargs, args[0], args[4]);
return fp.execute(ctx, fargs, args[0], args[4]);
}
void finalize(context& ctx, const shape&, const std::vector<shape>& inputs)
{
f = fusion(inputs[0]);
conv = f.create_conv(op, inputs[1]);
bias = f.create_bias(inputs[3]);
relu = f.create_relu();
f.compile(ctx);
fp = fusion(inputs[0]);
conv = fp.create_conv(op, inputs[1]);
bias = fp.create_bias(inputs[3]);
relu = fp.create_relu();
fp.compile(ctx);
}
shape get_workspace(context& ctx) { return f.get_workspace(ctx); }
shape get_workspace(context& ctx) { return fp.get_workspace(ctx); }
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
......@@ -654,7 +783,7 @@ auto conv_bias(Ms... ms)
}
template <class Op>
void apply_conv_bias(context& ctx, module& p, match::matcher_result r)
void apply_conv_bias(context& ctx, module& m, const match::matcher_result& r)
{
auto conv_ins = r.instructions["conv"];
auto bias_ins = r.instructions["bias"];
......@@ -669,7 +798,26 @@ void apply_conv_bias(context& ctx, module& p, match::matcher_result r)
// TODO: Insert ws allocation
auto ws = cb.get_workspace(ctx);
(void)ws;
p.replace_instruction(ins, cb, input_ins, weights_ins, old_ws_ins, bias_ins, alloc_ins);
m.replace_instruction(ins, cb, input_ins, weights_ins, old_ws_ins, bias_ins, alloc_ins);
}
inline auto precompile_name(std::string s) // NOLINT
{
return match::make_basic_pred_matcher([=](instruction_ref ins) {
if(ins->name() != "gpu::precompile_op")
return false;
auto op = from_value<operation>(ins->get_operator().to_value().at("op"));
return (op.name() == s);
});
}
template <class... Ms>
auto conv_bias_pointwise(Ms... ms)
{
return precompile_name("pointwise")(
match::either_arg(0, 1)(bias_shape(match::used_once()).bind("bias"),
fusable_conv(match::used_once()).bind("conv")),
ms...);
}
struct find_conv_bias
......@@ -681,9 +829,9 @@ struct find_conv_bias
match::output(match::name(std::unordered_set<std::string>{"gpu::relu"}))));
}
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
apply_conv_bias<miopen_conv_bias>(*ctx, p, std::move(r));
apply_conv_bias<miopen_conv_bias>(*ctx, m, r);
}
};
......@@ -692,9 +840,48 @@ struct find_conv_bias_relu
context* ctx = nullptr;
auto matcher() const { return match::name("gpu::relu")(match::arg(0)(conv_bias())); }
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
apply_conv_bias<miopen_conv_bias_relu>(*ctx, m, r);
}
};
struct find_conv_pointwise
{
context* ctx = nullptr;
auto matcher() const
{
return precompile_name("pointwise")(
match::nargs(3),
match::either_arg(0, 1)(bias_shape(match::used_once()).bind("bias"),
fusable_conv(match::used_once()).bind("conv")));
}
void apply(module& m, const match::matcher_result& r) const
{
apply_conv_bias<miopen_conv_bias_relu>(*ctx, p, std::move(r));
auto conv_ins = r.instructions["conv"];
auto bias_ins = r.instructions["bias"];
auto ins = r.result;
auto input_ins = conv_ins->inputs().at(0);
auto weights_ins = conv_ins->inputs().at(1);
auto conv_op = any_cast<miopen_convolution>(conv_ins->get_operator()).op;
auto alloc_ins = ins->inputs().back();
module_ref pm = ins->module_inputs().front();
miopen_fusion op{};
op.ops.push_back({{conv_op}});
for(auto&& i : *pm)
{
if(i.name()[0] == '@')
continue;
op.ops.push_back({{i.get_operator()}});
}
std::vector<instruction_ref> inputs = {input_ins, weights_ins, bias_ins, alloc_ins};
auto v = op.compile(*ctx, ins->get_shape(), to_shapes(inputs));
if(not v.is_object())
return;
m.replace_instruction(ins, op, inputs);
}
};
......@@ -708,7 +895,7 @@ struct find_gemm_add
match::name("gpu::gemm")(match::nargs(3)).bind("gemm")));
}
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto gemm_ins = r.instructions["gemm"];
......@@ -717,12 +904,7 @@ struct find_gemm_add
auto gemm = any_cast<rocblas_gemm<op::dot>>(gemm_ins->get_operator());
// Already fused gemm
if(not float_equal(gemm.op.beta, 0))
return;
if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [](auto i) {
return not i->get_shape().standard();
}))
if(not float_equal(gemm.beta, 0))
return;
auto inputs = gemm_ins->inputs();
......@@ -731,15 +913,62 @@ struct find_gemm_add
auto copy_ins = c_ins;
// Insert copy
if(ins == p.end() or c_ins->outputs().size() > 1 or c_ins->inputs().empty())
if(ins == m.end() or c_ins->outputs().size() > 1 or c_ins->inputs().empty())
{
copy_ins = p.insert_instruction(ins, hip_copy{}, c_ins, ins->inputs().back());
copy_ins = m.insert_instruction(ins, hip_copy{}, c_ins, ins->inputs().back());
}
inputs.push_back(copy_ins);
inputs.push_back(copy_ins);
gemm.op.beta = 1;
p.replace_instruction(ins, gemm, inputs);
gemm.beta = 1;
m.replace_instruction(ins, gemm, inputs);
}
};
auto pointwise_name(const std::string& s)
{
return precompile_name("pointwise")(match::make_basic_pred_matcher([=](auto ins) {
module_ref pm = ins->module_inputs().front();
auto n = std::count_if(pm->begin(), pm->end(), [&](auto& i) { return i.name() == s; });
if(n != 1)
return false;
return std::all_of(pm->begin(), pm->end(), [&](auto& i) {
return starts_with(i.name(), "@") or i.name() == s;
});
}));
}
struct find_gemm_pointwise
{
auto matcher() const
{
return pointwise_name("add")(
match::nargs(3),
match::all_of[match::inputs()](match::standard_shape()),
match::either_arg(0, 1)(match::used_once().bind("c"),
match::name("gpu::gemm")(match::nargs(3)).bind("gemm")));
}
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto gemm_ins = r.instructions["gemm"];
auto c_ins = r.instructions["c"];
auto gemm = any_cast<rocblas_gemm<op::dot>>(gemm_ins->get_operator());
// Already fused gemm
if(not float_equal(gemm.beta, 0))
return;
auto inputs = gemm_ins->inputs();
inputs.pop_back();
inputs.push_back(c_ins);
inputs.push_back(ins->inputs().back());
gemm.beta = 1;
m.replace_instruction(ins, gemm, inputs);
}
};
......@@ -750,23 +979,24 @@ struct find_commutative_broadcast
return match::name("gpu::add", "gpu::mul")(match::arg(1)(match::broadcast_shape()));
}
void apply(module& p, const match::matcher_result& r) const
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto args = ins->inputs();
move_broadcasted_back(args);
p.replace_instruction(ins, ins->get_operator(), args);
m.replace_instruction(ins, ins->get_operator(), args);
}
};
void fuse_ops::apply(module& p) const
void fuse_ops::apply(module& m) const
{
match::find_matches(p, find_gelu{}, find_gelu_new{fast_math});
run_passes(p, {dead_code_elimination{}});
match::find_matches(p, find_triadd{});
match::find_matches(p,
match::find_matches(m, find_gelu{}, find_gelu_new{fast_math});
run_passes(m, {dead_code_elimination{}});
match::find_matches(m, find_triadd{});
match::find_matches(m,
find_layernorm{},
find_conv_pointwise{ctx},
find_conv_bias_relu{ctx},
find_conv_bias{ctx},
find_add_gelu{},
......@@ -777,8 +1007,12 @@ void fuse_ops::apply(module& p) const
find_add_unary{"gpu::sigmoid", hip_add_sigmoid{}, hip_triadd_sigmoid{}},
find_add_unary{"gpu::tanh", hip_add_tanh{}, hip_triadd_tanh{}},
find_add_clip{});
run_passes(p, {dead_code_elimination{}});
match::find_matches(p, find_triadd_layernorm{}, find_gemm_add{}, find_commutative_broadcast{});
run_passes(m, {dead_code_elimination{}});
match::find_matches(m,
find_triadd_layernorm{},
find_gemm_add{},
find_gemm_pointwise{},
find_commutative_broadcast{});
}
} // namespace gpu
......
#include <rocblas.h>
#include <migraphx/gpu/gemm_impl.hpp>
#include <migraphx/reduce_dims.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -27,6 +28,22 @@ rocblas_datatype get_type(shape::type_t type)
MIGRAPHX_THROW("ROCBLAS_GEMM: data type not supported!");
}
void blas_shape(const shape& s)
{
if(s.lens().size() < 2)
return;
if(std::none_of(s.strides().end() - 2, s.strides().end(), [&](auto i) { return i == 1; }))
MIGRAPHX_THROW("GPU_GEMM: needs to have one matrix stride as 1");
if(s.lens().size() < 3)
return;
shape batch_shape{s.type(),
{s.lens().begin(), s.lens().end() - 2},
{s.strides().begin(), s.strides().end() - 2}};
auto batch_shapes = reduce_dims({batch_shape});
if(batch_shapes.front().lens().size() != 1)
MIGRAPHX_THROW("GPU_GEMM: Batch dimension is not collapsible");
}
template <class R, class... Ts, class... Us>
R rocblas_invoke(R (*f)(Ts...), Us... xs)
{
......@@ -36,16 +53,29 @@ R rocblas_invoke(R (*f)(Ts...), Us... xs)
return f(xs..., nullptr, nullptr);
}
static bool is_transposed(const shape& s)
{
if(not s.transposed())
return false;
return s.strides().back() != 1;
}
static rocblas_int get_batch_stride(const argument& a)
{
return a.get_shape().strides()[a.get_shape().strides().size() - 3];
}
template <class T>
void gemm_impl(context& ctx,
const shape& output_shape,
const std::vector<argument>& args,
T alpha,
T beta,
bool int8_x4_format)
bool int8_x4_format,
bool compute_fp32)
{
bool transa = args[0].get_shape().transposed();
bool transb = args[1].get_shape().transposed();
bool transa = is_transposed(args[0].get_shape());
bool transb = is_transposed(args[1].get_shape());
auto n_dim = output_shape.lens().size();
auto dim_1 = n_dim - 1;
auto dim_0 = n_dim - 2;
......@@ -65,6 +95,11 @@ void gemm_impl(context& ctx,
output_type = rocblas_datatype_i32_r;
}
auto compute_type = output_type;
if(compute_fp32)
{
if(arg_type == rocblas_datatype_f16_r)
compute_type = rocblas_datatype_f32_r;
}
#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
rocblas_gemm_flags flag =
......@@ -77,8 +112,19 @@ void gemm_impl(context& ctx,
auto a_lens = args[0].get_shape().lens();
auto b_lens = args[1].get_shape().lens();
output_shape.visit_type([&](auto as) {
auto alpha_r = as(alpha);
auto beta_r = as(beta);
auto alpha_r = as(alpha);
auto beta_r = as(beta);
// use void pointer to select different data type if using fp32 mode
void* alpha_v = &alpha_r;
void* beta_v = &beta_r;
if(compute_fp32)
{
alpha_v = &alpha;
beta_v = &beta;
}
auto out_lens = output_shape.lens();
rocblas_int m = out_lens[dim_0];
rocblas_int n = out_lens[dim_1];
......@@ -104,14 +150,14 @@ void gemm_impl(context& ctx,
n,
m,
k,
&alpha_r,
alpha_v,
to_pointer(args.at(1)),
arg_type,
ldb,
to_pointer(args.at(0)),
arg_type,
lda,
&beta_r,
beta_v,
to_pointer(args[2]),
output_type,
ldc,
......@@ -125,6 +171,9 @@ void gemm_impl(context& ctx,
}
else
{
auto a_stride = get_batch_stride(args[0]);
auto b_stride = get_batch_stride(args[1]);
auto c_stride = get_batch_stride(args[2]);
rocblas_invoke(&rocblas_gemm_strided_batched_ex,
ctx.get_stream().get_rocblas(),
transb ? rocblas_operation_transpose : rocblas_operation_none,
......@@ -132,24 +181,24 @@ void gemm_impl(context& ctx,
n,
m,
k,
&alpha_r,
alpha_v,
to_pointer(args.at(1)),
arg_type,
ldb,
k * n,
b_stride,
to_pointer(args.at(0)),
arg_type,
lda,
m * k,
&beta_r,
a_stride,
beta_v,
to_pointer(args[2]),
output_type,
ldc,
m * n,
c_stride,
is_3inputs ? to_pointer(args[3]) : to_pointer(args[2]),
output_type,
ldc,
m * n,
c_stride,
num_matrices,
compute_type,
rocblas_gemm_algo_standard,
......@@ -164,9 +213,10 @@ void gemm(context& ctx,
const std::vector<argument>& args,
float alpha,
float beta,
bool int8_x4_format)
bool int8_x4_format,
bool compute_fp32)
{
gemm_impl(ctx, output_shape, args, alpha, beta, int8_x4_format);
gemm_impl(ctx, output_shape, args, alpha, beta, int8_x4_format, compute_fp32);
}
void gemm(context& ctx,
......@@ -174,9 +224,10 @@ void gemm(context& ctx,
const std::vector<argument>& args,
int32_t alpha,
int32_t beta,
bool int8_x4_format)
bool int8_x4_format,
bool compute_fp32)
{
gemm_impl(ctx, output_shape, args, alpha, beta, int8_x4_format);
gemm_impl(ctx, output_shape, args, alpha, beta, int8_x4_format, compute_fp32);
}
} // namespace gpu
......
......@@ -27,6 +27,15 @@ using hip_host_ptr = MIGRAPHX_MANAGE_PTR(void, hipHostUnregister);
std::string hip_error(int error) { return hipGetErrorString(static_cast<hipError_t>(error)); }
bool is_device_ptr(const void* ptr)
{
hipPointerAttribute_t attr;
auto status = hipPointerGetAttributes(&attr, ptr);
if(status != hipSuccess)
return false;
return attr.memoryType == hipMemoryTypeDevice;
}
std::size_t get_available_gpu_memory()
{
size_t free;
......@@ -50,8 +59,8 @@ hip_ptr allocate_gpu(std::size_t sz, bool host = false)
{
if(sz > get_available_gpu_memory())
MIGRAPHX_THROW("Memory not available to allocate buffer: " + std::to_string(sz));
void* result;
auto status = host ? hipHostMalloc(&result, sz) : hipMalloc(&result, sz);
void* result = nullptr;
auto status = host ? hipHostMalloc(&result, sz) : hipMalloc(&result, sz);
if(status != hipSuccess)
{
if(host)
......@@ -59,6 +68,7 @@ hip_ptr allocate_gpu(std::size_t sz, bool host = false)
else
return allocate_gpu(sz, true);
}
assert(result != nullptr);
return hip_ptr{result};
}
......@@ -75,6 +85,8 @@ std::vector<T> read_from_gpu(const void* x, std::size_t sz)
{
gpu_sync();
std::vector<T> result(sz);
assert(not is_device_ptr(result.data()));
assert(is_device_ptr(x));
auto status = hipMemcpy(result.data(), x, sz * sizeof(T), hipMemcpyDeviceToHost);
if(status != hipSuccess)
MIGRAPHX_THROW("Copy from gpu failed: " + hip_error(status)); // NOLINT
......@@ -85,6 +97,8 @@ hip_ptr write_to_gpu(const void* x, std::size_t sz, bool host = false)
{
gpu_sync();
auto result = allocate_gpu(sz, host);
assert(is_device_ptr(result.get()));
assert(not is_device_ptr(x));
auto status = hipMemcpy(result.get(), x, sz, hipMemcpyHostToDevice);
if(status != hipSuccess)
MIGRAPHX_THROW("Copy to gpu failed: " + hip_error(status));
......@@ -109,10 +123,9 @@ argument register_on_gpu(const argument& arg)
{
auto arg_shared = arg.share();
auto p = share(register_on_gpu(arg_shared.data(), arg_shared.get_shape().bytes()));
return {arg_shared.get_shape(),
[ p, a = std::move(arg_shared) ]() mutable {return get_device_ptr(p.get());
}
}; // namespace gpu
return {arg_shared.get_shape(), [p, a = std::move(arg_shared)]() mutable {
return get_device_ptr(p.get());
}}; // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
argument to_gpu(const argument& arg, bool host)
......@@ -169,12 +182,26 @@ void gpu_copy(context& ctx, const argument& src, const argument& dst)
void copy_to_gpu(context& ctx, const argument& src, const argument& dst)
{
gpu_copy(ctx, register_on_gpu(src), dst);
if(src.get_shape() == dst.get_shape() and dst.get_shape().packed())
{
hip_async_copy(ctx, src, dst, hipMemcpyHostToDevice);
}
else
{
gpu_copy(ctx, register_on_gpu(src), dst);
}
}
void copy_from_gpu(context& ctx, const argument& src, const argument& dst)
{
gpu_copy(ctx, src, register_on_gpu(dst));
if(src.get_shape() == dst.get_shape() and dst.get_shape().packed())
{
hip_async_copy(ctx, src, dst, hipMemcpyDeviceToHost);
}
else
{
gpu_copy(ctx, src, register_on_gpu(dst));
}
}
argument get_preallocation(context& ctx, const std::string& id)
......
......@@ -3,6 +3,7 @@
#include <migraphx/config.hpp>
#include <migraphx/operation.hpp>
#include <migraphx/instruction_ref.hpp>
#include <string>
namespace migraphx {
......
......@@ -11,7 +11,7 @@ struct module;
namespace gpu {
std::vector<stream_race> analyze_streams(const module& p);
std::vector<stream_race> analyze_streams(const module& m);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
......
......@@ -34,6 +34,10 @@ struct code_object_op
f(self.output, "output"));
}
value attributes() const { return {{"group", group()}}; }
std::string group() const { return "gpu::code_object::" + symbol_name; }
std::string name() const { return "gpu::code_object"; }
shape compute_shape(std::vector<shape> inputs) const;
argument
......
#ifndef MIGRAPHX_GUARD_GPU_COMPILE_GEN_HPP
#define MIGRAPHX_GUARD_GPU_COMPILE_GEN_HPP
#include <migraphx/config.hpp>
#include <string>
#include <unordered_map>
#include <vector>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct shape;
namespace gpu {
namespace gen {
struct vectorize
{
std::size_t size = 1;
std::size_t axis = 0;
static vectorize elements(std::size_t axis, const std::vector<shape>& inputs);
std::string str() const;
};
struct preload
{
std::vector<bool> args = {};
static preload broadcasts(std::size_t axis, const std::vector<shape>& inputs);
bool is_preloading() const;
std::string str() const;
};
std::size_t find_fast_axis(const std::vector<shape>& inputs);
std::string make_transformer_args(std::vector<std::string> transformers);
template <class... Ts>
std::string make_transformer_args(Ts... xs)
{
return make_transformer_args({xs.str()...});
}
} // namespace gen
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILE_GEN_HPP
......@@ -15,6 +15,8 @@ namespace gpu {
std::vector<std::vector<char>>
compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std::string& arch);
std::string enum_params(std::size_t count, std::string param);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......
......@@ -8,6 +8,8 @@ namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context;
struct hip_compile_options
{
std::size_t global;
......@@ -16,11 +18,36 @@ struct hip_compile_options
shape output;
std::string kernel_name = "kernel";
std::string params = "";
std::vector<shape> reduced_inputs = {};
std::vector<shape> virtual_inputs = {};
/**
* @brief Set the launch parameters but allow v to override the values
*
* @param v A value class which can have a "global" and/or "local" keys to override the default
* global and local
* @param compute_global A function used to compute the global based on the local
* @param default_local The defaul local to use if its missing from the v parameter
*/
void set_launch_params(const value& v,
const std::function<std::size_t(std::size_t local)>& compute_global,
std::size_t default_local = 1024);
void
set_launch_params(const value& v, std::size_t default_global, std::size_t default_local = 1024)
{
set_launch_params(
v, [=](auto) { return default_global; }, default_local);
}
};
/// Compute global for n elements, but max out on target-specific upper limit
std::function<std::size_t(std::size_t local)>
compute_global_for(context& ctx, std::size_t n, std::size_t over = 1);
operation compile_hip_code_object(const std::string& content, hip_compile_options options);
std::size_t compute_block_size(std::size_t n, std::size_t max_block_size = 1024);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......
#ifndef MIGRAPHX_GUARD_GPU_COMPILE_POINTWISE_HPP
#define MIGRAPHX_GUARD_GPU_COMPILE_POINTWISE_HPP
#ifndef MIGRAPHX_GUARD_GPU_COMPILE_OPS_HPP
#define MIGRAPHX_GUARD_GPU_COMPILE_OPS_HPP
#include <migraphx/config.hpp>
#include <migraphx/operation.hpp>
#include <string>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct module;
namespace gpu {
struct context;
operation
compile_pointwise(context& ctx, const std::vector<shape>& inputs, const std::string& lambda);
struct compile_ops
{
context* ctx = nullptr;
std::string name() const { return "gpu::compile_ops"; }
void apply(module& m) const;
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILE_POINTWISE_HPP
#endif // MIGRAPHX_GUARD_GPU_COMPILE_OPS_HPP
#ifndef MIGRAPHX_GUARD_GPU_COMPILER_HPP
#define MIGRAPHX_GUARD_GPU_COMPILER_HPP
#include <migraphx/config.hpp>
#include <migraphx/auto_register.hpp>
#include <migraphx/operation.hpp>
#include <migraphx/value.hpp>
#include <migraphx/module.hpp>
#include <migraphx/instruction.hpp>
#include <functional>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context;
using compiler_replace = std::function<void(module& m, instruction_ref ins)>;
using compiler_compile = std::function<compiler_replace(context&, instruction_ref, operation)>;
using compiler_compile_op =
std::function<operation(context&, const std::vector<shape>& inputs, const value&)>;
void register_compiler(const std::string& name, compiler_compile c, compiler_compile_op cop);
bool has_compiler_for(const std::string& name);
compiler_replace compile(context& ctx, instruction_ref ins, const operation& op);
operation
compile_op(const std::string& name, context& ctx, const std::vector<shape>& inputs, const value& v);
template <class T>
void register_compiler()
{
T c;
for(auto&& name : c.names())
{
register_compiler(
name,
[=](auto&&... xs) { return c.compile(std::forward<decltype(xs)>(xs)...); },
[=](auto&&... xs) { return c.compile_op(std::forward<decltype(xs)>(xs)...); });
}
}
struct register_compiler_action
{
template <class T>
static void apply()
{
register_compiler<T>();
}
};
template <class T>
using auto_register_compiler = auto_register<register_compiler_action, T>;
template <class Derived>
struct compiler : auto_register_compiler<Derived>
{
auto replace(const operation& op) const
{
return
[=](module& m, instruction_ref ins) { m.replace_instruction(ins, op, ins->inputs()); };
}
operation compile_op(context&, const std::vector<shape>&, const value&) const { return {}; }
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILER_HPP
......@@ -154,6 +154,13 @@ struct hip_device
std::size_t get_cu_count() const { return device_props.multiProcessorCount; }
std::size_t get_max_workitems_per_cu() const
{
return device_props.maxThreadsPerMultiProcessor;
}
std::size_t get_max_workitems_per_block() const { return device_props.maxThreadsPerBlock; }
private:
std::size_t device_id = 0;
std::size_t current_stream = 0;
......@@ -235,6 +242,8 @@ struct context
this->current_device = std::make_shared<hip_device>(0, n_streams);
}
any_ptr get_queue() { return get_stream().get(); }
private:
// TODO: Make this a vector to support multiple devices
std::shared_ptr<hip_device> current_device;
......
......@@ -76,8 +76,9 @@ void arg_op(Op op, hipStream_t stream, const argument& result, const argument& a
size_t batch_item_num = batch_lens[axis];
batch_lens[axis] = 1;
migraphx::shape batch_shape{arg_shape.type(), batch_lens};
migraphx::shape std_arg_shape{arg_shape.type(), arg_shape.lens()};
hip_visit_all(arg, arg_shape, batch_shape)([&](auto input, auto arg_s, auto batch_s) {
hip_visit_all(arg, std_arg_shape, batch_shape)([&](auto input, auto arg_s, auto batch_s) {
auto* output = device_cast(result.get<int64_t>().data());
using type = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
// use one block for items in one batch.
......
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_FILL_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_FILL_HPP
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void fill(hipStream_t stream, const argument& result, unsigned long val);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_MULTINOMIAL_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_MULTINOMIAL_HPP
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void multinomial(hipStream_t stream,
const argument& result,
const argument& arg0,
const argument& arg1);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_NONZERO_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_NONZERO_HPP
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
argument nonzero(hipStream_t stream, const argument& result, const argument& arg_data);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
......@@ -10,7 +10,12 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void prefix_scan_sum(hipStream_t stream, const argument& result, const argument& arg, int32_t axis);
void prefix_scan_sum(hipStream_t stream,
const argument& result,
const argument& arg,
int32_t axis,
bool exclusive,
bool reverse);
} // namespace device
} // namespace gpu
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment