Commit eb0d8fee authored by Paul

Merge branch 'develop' into driver

parents 65ef35cd 0d796941
......@@ -2,10 +2,16 @@
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <migraphx/program.hpp>
#include <migraphx/quantization.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/cpu/target.hpp>
#include <migraphx/onnx.hpp>
#include <migraphx/stringutils.hpp>
#ifdef ENABLE_TF
#include <migraphx/tf.hpp>
#else
#include <migraphx/onnx.hpp>
#endif
#ifdef HAVE_GPU
#include <migraphx/gpu/target.hpp>
#include <migraphx/gpu/hip.hpp>
......@@ -28,6 +34,11 @@ struct throw_half
{
throw std::runtime_error("Half not supported in python yet.");
}
void operator()(migraphx::tensor_view<migraphx::half>) const
{
throw std::runtime_error("Half not supported in python yet.");
}
};
template <class F>
......@@ -42,6 +53,8 @@ struct skip_half
}
void operator()(migraphx::shape::as<migraphx::half>) const {}
void operator()(migraphx::tensor_view<migraphx::half>) const {}
};
template <class F>
......@@ -50,6 +63,12 @@ void visit_type(const migraphx::shape& s, F f)
s.visit_type(throw_half<F>{f});
}
template <class T, class F>
void visit(const migraphx::raw_data<T>& x, F f)
{
x.visit(throw_half<F>{f});
}
template <class F>
void visit_types(F f)
{
......@@ -60,6 +79,9 @@ template <class T>
py::buffer_info to_buffer_info(T& x)
{
migraphx::shape s = x.get_shape();
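// pybind11's buffer protocol reports strides in bytes, while migraphx::shape
// stores strides in elements, so scale each stride by the element size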
auto strides = s.strides();
std::transform(
strides.begin(), strides.end(), strides.begin(), [&](auto i) { return i * s.type_size(); });
py::buffer_info b;
visit_type(s, [&](auto as) {
b = py::buffer_info(x.data(),
......@@ -67,7 +89,7 @@ py::buffer_info to_buffer_info(T& x)
py::format_descriptor<decltype(as())>::format(),
s.lens().size(),
s.lens(),
s.strides());
strides);
});
return b;
}
......@@ -75,11 +97,20 @@ py::buffer_info to_buffer_info(T& x)
migraphx::shape to_shape(const py::buffer_info& info)
{
migraphx::shape::type_t t;
std::size_t n = 0;
visit_types([&](auto as) {
if(info.format == py::format_descriptor<decltype(as())>::format())
{
t = as.type_enum();
n = sizeof(as());
}
});
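// convert the byte strides reported by the buffer protocol back to element
// strides; n is the element size deduced from the buffer's format string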
auto strides = info.strides;
std::transform(strides.begin(), strides.end(), strides.begin(), [&](auto i) -> std::size_t {
return n > 0 ? i / n : 0;
});
return migraphx::shape{t, info.shape, info.strides};
return migraphx::shape{t, info.shape, strides};
}
PYBIND11_MODULE(migraphx, m)
......@@ -108,6 +139,13 @@ PYBIND11_MODULE(migraphx, m)
py::buffer_info info = b.request();
new(&x) migraphx::argument(to_shape(info), info.ptr);
})
.def("get_shape", &migraphx::argument::get_shape)
.def("tolist",
[](migraphx::argument& x) {
py::list l{x.get_shape().elements()};
visit(x, [&](auto data) { l = py::cast(data.to_vector()); });
return l;
})
.def("__eq__", std::equal_to<migraphx::argument>{})
.def("__ne__", std::not_equal_to<migraphx::argument>{})
.def("__repr__", [](const migraphx::argument& x) { return migraphx::to_string(x); });
......@@ -123,8 +161,16 @@ PYBIND11_MODULE(migraphx, m)
.def("__ne__", std::not_equal_to<migraphx::program>{})
.def("__repr__", [](const migraphx::program& p) { return migraphx::to_string(p); });
#ifdef ENABLE_TF
m.def("parse_tf",
&migraphx::parse_tf,
"Parse tf protobuf (default format is nhwc)",
py::arg("filename"),
py::arg("is_nhwc") = true);
#else
m.def("parse_onnx", &migraphx::parse_onnx);
#endif
m.def("get_target", [](const std::string& name) -> migraphx::target {
if(name == "cpu")
return migraphx::cpu::target{};
......@@ -136,6 +182,10 @@ PYBIND11_MODULE(migraphx, m)
});
m.def("generate_argument", &migraphx::generate_argument, py::arg("s"), py::arg("seed") = 0);
m.def("quantize", [](migraphx::program& p, std::vector<std::string>& ins_names) {
migraphx::quantize(p, ins_names);
});
m.def("quantize", [](migraphx::program& p) { migraphx::quantize(p, {"all"}); });
#ifdef HAVE_GPU
m.def("allocate_gpu", &migraphx::gpu::allocate_gpu, py::arg("s"), py::arg("host") = false);
......
#include <migraphx/quantization.hpp>
#include <migraphx/program.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/op/convert.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/ranges.hpp>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
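// Insert a convert instruction to `type` right after `ins`, memoizing the
// result in map_fp16 so that each instruction is converted at most once.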
instruction_ref insert_fp16(program& prog,
instruction_ref& ins,
shape::type_t type,
std::unordered_map<instruction_ref, instruction_ref>& map_fp16)
{
if(map_fp16.count(ins) > 0)
{
return map_fp16[ins];
}
assert(ins->get_shape().type() == shape::float_type ||
ins->get_shape().type() == shape::double_type);
instruction_ref ins_fp16{};
ins_fp16 = prog.insert_instruction(std::next(ins), op::convert{type}, ins);
map_fp16[ins] = ins_fp16;
return ins_fp16;
}
void quantize(program& prog, const std::vector<std::string>& ins_names)
{
std::unordered_map<instruction_ref, instruction_ref> map_fp16;
for(auto ins : iterator_for(prog))
{
// "all" indicates that every instruction is converted
if((not contains(ins_names, "all")) and (not contains(ins_names, ins->name())))
{
continue;
}
shape::type_t orig_type = ins->get_shape().type();
// process all inputs; if an input is fp32 or fp64, convert it
// to fp16 by adding a convert operator.
auto inputs = ins->inputs();
std::vector<instruction_ref> converted_inputs;
for(auto input : inputs)
{
auto s = input->get_shape();
if(s.type() == shape::float_type || s.type() == shape::double_type)
{
// if the input is a convert operator, use its input
// as the current input
instruction_ref input_fp16{};
if(input->name() == "convert")
{
input_fp16 = input->inputs().front();
}
else
{
input_fp16 = insert_fp16(prog, input, shape::half_type, map_fp16);
}
converted_inputs.push_back(input_fp16);
}
else
{
converted_inputs.push_back(input);
}
}
// no input was changed, so go to the next instruction
if(inputs == converted_inputs)
{
continue;
}
auto op = ins->get_operator();
auto ins_shape = compute_shape(op, converted_inputs);
if(ins_shape.type() != orig_type)
{
// insert another convert instruction to convert it back
if(ins == std::prev(prog.end()))
{
prog.add_instruction(op::convert{orig_type}, ins);
}
else
{
// check for the dead-code case to avoid an assert
bool output_empty = ins->outputs().empty();
auto ins_orig_type =
prog.insert_instruction(std::next(ins), op::convert{orig_type}, ins);
if(!output_empty)
{
prog.replace_instruction(ins, ins_orig_type);
}
}
}
prog.replace_instruction(ins, op, converted_inputs);
}
}
void quantize(program& prog) { quantize(prog, {"all"}); }
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -4,6 +4,7 @@
#include <migraphx/operators.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/op/common.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -16,11 +17,14 @@ void rewrite_rnn::apply(program& prog) const
{
apply_vanilla_rnn(prog, ins);
}
if(ins->name() == "gru")
else if(ins->name() == "gru")
{
apply_gru(prog, ins);
}
else if(ins->name() == "lstm")
{
apply_lstm(prog, ins);
}
}
}
......@@ -210,7 +214,7 @@ std::vector<instruction_ref> rewrite_rnn::vanilla_rnn_cell(bool is_forward,
auto wb = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sbias);
auto rb = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sbias);
auto b = prog.insert_instruction(ins, op::add{}, wb, rb);
bias = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape()}, b);
bias = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape().lens()}, b);
}
instruction_ref hidden_out = prog.end();
......@@ -517,25 +521,26 @@ std::vector<instruction_ref> rewrite_rnn::gru_cell(bool is_forward,
instruction_ref brcst_bh{};
if(bias != prog.end())
{
auto sbias = prog.insert_instruction(ins, op::squeeze{{0}}, bias);
auto wbz = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sbias);
auto wbr = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sbias);
auto wbh = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, sbias);
brcst_wbh = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape()}, wbh);
auto broadcast_lens = sih->get_shape().lens();
auto sbias = prog.insert_instruction(ins, op::squeeze{{0}}, bias);
auto wbz = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sbias);
auto wbr = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sbias);
auto wbh = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, sbias);
brcst_wbh = prog.insert_instruction(ins, op::broadcast{1, broadcast_lens}, wbh);
auto rbz = prog.insert_instruction(ins, op::slice{{0}, {3 * hs}, {4 * hs}}, sbias);
auto rbr = prog.insert_instruction(ins, op::slice{{0}, {4 * hs}, {5 * hs}}, sbias);
auto rbh = prog.insert_instruction(ins, op::slice{{0}, {5 * hs}, {6 * hs}}, sbias);
brcst_rbh = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape()}, rbh);
brcst_rbh = prog.insert_instruction(ins, op::broadcast{1, broadcast_lens}, rbh);
auto bz = prog.insert_instruction(ins, op::add{}, wbz, rbz);
brcst_bz = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape()}, bz);
brcst_bz = prog.insert_instruction(ins, op::broadcast{1, broadcast_lens}, bz);
auto br = prog.insert_instruction(ins, op::add{}, wbr, rbr);
brcst_br = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape()}, br);
brcst_br = prog.insert_instruction(ins, op::broadcast{1, broadcast_lens}, br);
auto bh = prog.insert_instruction(ins, op::add{}, wbh, rbh);
brcst_bh = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape()}, bh);
brcst_bh = prog.insert_instruction(ins, op::broadcast{1, broadcast_lens}, bh);
}
for(long i = 0; i < seq_len; i++)
......@@ -664,5 +669,514 @@ std::vector<operation> rewrite_rnn::gru_actv_funcs(instruction_ref ins) const
}
}
// for lstm operators
void rewrite_rnn::apply_lstm(program& prog, instruction_ref ins) const
{
assert(ins->name() == "lstm");
auto args = ins->inputs();
shape seq_shape = args[0]->get_shape();
std::size_t hidden_size = args[2]->get_shape().lens()[2];
std::size_t batch_size = seq_shape.lens()[1];
shape::type_t type = seq_shape.type();
migraphx::shape ihc_shape{type, {1, batch_size, hidden_size}};
std::vector<float> ihc_data(ihc_shape.elements(), 0.0);
migraphx::shape pph_shape{type, {1, 3 * hidden_size}};
std::vector<float> pph_data(pph_shape.elements(), 0.0);
auto actv_funcs = lstm_actv_funcs(ins);
auto lstm_op = any_cast<op::lstm>(ins->get_operator());
op::rnn_direction dirct = lstm_op.direction;
instruction_ref last_output{};
instruction_ref last_cell_output{};
if(dirct == op::rnn_direction::bidirectional)
{
// input weight matrix
auto w_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[1]);
auto w_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[1]);
// hidden state weight matrix
auto r_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[2]);
auto r_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[2]);
// process bias
instruction_ref bias_forward = prog.end();
instruction_ref bias_reverse = prog.end();
if(args.size() >= 4 && args[3]->name() != "undefined")
{
bias_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[3]);
bias_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[3]);
}
// process initial hidden state (the 6th argument)
instruction_ref ih_forward{};
instruction_ref ih_reverse{};
if(args.size() >= 6 && args[5]->name() != "undefined")
{
ih_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[5]);
ih_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[5]);
}
else
{
ih_forward = prog.add_literal(migraphx::literal{ihc_shape, ihc_data});
ih_reverse = prog.add_literal(migraphx::literal{ihc_shape, ihc_data});
}
// process initial cell value
instruction_ref ic_forward{};
instruction_ref ic_reverse{};
if(args.size() >= 7 && args[6]->name() != "undefined")
{
ic_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[6]);
ic_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[6]);
}
else
{
ic_forward = prog.add_literal(migraphx::literal{ihc_shape, ihc_data});
ic_reverse = prog.add_literal(migraphx::literal{ihc_shape, ihc_data});
}
// process weight of the peephole
instruction_ref pph_forward = prog.end();
instruction_ref pph_reverse = prog.end();
if(args.size() == 8 && args[7]->name() != "undefined")
{
pph_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[7]);
pph_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[7]);
}
auto ret_forward = lstm_cell(
true,
prog,
ins,
{args[0], w_forward, r_forward, bias_forward, ih_forward, ic_forward, pph_forward},
actv_funcs.at(0),
actv_funcs.at(1),
actv_funcs.at(2));
auto ret_reverse = lstm_cell(
false,
prog,
ins,
{args[0], w_reverse, r_reverse, bias_reverse, ih_reverse, ic_reverse, pph_reverse},
actv_funcs.at(3),
actv_funcs.at(4),
actv_funcs.at(5));
auto concat_output =
prog.insert_instruction(ins, op::concat{1}, ret_forward[1], ret_reverse[1]);
last_output = prog.insert_instruction(ins, op::squeeze{{0}}, concat_output);
// last cell output
last_cell_output =
prog.insert_instruction(ins, op::concat{0}, ret_forward[2], ret_reverse[2]);
// the following logic ensures the lstm is replaced by a concat instruction
if(ret_forward[0] == prog.end())
{
prog.replace_instruction(ins, op::concat{1}, ret_forward[1], ret_reverse[1]);
}
else
{
ret_forward[0] =
prog.insert_instruction(ins, op::concat{0}, ret_forward[0], ret_forward[1]);
ret_reverse[0] =
prog.insert_instruction(ins, op::concat{0}, ret_reverse[1], ret_reverse[0]);
prog.replace_instruction(ins, op::concat{1}, {ret_forward[0], ret_reverse[0]});
}
}
else
{
bool is_forward = (dirct == op::rnn_direction::forward);
// weight matrices
auto w = args[1];
auto r = args[2];
// bias
instruction_ref bias = prog.end();
if(args.size() >= 4 && args[3]->name() != "undefined")
{
bias = args[3];
}
// initial hidden state
instruction_ref ih{};
if(args.size() >= 6 && args[5]->name() != "undefined")
{
ih = args[5];
}
else
{
ih = prog.add_literal(migraphx::literal{ihc_shape, ihc_data});
}
// initial cell value
instruction_ref ic{};
if(args.size() >= 7 && args[6]->name() != "undefined")
{
ic = args[6];
}
else
{
ic = prog.add_literal(migraphx::literal{ihc_shape, ihc_data});
}
// process weight of the peephole
instruction_ref pph = prog.end();
if(args.size() == 8 && args[7]->name() != "undefined")
{
pph = args[7];
}
auto ret = lstm_cell(is_forward,
prog,
ins,
{args[0], w, r, bias, ih, ic, pph},
actv_funcs.at(0),
actv_funcs.at(1),
actv_funcs.at(2));
last_output = prog.insert_instruction(ins, op::squeeze{{0}}, ret[1]);
last_cell_output = ret[2];
if(ret[0] == prog.end())
{
prog.replace_instruction(ins, op::concat{0}, ret[1]);
}
else
{
auto concat_arg0 = is_forward ? ret[0] : ret[1];
auto concat_arg1 = is_forward ? ret[1] : ret[0];
prog.replace_instruction(ins, op::concat{0}, concat_arg0, concat_arg1);
}
}
// replace each corresponding rnn_last_output instruction with
// last_output, and each lstm_last_cell_output with last_cell_output.
// The while loops handle the case of multiple rnn_last_output and
// lstm_last_cell_output operators
auto last_output_it = ins->outputs().begin();
while(last_output_it != ins->outputs().end())
{
last_output_it = std::find_if(last_output_it, ins->outputs().end(), [](auto i) {
return i->name() == "rnn_last_output";
});
if(last_output_it != ins->outputs().end())
{
prog.replace_instruction(*last_output_it, last_output);
last_output_it++;
}
}
auto last_cell_output_it = ins->outputs().begin();
while(last_cell_output_it != ins->outputs().end())
{
last_cell_output_it = std::find_if(last_cell_output_it, ins->outputs().end(), [](auto i) {
return i->name() == "lstm_last_cell_output";
});
if(last_cell_output_it != ins->outputs().end())
{
prog.replace_instruction(*last_cell_output_it, last_cell_output);
last_cell_output_it++;
}
}
}
std::vector<instruction_ref> rewrite_rnn::lstm_cell(bool is_forward,
program& prog,
instruction_ref ins,
std::vector<instruction_ref> inputs,
const operation& actv_func1,
const operation& actv_func2,
const operation& actv_func3) const
{
// must have 7 args in the input vector
assert(inputs.size() == 7);
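// inputs are {seq, w, r, bias, ih, ic, pph}: the input sequence, input
// weights, recurrence weights, bias, initial hidden state, initial cell
// state, and peephole weights (bias and pph may be prog.end() when absent)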
auto seq = inputs.at(0);
auto w = inputs.at(1);
auto r = inputs.at(2);
auto bias = inputs.at(3);
auto ih = inputs.at(4);
auto ic = inputs.at(5);
auto pph = inputs.at(6);
instruction_ref hidden_states = prog.end();
instruction_ref last_output{};
instruction_ref last_cell_output{};
migraphx::shape seq_shape = seq->get_shape();
migraphx::shape r_shape = r->get_shape();
long seq_len = static_cast<long>(seq_shape.lens()[0]);
long hs = static_cast<long>(r_shape.lens()[2]);
std::vector<int64_t> perm{1, 0};
// w matrix
auto sw = prog.insert_instruction(ins, op::squeeze{{0}}, w);
auto wi = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sw);
auto tran_wi = prog.insert_instruction(ins, op::transpose{perm}, wi);
auto wo = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sw);
auto tran_wo = prog.insert_instruction(ins, op::transpose{perm}, wo);
auto wf = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, sw);
auto tran_wf = prog.insert_instruction(ins, op::transpose{perm}, wf);
auto wc = prog.insert_instruction(ins, op::slice{{0}, {3 * hs}, {4 * hs}}, sw);
auto tran_wc = prog.insert_instruction(ins, op::transpose{perm}, wc);
// r matrix
auto sr = prog.insert_instruction(ins, op::squeeze{{0}}, r);
auto ri = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sr);
auto tran_ri = prog.insert_instruction(ins, op::transpose{perm}, ri);
auto ro = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sr);
auto tran_ro = prog.insert_instruction(ins, op::transpose{perm}, ro);
auto rf = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, sr);
auto tran_rf = prog.insert_instruction(ins, op::transpose{perm}, rf);
auto rc = prog.insert_instruction(ins, op::slice{{0}, {3 * hs}, {4 * hs}}, sr);
auto tran_rc = prog.insert_instruction(ins, op::transpose{perm}, rc);
// initial hidden state
auto sih = prog.insert_instruction(ins, op::squeeze{{0}}, ih);
// initial cell state
auto sic = prog.insert_instruction(ins, op::squeeze{{0}}, ic);
auto ic_lens = sic->get_shape().lens();
// bias
instruction_ref bi_brcst{};
instruction_ref bo_brcst{};
instruction_ref bf_brcst{};
instruction_ref bc_brcst{};
if(bias != prog.end())
{
auto sbias = prog.insert_instruction(ins, op::squeeze{{0}}, bias);
auto bxi = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sbias);
auto bhi = prog.insert_instruction(ins, op::slice{{0}, {4 * hs}, {5 * hs}}, sbias);
auto bi = prog.insert_instruction(ins, op::add{}, bxi, bhi);
bi_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_lens}, bi);
auto bxo = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sbias);
auto bho = prog.insert_instruction(ins, op::slice{{0}, {5 * hs}, {6 * hs}}, sbias);
auto bo = prog.insert_instruction(ins, op::add{}, bxo, bho);
bo_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_lens}, bo);
auto bxf = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, sbias);
auto bhf = prog.insert_instruction(ins, op::slice{{0}, {6 * hs}, {7 * hs}}, sbias);
auto bf = prog.insert_instruction(ins, op::add{}, bxf, bhf);
bf_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_lens}, bf);
auto bxc = prog.insert_instruction(ins, op::slice{{0}, {3 * hs}, {4 * hs}}, sbias);
auto bhc = prog.insert_instruction(ins, op::slice{{0}, {7 * hs}, {8 * hs}}, sbias);
auto bc = prog.insert_instruction(ins, op::add{}, bxc, bhc);
bc_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_lens}, bc);
}
// peephole weights
instruction_ref pphi_brcst{};
instruction_ref ppho_brcst{};
instruction_ref pphf_brcst{};
if(pph != prog.end())
{
auto spph = prog.insert_instruction(ins, op::squeeze{{0}}, pph);
auto pphi = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, spph);
pphi_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_lens}, pphi);
auto ppho = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, spph);
ppho_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_lens}, ppho);
auto pphf = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, spph);
pphf_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_lens}, pphf);
}
for(long i = 0; i < seq_len; ++i)
{
long seq_index = is_forward ? i : (seq_len - 1 - i);
auto xt = prog.insert_instruction(ins, op::slice{{0}, {seq_index}, {seq_index + 1}}, seq);
xt = prog.insert_instruction(ins, op::squeeze{{0}}, xt);
// equation it = f(Xt*(Wi^T) + Ht-1*(Ri^T) + Pi (.) Ct-1 + Wbi + Rbi)
auto xt_wi = prog.insert_instruction(ins, op::dot{}, xt, tran_wi);
auto ht_ri = prog.insert_instruction(ins, op::dot{}, sih, tran_ri);
auto it_before_actv = prog.insert_instruction(ins, op::add{}, xt_wi, ht_ri);
if(pph != prog.end())
{
auto pphi_ct = prog.insert_instruction(ins, op::mul{}, pphi_brcst, sic);
it_before_actv = prog.insert_instruction(ins, op::add{}, it_before_actv, pphi_ct);
}
if(bias != prog.end())
{
it_before_actv = prog.insert_instruction(ins, op::add{}, it_before_actv, bi_brcst);
}
auto it = prog.insert_instruction(ins, actv_func1, it_before_actv);
// equation ft = f(Xt*(Wf^T) + Ht-1*(Rf^T) + Pf (.) Ct-1 + Wbf + Rbf)
auto xt_wf = prog.insert_instruction(ins, op::dot{}, xt, tran_wf);
auto ht_rf = prog.insert_instruction(ins, op::dot{}, sih, tran_rf);
auto ft_before_actv = prog.insert_instruction(ins, op::add{}, xt_wf, ht_rf);
if(pph != prog.end())
{
auto pphf_ct = prog.insert_instruction(ins, op::mul{}, pphf_brcst, sic);
ft_before_actv = prog.insert_instruction(ins, op::add{}, ft_before_actv, pphf_ct);
}
if(bias != prog.end())
{
ft_before_actv = prog.insert_instruction(ins, op::add{}, ft_before_actv, bf_brcst);
}
auto ft = prog.insert_instruction(ins, actv_func1, ft_before_actv);
// equation ct = g(Xt*(Wc^T) + Ht-1*(Rc^T) + Wbc + Rbc)
auto xt_wc = prog.insert_instruction(ins, op::dot{}, xt, tran_wc);
auto ht_rc = prog.insert_instruction(ins, op::dot{}, sih, tran_rc);
auto ct_before_actv = prog.insert_instruction(ins, op::add{}, xt_wc, ht_rc);
if(bias != prog.end())
{
ct_before_actv = prog.insert_instruction(ins, op::add{}, ct_before_actv, bc_brcst);
}
auto ct = prog.insert_instruction(ins, actv_func2, ct_before_actv);
// equation Ct = ft (.) Ct-1 + it (.) ct
auto ft_cell = prog.insert_instruction(ins, op::mul{}, ft, sic);
auto it_ct = prog.insert_instruction(ins, op::mul{}, it, ct);
auto cellt = prog.insert_instruction(ins, op::add{}, ft_cell, it_ct);
last_cell_output = cellt;
// ot = f(Xt*(Wo^T) + Ht-1*(Ro^T) + Po (.) Ct + Wbo + Rbo)
auto xt_wo = prog.insert_instruction(ins, op::dot{}, xt, tran_wo);
auto ht_ro = prog.insert_instruction(ins, op::dot{}, sih, tran_ro);
auto ot_before_actv = prog.insert_instruction(ins, op::add{}, xt_wo, ht_ro);
if(pph != prog.end())
{
auto ppho_cellt = prog.insert_instruction(ins, op::mul{}, ppho_brcst, cellt);
ot_before_actv = prog.insert_instruction(ins, op::add{}, ot_before_actv, ppho_cellt);
}
if(bias != prog.end())
{
ot_before_actv = prog.insert_instruction(ins, op::add{}, ot_before_actv, bo_brcst);
}
auto ot = prog.insert_instruction(ins, actv_func1, ot_before_actv);
// Ht = ot (.) h(Ct)
auto h_cellt = prog.insert_instruction(ins, actv_func3, cellt);
auto ht = prog.insert_instruction(ins, op::mul{}, ot, h_cellt);
sic = cellt;
sih = ht;
last_output = prog.insert_instruction(ins, op::unsqueeze{{0, 1}}, ht);
if(i < seq_len - 1)
{
if(i == 0)
{
hidden_states = last_output;
}
else
{
auto concat_arg0 = is_forward ? hidden_states : last_output;
auto concat_arg1 = is_forward ? last_output : hidden_states;
hidden_states =
prog.insert_instruction(ins, op::concat{0}, concat_arg0, concat_arg1);
}
}
}
last_cell_output = prog.insert_instruction(ins, op::unsqueeze{{0}}, last_cell_output);
return {hidden_states, last_output, last_cell_output};
}
std::vector<operation> rewrite_rnn::lstm_actv_funcs(instruction_ref ins) const
{
auto lstm_op = any_cast<op::lstm>(ins->get_operator());
// before rewriting the lstm operator, we need to ensure
// there are 6 actv funcs, even if the user does not
// specify any. If fewer than 6 are given, use the
// algorithm in parse_lstm to fill in 6 actv functions
const auto& actv_funcs = lstm_op.actv_funcs;
std::size_t num_actv_funcs = actv_funcs.size();
if(lstm_op.direction == op::rnn_direction::bidirectional)
{
switch(num_actv_funcs)
{
case 0:
return {op::sigmoid{}, op::tanh{}, op::tanh{}, op::sigmoid{}, op::tanh{}, op::tanh{}};
case 1:
return {actv_funcs.at(0),
actv_funcs.at(0),
actv_funcs.at(0),
actv_funcs.at(0),
actv_funcs.at(0),
actv_funcs.at(0)};
case 2:
return {actv_funcs.at(0),
actv_funcs.at(1),
actv_funcs.at(1),
actv_funcs.at(0),
actv_funcs.at(1),
actv_funcs.at(1)};
case 3:
return {actv_funcs.at(0),
actv_funcs.at(1),
actv_funcs.at(2),
actv_funcs.at(0),
actv_funcs.at(1),
actv_funcs.at(2)};
case 4:
return {actv_funcs.at(0),
actv_funcs.at(1),
actv_funcs.at(2),
actv_funcs.at(3),
actv_funcs.at(3),
actv_funcs.at(3)};
case 5:
return {actv_funcs.at(0),
actv_funcs.at(1),
actv_funcs.at(2),
actv_funcs.at(3),
actv_funcs.at(4),
actv_funcs.at(4)};
default: return actv_funcs;
}
}
else
{
switch(num_actv_funcs)
{
case 0: return {op::sigmoid{}, op::tanh{}, op::tanh{}};
case 1: return {actv_funcs.at(0), actv_funcs.at(0), actv_funcs.at(0)};
case 2: return {actv_funcs.at(0), actv_funcs.at(1), actv_funcs.at(1)};
default: return actv_funcs;
}
}
}
namespace op {
std::ostream& operator<<(std::ostream& os, rnn_direction v)
{
std::vector<std::string> rnn_direction_str = {"forward", "reverse", "bidirectional"};
os << rnn_direction_str[static_cast<std::underlying_type<rnn_direction>::type>(v)];
return os;
}
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/schedule.hpp>
#include <migraphx/program.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/op/identity.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/functional.hpp>
#include <migraphx/ranges.hpp>
#include <unordered_map>
#include <unordered_set>
#include <set>
#include <deque>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
auto get_inputs()
{
return [](auto i) { return i->inputs(); };
}
auto get_outputs()
{
return [](auto i) { return i->outputs(); };
}
struct stream_info
{
std::unordered_map<instruction_ref, std::size_t> ins2stream;
std::unordered_map<instruction_ref, std::size_t> weights;
std::unordered_map<instruction_ref, std::size_t> iweights;
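// Accumulate, for every instruction, its own weight (iweights) plus the
// weights of all of its transitive inputs; the fix combinator memoizes
// results in `weights` so each instruction is computed only once.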
void accumulate_weights(instruction_ref last, const schedule_model& model)
{
fix<std::size_t>([&](auto self, auto ins) -> std::size_t {
if(not contains(weights, ins))
{
std::size_t weight = 0;
auto&& op = ins->get_operator();
if(not is_context_free(op) and op.name()[0] != '@')
weight = model.weight(op);
iweights[ins] = weight;
weights[ins] =
std::accumulate(ins->inputs().begin(),
ins->inputs().end(),
weight,
[&](std::size_t w, instruction_ref i) { return w + self(i); });
}
return weights[ins];
})(last);
}
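// Sort the arguments by descending weight (ties broken by number of inputs)
// and return an iterator to the first argument whose accumulated weight is at
// or below the partition threshold; the heavier arguments before it are
// candidates for their own partitions.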
std::vector<instruction_ref>::iterator sort_args(std::vector<instruction_ref>& args)
{
if(args.size() < 2)
{
return args.end();
}
const std::size_t min_partition_threshold = 2;
auto compare = by(std::greater<>{}, [&](auto x) {
return std::make_tuple(this->weights[x], x->inputs().size());
});
std::sort(args.begin(), args.end(), compare);
auto it = std::lower_bound(std::next(args.begin()),
args.end(),
min_partition_threshold,
[&](auto i, std::size_t w) { return this->weights[i] > w; });
assert(it == args.end() or this->weights[*it] <= min_partition_threshold);
assert(it == args.end() or std::prev(it) == args.begin() or
this->weights[*std::prev(it)] > min_partition_threshold);
return it;
}
struct partition
{
std::size_t weight = 0;
std::vector<instruction_ref> instructions{};
void add(instruction_ref ins, std::size_t w)
{
weight += w;
instructions.push_back(ins);
}
};
void assign_streams(program& p, std::size_t n)
{
partition critical;
std::unordered_map<instruction_ref, std::deque<partition>> partitions;
partitions.reserve(weights.size());
fix([&](auto self, auto ins, auto& part) {
assert(ins != p.end());
if(contains(partitions, ins))
return;
assert(p.has_instruction(ins));
// Add an entry so we know the instruction was visited
partitions[ins];
part.add(ins, this->iweights[ins]);
auto args = ins->inputs();
auto threshold_it = this->sort_args(args);
if(not args.empty())
{
assert(threshold_it != args.begin());
self(args.front(), part);
for(auto i : range(std::next(args.begin()), threshold_it))
{
partitions[ins].emplace_back();
self(i, partitions[ins].back());
}
for(auto i : range(threshold_it, args.end()))
{
self(i, part);
}
}
// Sort instructions
p.move_instruction(ins, p.end());
})(std::prev(p.end()), critical);
// Set the critical partition to stream 0
set_stream(critical, 0);
std::vector<std::size_t> streams(n - 1);
// Assign streams for the other partitions
for(auto&& ins_part : partitions)
{
std::sort(
ins_part.second.begin(), ins_part.second.end(), by(std::greater<>{}, [](auto&& x) {
return std::make_tuple(x.weight, x.instructions.size());
}));
for(auto&& part : ins_part.second)
{
auto stream = std::min_element(streams.begin(), streams.end()) - streams.begin();
set_stream(part, stream + 1);
streams[stream] += part.weight;
}
}
}
void set_stream(const partition& p, std::size_t n)
{
for(auto ins : p.instructions)
if(iweights[ins] > 0)
set_stream(ins, n);
}
void set_stream(instruction_ref ins, std::size_t n)
{
assert(iweights[ins] > 0);
ins2stream[ins] = n;
}
std::size_t get_stream(instruction_ref ins) const { return ins2stream.at(ins); }
bool has_stream(instruction_ref ins) const { return contains(ins2stream, ins); }
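// different(f) returns true if the stream ids produced by f are not all the
// same, i.e. more than one stream is involved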
template <class F>
bool different(F f, std::size_t stream) const
{
bool result = false;
f([&](auto s) {
if(s != stream)
{
result = true;
return false;
}
// cppcheck-suppress uselessAssignmentArg
stream = s;
return true;
});
return result;
}
template <class F>
bool different(F f) const
{
bool result = false;
f([&](auto s) {
result = this->different(f, s);
return false;
});
return result;
}
template <class Selector>
auto get_streams_from(instruction_ref start, Selector select) const
{
return [=](auto f) {
return fix<bool>([&](auto self, auto ins) {
for(auto i : select(ins))
{
if(iweights.at(i) == 0)
{
if(not self(i))
return false;
}
else
{
if(not f(this->get_stream(i)))
return false;
}
}
return true;
})(start);
};
}
std::unordered_set<std::size_t> get_streams(instruction_ref ins) const
{
if(has_stream(ins))
return {get_stream(ins)};
std::unordered_set<std::size_t> result;
get_streams_from(ins, get_inputs())([&](auto s) {
result.insert(s);
return true;
});
return result;
}
template <class... Ts>
bool is_merge_point(instruction_ref ins, Ts... xs) const
{
return different(get_streams_from(ins, get_inputs()), xs...);
}
template <class... Ts>
bool is_split_point(instruction_ref ins, Ts... xs) const
{
return different(get_streams_from(ins, get_outputs()), xs...);
}
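// For each input stream, keep the instruction on that stream that executes
// last (smallest distance to `start`); these are the instructions whose
// events must be recorded before `start` can run.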
std::vector<instruction_ref> get_recorded_instructions(instruction_ref start)
{
std::vector<instruction_ref> result;
std::unordered_map<std::size_t, instruction_ref> m;
fix([&](auto self, auto ins) {
for(auto i : ins->inputs())
{
if(iweights.at(i) == 0)
{
self(i);
continue;
}
auto stream = this->get_stream(i);
if(not contains(m, stream))
m[stream] = i;
else
m[stream] = std::min(m[stream], i, by(std::less<>{}, [&](auto x) {
return std::distance(x, start);
}));
}
})(start);
std::transform(
m.begin(), m.end(), std::back_inserter(result), [](auto&& p) { return p.second; });
return result;
}
std::unordered_map<instruction_ref, std::vector<std::vector<instruction_ref>>>
find_concurrent_instructions(program& p)
{
std::unordered_map<instruction_ref, std::vector<std::vector<instruction_ref>>> result;
std::unordered_map<instruction_ref, std::unordered_set<instruction_ref>> merge_from;
result.reserve(p.size());
merge_from.reserve(p.size());
for(auto ins : reverse_iterator_for(p))
{
for(auto&& arg : ins->outputs())
{
if(is_merge_point(arg))
merge_from[ins].insert(arg);
merge_from[ins].insert(merge_from[arg].begin(), merge_from[arg].end());
}
auto streams = this->get_streams(ins);
// Collect concurrent instructions for each merge point.
for(auto& merge : merge_from[ins])
{
for(auto stream : streams)
{
if(result[merge].size() <= stream)
result[merge].resize(stream + 1);
auto&& r = result[merge][stream];
r.push_back(ins);
// Copy inputs if they don't have a stream (and are not a builtin and not
// context-free). Inputs without a stream can have an implicit dependency
std::copy_if(ins->inputs().begin(),
ins->inputs().end(),
std::back_inserter(r),
[&](auto x) {
return not this->has_stream(x) and
not is_context_free(x->get_operator()) and
x->name().front() != '@';
});
}
}
}
return result;
}
std::unordered_map<instruction_ref, std::unordered_set<instruction_ref>>
get_conflicts(program& p)
{
std::unordered_map<instruction_ref, std::unordered_set<instruction_ref>> conflict_table;
auto concur_ins = this->find_concurrent_instructions(p);
for(auto&& merge : concur_ins)
{
dfor(merge.second.size(), merge.second.size())([&](auto i, auto j) {
if(i == j)
return;
for(auto ins1 : merge.second[i])
{
auto p1 = std::distance(ins1, merge.first);
for(auto ins2 : merge.second[j])
{
if(ins1 == ins2)
continue;
auto p2 = std::distance(ins2, merge.first);
// The smaller distance means the instruction occurs later
if(p1 > p2)
conflict_table[ins2].insert(ins1);
else
conflict_table[ins1].insert(ins2);
}
}
});
}
// Remove duplicates
for(auto&& ip : conflict_table)
{
auto ins1 = ip.first;
for(auto ins2 : ip.second)
if(contains(conflict_table[ins2], ins1))
conflict_table[ins2].erase(ins1);
}
return conflict_table;
}
};
void schedule::apply(program& p) const
{
if(not enable)
return;
stream_info si;
auto last = std::prev(p.end());
si.accumulate_weights(last, model);
si.assign_streams(p, model.concurrency());
if(enabled(MIGRAPHX_TRACE_COMPILE{}))
{
p.annotate(std::cout, [&](auto ins) {
std::cout << ":";
std::cout << " weight=" << si.weights.at(ins);
std::cout << " input={";
si.get_streams_from(ins, get_inputs())([&](auto s) {
std::cout << s << ",";
return true;
});
std::cout << "}";
if(si.has_stream(ins))
std::cout << " stream=" << si.get_stream(ins);
});
std::cout << std::endl;
}
// Schedule instructions
std::size_t wait_id = 0;
std::unordered_map<instruction_ref, std::size_t> ins2wait;
std::unordered_map<std::size_t, std::unordered_set<std::size_t>> waited_for;
std::unordered_map<instruction_ref, std::unordered_set<std::size_t>> ins2waited;
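// ins2wait maps a recorded instruction to its event id; waited_for tracks,
// per stream, which events have already been waited on; ins2waited stores
// the events waited on before a split point so later merge points can skip
// redundant waits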
ins2wait.reserve(p.size());
ins2waited.reserve(p.size());
for(auto ins : iterator_for(p))
{
// Only schedule instructions that have a stream
if(not si.has_stream(ins))
continue;
assert(si.weights[ins] > 0);
// Schedule instruction on the stream
auto stream = si.get_stream(ins);
assert(stream < model.concurrency());
model.sched(p, ins, stream);
// Insert wait instructions
if(si.is_merge_point(ins, stream))
{
for(auto i : si.get_recorded_instructions(ins))
{
if(not si.has_stream(i))
continue;
auto istream = si.get_stream(i);
if(stream == istream)
continue;
// Create a new event if it hasn't been recorded
if(not contains(ins2wait, i))
{
ins2wait[i] = wait_id;
model.record(p, i, wait_id);
wait_id++;
}
auto w = ins2wait.at(i);
// If we have already waited for the event on this stream, then don't
// insert another wait event
if(not contains(waited_for[stream], w))
model.wait(p, ins, w);
// Store the event as waited
waited_for[stream].insert(w);
// Store all wait events that have been waited on prior to the recorded instruction
waited_for[stream].insert(ins2waited[i].begin(), ins2waited[i].end());
}
}
// Store wait events that have already been waited on
if(si.is_split_point(ins, stream))
{
ins2waited[ins] = waited_for[stream];
}
}
// Add memory conflicts
auto conflict_table = si.get_conflicts(p);
for(auto&& ip : conflict_table)
{
if(ip.second.empty())
continue;
std::vector<instruction_ref> args;
args.push_back(ip.first);
args.insert(args.end(), ip.second.begin(), ip.second.end());
p.insert_instruction(std::next(ip.first), op::identity{}, args);
}
}
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -19,7 +19,7 @@ struct shape_impl
shape_impl() : m_type(shape::float_type), m_standard(false) {}
shape_impl(shape::type_t t) : m_type(t), m_lens({1}), m_strides({1}), m_standard(true) {}
shape_impl(shape::type_t t) : m_type(t), m_lens({1}), m_strides({0}), m_standard(true) {}
shape_impl(shape::type_t t, std::vector<std::size_t> l)
: m_type(t), m_lens(std::move(l)), m_standard(true)
{
......
#include <migraphx/simplify_algebra.hpp>
#include <migraphx/program.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/op/add.hpp>
#include <migraphx/matcher.hpp>
#include <migraphx/literal.hpp>
......
#include <migraphx/simplify_reshapes.hpp>
#include <migraphx/program.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/op/as_shape.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/ranges.hpp>
#include <unordered_set>
......@@ -14,7 +14,9 @@ bool is_reshaper(instruction_ref ins)
// clang-format off
static const std::unordered_set<std::string> names = {
"reshape",
"contiguous"
"contiguous",
"squeeze",
"unsqueeze"
};
// clang-format on
return contains(names, ins->name());
......@@ -45,6 +47,9 @@ void simplify_reshapes::apply(program& p) const
auto end = std::prev(p.end());
for(auto ins : iterator_for(p))
{
if(ins == end and ins->name() == "contiguous")
continue;
// Skip possible dead instructions
if(ins->outputs().empty() and ins != end)
continue;
if(is_reshaper(ins))
......@@ -94,13 +99,6 @@ void simplify_reshapes::apply(program& p) const
p.replace_instruction(ins, t->inputs().front());
}
}
// Replace all reshapes with as_shape
for(auto ins : iterator_for(p))
{
if(ins->name() != "reshape")
continue;
p.replace_instruction(ins, op::as_shape{ins->get_shape()}, ins->inputs());
}
}
} // namespace MIGRAPHX_INLINE_NS
......
#include <migraphx/cpu/gemm.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/requires.hpp>
#include <migraphx/shape_for_each.hpp>
#include <blaze/math/CustomMatrix.h>
namespace migraphx {
......@@ -14,10 +15,13 @@ template <class T>
static auto make_mat(tensor_view<T> x)
{
const auto& s = x.get_shape();
assert(s.lens().size() == 2);
// assert(s.lens().size() == 2);
std::size_t n_dims = s.lens().size();
std::size_t dim_0 = n_dims - 2;
std::size_t dim_1 = n_dims - 1;
if(s.transposed())
return matrix<T>{x.data(), s.lens()[1], s.lens()[0], s.strides()[1]};
return matrix<T>{x.data(), s.lens()[0], s.lens()[1], s.strides()[0]};
return matrix<T>{x.data(), s.lens()[dim_1], s.lens()[dim_0], s.strides()[dim_1]};
return matrix<T>{x.data(), s.lens()[dim_0], s.lens()[dim_1], s.strides()[dim_0]};
}
template <class T, class F>
......@@ -51,7 +55,13 @@ void migemm_impl(tensor_view<T> cmat,
visit_mat(amat, [&](const auto& a) {
visit_mat(bmat, [&](const auto& b) {
auto c = make_mat(cmat);
c = (a * b) * alpha + beta * c;
c = beta * c;
// This is a simple optimization to avoid
// computing A * B if alpha is 0.0
if(alpha != 0.0)
{
c = c + alpha * a * b;
}
});
});
}
......@@ -64,18 +74,24 @@ void migemm_impl(tensor_view<T> cmat,
float beta,
std::false_type)
{
auto m = cmat.get_shape().lens()[0];
auto n = cmat.get_shape().lens()[1];
auto k = amat.get_shape().lens()[1];
std::size_t n_dims = cmat.get_shape().lens().size();
std::size_t dim_0 = n_dims - 2;
std::size_t dim_1 = n_dims - 1;
auto k = amat.get_shape().lens()[dim_1];
assert(amat.get_shape().lens()[1] == bmat.get_shape().lens()[0]);
assert(m == amat.get_shape().lens()[0]);
assert(n == bmat.get_shape().lens()[1]);
assert(amat.get_shape().lens()[dim_1] == bmat.get_shape().lens()[dim_0]);
assert(cmat.get_shape().lens()[dim_0] == amat.get_shape().lens()[dim_0]);
assert(cmat.get_shape().lens()[dim_1] == bmat.get_shape().lens()[dim_1]);
dfor(m, n)([&](auto ii, auto jj) {
double s = cmat(ii, jj) * beta;
dfor(k)([&](auto kk) { s += amat(ii, kk) * bmat(kk, jj); });
cmat(ii, jj) = alpha * s;
shape_for_each(cmat.get_shape(), [&](const auto& c_idx) {
auto a_idx = c_idx;
auto b_idx = c_idx;
double s = 0.0;
dfor(k)([&](auto kk) {
a_idx[dim_1] = b_idx[dim_0] = kk;
s += amat(a_idx.begin(), a_idx.end()) * bmat(b_idx.begin(), b_idx.end());
});
cmat(c_idx.begin(), c_idx.end()) = alpha * s + cmat(c_idx.begin(), c_idx.end()) * beta;
});
}
......@@ -83,7 +99,18 @@ template <class T>
void migemm_impl(
tensor_view<T> cmat, tensor_view<T> amat, tensor_view<T> bmat, float alpha, float beta)
{
migemm_impl(cmat, amat, bmat, alpha, beta, is_fast_gemm_type<T>{});
auto lens = amat.get_shape().lens();
bool batch_mul =
std::accumulate(
lens.rbegin() + 2, lens.rend(), std::size_t{1}, std::multiplies<std::size_t>()) == 1;
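// when all leading (batch) dimensions are 1 this is a plain 2-D gemm, so the
// fast path can be used; otherwise fall back to the generic implementation
// that iterates over the batched shape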
if(batch_mul)
{
migemm_impl(cmat, amat, bmat, alpha, beta, is_fast_gemm_type<T>{});
}
else
{
migemm_impl(cmat, amat, bmat, alpha, beta, std::false_type{});
}
}
void migemm(
......
......@@ -7,6 +7,7 @@
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct pass;
namespace cpu {
struct target
......
......@@ -48,6 +48,12 @@ struct cpu_batch_norm_inference
{
op::batch_norm_inference op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::batch_norm_inference"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
......@@ -75,10 +81,10 @@ struct cpu_batch_norm_inference
par_dfor(num_batch, num_channels, image_height, image_width)(
[&](std::size_t n, std::size_t c, std::size_t h, std::size_t w) {
assert((variance(c) + epsilon) > 0);
result(n, c, h, w) = gamma(c) * (buffer(n, c, h, w) - mean(c)) /
std::sqrt(variance(c) + epsilon) +
bias(c);
assert((variance[c] + epsilon) > 0);
result(n, c, h, w) = gamma[c] * (buffer(n, c, h, w) - mean[c]) /
std::sqrt(variance[c] + epsilon) +
bias[c];
});
});
}
......@@ -107,6 +113,12 @@ struct cpu_lrn
{
op::lrn op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::lrn"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
......@@ -117,7 +129,7 @@ struct cpu_lrn
int channels = output_shape.lens()[1];
int height = output_shape.lens()[2];
int width = output_shape.lens()[3];
float alphaoverarea = op.alpha / op.size;
float alphaoverarea = op.alpha / float(op.size);
int radius = (op.size - 1) / 2;
par_dfor(n_batch, height, width)([&](int b, int h, int w) {
......@@ -144,6 +156,12 @@ struct cpu_convolution
{
op::convolution op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::convolution"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
......@@ -165,15 +183,15 @@ struct cpu_convolution
output_shape.lens()[2],
output_shape.lens()[3])(
[&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
const int start_x = i * op.stride[0] - op.padding[0];
const int start_y = j * op.stride[1] - op.padding[1];
const int group_id = w / (wei_n / op.group);
const auto start_x = i * op.stride[0] - op.padding[0];
const auto start_y = j * op.stride[1] - op.padding[1];
const auto group_id = w / (wei_n / op.group);
double acc = 0;
dfor(wei_c, wei_h, wei_w)([&](std::size_t k, std::size_t x, std::size_t y) {
const int in_x = start_x + x;
const int in_y = start_y + y;
const int in_ch = group_id * wei_c + k;
const auto in_x = start_x + x;
const auto in_y = start_y + y;
const auto in_ch = group_id * wei_c + k;
if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
{
acc += input(o, in_ch, in_x, in_y) * weights(w, k, x, y);
......@@ -190,6 +208,12 @@ struct cpu_im2col
{
op::im2col op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
static std::string name() { return "cpu::im2col"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
......@@ -209,10 +233,8 @@ struct cpu_im2col
const std::size_t& stride_h = op.stride[0];
const std::size_t& stride_w = op.stride[1];
int kdiv2_h;
int kdiv2_w;
kdiv2_h = kernel_h / 2;
kdiv2_w = kernel_w / 2;
auto kdiv2_h = kernel_h / 2;
auto kdiv2_w = kernel_w / 2;
// calculate output sizes
const std::size_t col_height = (height - kernel_h + 2 * pad_h) / stride_h + 1;
const std::size_t col_width = (width - kernel_w + 2 * pad_w) / stride_w + 1;
......@@ -230,8 +252,8 @@ struct cpu_im2col
dfor(channels,
kernel_h,
kernel_w)([&](std::size_t c, std::size_t koffset, std::size_t loffset) {
int idx = iinput + koffset - kdiv2_h;
int jdx = jinput + loffset - kdiv2_w;
auto idx = iinput + koffset - kdiv2_h;
auto jdx = jinput + loffset - kdiv2_w;
col(ldx, p) = ((idx >= 0) && (idx < height) && (jdx >= 0) && (jdx < width))
? input(0, c, idx, jdx)
: 0;
......@@ -273,6 +295,12 @@ struct cpu_pooling
{
op::pooling op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::pooling_" + Op::name(); }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
......@@ -317,20 +345,35 @@ struct cpu_pooling
}
};
struct cpu_contiguous
{
    op::contiguous op;
    std::string name() const { return "cpu::contiguous"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        return op.compute(output_shape, std::move(args));
    }
};

struct cpu_op
{
    operation op;
    std::string name() const { return "cpu::" + op.name(); }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, const shape& output_shape, const std::vector<argument>& args) const
    {
        return op.compute(output_shape, args);
    }
    friend bool operator==(const cpu_op& x, const cpu_op& y) { return x.op == y.op; }
    friend bool operator==(const cpu_op& x, const operation& y)
    {
        if(x.name() != y.name())
            return false;
        return x == any_cast<cpu_op>(y);
    }
    friend bool operator==(const operation& x, const cpu_op& y) { return y == x; }
};
struct cpu_pad
{
op::pad op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::contiguous"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
......@@ -354,184 +397,54 @@ struct cpu_pad
}
};
struct cpu_concat
{
op::concat op;
std::string name() const { return "cpu::concat"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
return op.compute(output_shape, std::move(args));
}
};
struct cpu_gemm
{
    op::dot op;
    std::string name() const { return "cpu::dot"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        migemm(result, args[0], args[1], op.alpha, op.beta);
        return result;
    }
};

struct cpu_gather
{
    op::gather op;
    std::string name() const { return "cpu::gather"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        return op.compute(output_shape, std::move(args));
    }
};

struct identity_op
{
    std::string name() const { return "cpu::identity"; }
    auto fcn() const
    {
        return [](auto x) { return x; };
    }
};

struct abs_op
{
    std::string name() const { return "cpu::abs"; }
    auto fcn() const
    {
        return [](auto x) { return std::abs(make_signed(x)); };
    }
};

struct exp_op
{
    std::string name() const { return "cpu::exp"; }
    auto fcn() const
    {
        return [](auto x) { return std::exp(x); };
    }
};

struct log_op
{
    std::string name() const { return "cpu::log"; }
    auto fcn() const
    {
        return [](auto x) { return std::log(x); };
    }
};

struct sin_op
{
    std::string name() const { return "cpu::sin"; }
    auto fcn() const
    {
        return [](auto x) { return std::sin(x); };
    }
};

struct cos_op
{
    std::string name() const { return "cpu::cos"; }
    auto fcn() const
    {
        return [](auto x) { return std::cos(x); };
    }
};

struct tan_op
{
    std::string name() const { return "cpu::tan"; }
    auto fcn() const
    {
        return [](auto x) { return std::tan(x); };
    }
};

struct asin_op
{
    std::string name() const { return "cpu::asin"; }
    auto fcn() const
    {
        return [](auto x) { return std::asin(x); };
    }
};

struct acos_op
{
    std::string name() const { return "cpu::acos"; }
    auto fcn() const
    {
        return [](auto x) { return std::acos(x); };
    }
};

struct atan_op
{
    std::string name() const { return "cpu::atan"; }
    auto fcn() const
    {
        return [](auto x) { return std::atan(x); };
    }
};

struct sinh_op
{
    std::string name() const { return "cpu::sinh"; }
    auto fcn() const
    {
        return [](auto x) { return std::sinh(x); };
    }
};

struct cosh_op
{
    std::string name() const { return "cpu::cosh"; }
    auto fcn() const
    {
        return [](auto x) { return std::cosh(x); };
    }
};

struct tanh_op
{
    std::string name() const { return "cpu::tanh"; }
    auto fcn() const
    {
        return [](auto x) { return std::tanh(x); };
    }
};

struct sigmoid_op
{
    std::string name() const { return "cpu::sigmoid"; }
    auto fcn() const
    {
        return [](auto x) { return 1.f / (1.f + std::exp(-x)); };
    }
};

struct neg_op
{
    std::string name() const { return "cpu::neg"; }
    auto fcn() const
    {
        return [](auto x) { return -x; };
    }
};

struct relu_op
{
    std::string name() const { return "cpu::relu"; }
    auto fcn() const
    {
        return [](auto x) { return std::max(decltype(x){0}, x); };
    }
};

struct cpu_gemm
{
    op::dot op;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }

    std::string name() const { return "cpu::dot"; }
    shape compute_shape(const std::vector<shape>& inputs) const
    {
        if(inputs.size() == 3)
        {
            auto c_shape = inputs.at(2);
            check_shapes{{c_shape}}.not_broadcasted();
        }
        return op.compute_shape(inputs);
    }

    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        // with 3 inputs the operation is alpha * A * B + beta * C, where
        // A and B are matrices, and C is broadcastable to A * B
        if(args.size() == 3)
        {
            // when beta is 0, the value of args[2] does not matter
            if(op.beta == 0.0f)
            {
                result.visit([&](auto output) { std::fill(output.begin(), output.end(), 0); });
            }
            else
            {
                visit_all(result, args[2])([&](auto output, auto input) {
                    std::copy(input.begin(), input.end(), output.begin());
                });
            }

            migemm(result, args[0], args[1], op.alpha, op.beta);
            return result;
        }

        // 2 input arguments
        migemm(result, args[0], args[1], op.alpha, 0.0f);
        return result;
    }
};
......@@ -561,16 +474,45 @@ template <typename Op>
struct cpu_unary
{
Op op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op.op, f);
}
std::string name() const { return op.name(); }
shape compute_shape(const std::vector<shape>& inputs) const { return inputs.front(); }
shape compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs}.has(1);
auto s = inputs.at(0);
if(s.packed())
{
return s;
}
else
{
return {s.type(), s.lens()};
}
}
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
result.visit([&](auto output) {
args[0].visit([&](auto input) {
std::transform(input.begin(), input.end(), output.begin(), op.fcn());
if(input.get_shape().standard())
{
std::transform(input.begin(), input.end(), output.begin(), op.fcn());
}
else
{
shape_for_each(output.get_shape(), [&](const auto& idx) {
output(idx.begin(), idx.end()) = op.fcn()(input(idx.begin(), idx.end()));
});
}
});
});
return result;
}
};
......@@ -590,20 +532,20 @@ struct softmax2d
auto nw = input.get_shape().lens()[3];
dfor(nb, nh, nw)([&](std::size_t b, std::size_t i, std::size_t j) {
value_type cmax = std::numeric_limits<value_type>::lowest();
for(int c = 0; c < nc; c++)
for(std::size_t c = 0; c < nc; c++)
{
cmax = std::max(cmax, input(b, c, i, j));
}
for(int c = 0; c < nc; c++)
for(std::size_t c = 0; c < nc; c++)
{
output(b, c, i, j) = std::exp(input(b, c, i, j) - cmax);
}
value_type sum = value_type(0);
for(int c = 0; c < nc; c++)
for(std::size_t c = 0; c < nc; c++)
{
sum += output(b, c, i, j);
}
for(int c = 0; c < nc; c++)
for(std::size_t c = 0; c < nc; c++)
{
output(b, c, i, j) = output(b, c, i, j) / sum;
}
......@@ -613,83 +555,78 @@ struct softmax2d
}
};
struct add_op
{
std::string name() const { return "add"; }
auto fcn() const
{
return [](auto x, auto y) { return x + y; };
}
};
struct sub_op
{
std::string name() const { return "sub"; }
auto fcn() const
{
return [](auto x, auto y) { return x - y; };
}
};
struct mul_op
{
    std::string name() const { return "mul"; }
    auto fcn() const
    {
        return [](auto x, auto y) { return x * y; };
    }
};

struct div_op
{
    std::string name() const { return "div"; }
    auto fcn() const
    {
        return [](auto x, auto y) { return x / y; };
    }
};

struct max_op
{
    std::string name() const { return "max"; }
    auto fcn() const
    {
        return [](auto x, auto y) { return std::max(x, y); };
    }
};

struct min_op
{
    std::string name() const { return "min"; }
    auto fcn() const
    {
        return [](auto x, auto y) { return std::min(x, y); };
    }
};

template <typename Op>
struct cpu_binary
{
    Op op;
    std::string name() const { return op.name(); }
    shape compute_shape(const std::vector<shape>& inputs) const { return inputs.front(); }
    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        visit_all(result, args[0], args[1])([&](auto output, auto input1, auto input2) {
            if(input1.get_shape().packed() and input2.get_shape().packed())
            {
                std::transform(
                    input1.begin(), input1.end(), input2.begin(), output.begin(), op.fcn());
            }
            else
            {
                shape_for_each(output.get_shape(), [&](const auto& idx) {
                    output(idx.begin(), idx.end()) =
                        op.fcn()(input1(idx.begin(), idx.end()), input2(idx.begin(), idx.end()));
                });
            }
        });
        return result;
    }
};

struct cpu_logsoftmax
{
    op::logsoftmax op;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }

    std::string name() const { return "cpu::logsoftmax"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }

    template <typename T>
    std::size_t compute_batch_index(const T& idx, shape& batch_shape, int axis) const
    {
        if(axis == 0)
        {
            return 0;
        }
        else
        {
            std::vector<std::size_t> batch_idx(idx.begin(), idx.begin() + axis);
            return batch_shape.index(batch_idx.begin(), batch_idx.end());
        }
    }

    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        auto lens = output_shape.lens();
        std::vector<std::size_t> batch_lens{};
        if(op.axis == 0)
        {
            batch_lens.push_back(1);
        }
        else
        {
            batch_lens.insert(batch_lens.begin(), lens.begin(), lens.begin() + op.axis);
        }
        shape batch_shape{migraphx::shape::uint32_type, batch_lens};
        visit_all(result, args[0])([&](auto output, auto input) {
            using value_type = typename decltype(input)::value_type;
            // subtract the per-batch max before exponentiating to keep the
            // exp/log computation numerically stable
            std::vector<value_type> batch_max(batch_shape.elements(),
                                              std::numeric_limits<value_type>::lowest());
            shape_for_each(output_shape, [&](auto idx) {
                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
                batch_max[index] = std::max(batch_max[index], input(idx.begin(), idx.end()));
            });

            shape_for_each(output_shape, [&](auto idx) {
                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
                output(idx.begin(), idx.end()) = input(idx.begin(), idx.end()) - batch_max[index];
            });

            std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
            shape_for_each(output_shape, [&](auto idx) {
                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
                batch_sum[index] += std::exp(output(idx.begin(), idx.end()));
            });

            for(std::size_t i = 0; i < batch_sum.size(); ++i)
            {
                batch_sum[i] = std::log(batch_sum[i]);
            }

            shape_for_each(output_shape, [&](auto idx) {
                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
                output(idx.begin(), idx.end()) -= batch_sum[index];
            });
        });
        return result;
    }
};
......@@ -713,42 +650,17 @@ struct cpu_apply
void init()
{
apply_map["im2col"] = extend_op<cpu_im2col, op::im2col>();
apply_map["convolution"] = extend_op<cpu_convolution, op::convolution>();
apply_map["dot"] = extend_op<cpu_gemm, op::dot>();
apply_map["batch_norm_inference"] =
extend_op<cpu_batch_norm_inference, op::batch_norm_inference>();
apply_map["lrn"] = extend_op<cpu_lrn, op::lrn>();
apply_map["contiguous"] = extend_op<cpu_contiguous, op::contiguous>();
apply_map["pad"] = extend_op<cpu_pad, op::pad>();
apply_map["concat"] = extend_op<cpu_concat, op::concat>();
apply_map["gather"] = extend_op<cpu_gather, op::gather>();
apply_map["leaky_relu"] = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
apply_map["elu"] = extend_op<cpu_unary<elu_op>, op::elu>();
apply_map["identity"] = simple_op<cpu_unary<identity_op>>();
apply_map["abs"] = simple_op<cpu_unary<abs_op>>();
apply_map["sinh"] = simple_op<cpu_unary<sinh_op>>();
apply_map["cosh"] = simple_op<cpu_unary<cosh_op>>();
apply_map["tanh"] = simple_op<cpu_unary<tanh_op>>();
apply_map["sigmoid"] = simple_op<cpu_unary<sigmoid_op>>();
apply_map["exp"] = simple_op<cpu_unary<exp_op>>();
apply_map["log"] = simple_op<cpu_unary<log_op>>();
apply_map["neg"] = simple_op<cpu_unary<neg_op>>();
apply_map["sin"] = simple_op<cpu_unary<sin_op>>();
apply_map["cos"] = simple_op<cpu_unary<cos_op>>();
apply_map["tan"] = simple_op<cpu_unary<tan_op>>();
apply_map["asin"] = simple_op<cpu_unary<asin_op>>();
apply_map["acos"] = simple_op<cpu_unary<acos_op>>();
apply_map["atan"] = simple_op<cpu_unary<atan_op>>();
apply_map["relu"] = simple_op<cpu_unary<relu_op>>();
apply_map["add"] = simple_op<cpu_binary<add_op>>();
apply_map["sub"] = simple_op<cpu_binary<sub_op>>();
apply_map["mul"] = simple_op<cpu_binary<mul_op>>();
apply_map["div"] = simple_op<cpu_binary<div_op>>();
apply_map["max"] = simple_op<cpu_binary<max_op>>();
apply_map["min"] = simple_op<cpu_binary<min_op>>();
apply_map["softmax"] = simple_op<softmax2d>();
apply_map["convolution"] = extend_op<cpu_convolution, op::convolution>();
apply_map["dot"] = extend_op<cpu_gemm, op::dot>();
apply_map["elu"] = extend_op<cpu_unary<elu_op>, op::elu>();
apply_map["im2col"] = extend_op<cpu_im2col, op::im2col>();
apply_map["leaky_relu"] = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
apply_map["logsoftmax"] = extend_op<cpu_logsoftmax, op::logsoftmax>();
apply_map["lrn"] = extend_op<cpu_lrn, op::lrn>();
apply_map["pad"] = extend_op<cpu_pad, op::pad>();
apply_map["softmax"] = simple_op<softmax2d>();
}
void apply()
......@@ -764,9 +676,18 @@ struct cpu_apply
{
apply_map.at(it->name())(it);
}
else if(is_context_free(it->get_operator()))
{
apply_cpu_op(it);
}
}
}
void apply_cpu_op(instruction_ref ins)
{
prog->replace_instruction(ins, cpu_op{ins->get_operator()}, ins->inputs());
}
template <class T>
void apply_simple_op(instruction_ref ins)
{
......
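// Illustrative sketch, not part of this commit: apply_map above is a plain
// name -> rewriter dispatch table, looked up per instruction via
// apply_map.at(it->name())(it). A minimal standalone analogue of the pattern
// (the op names and handlers here are hypothetical placeholders):
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

int main()
{
    std::unordered_map<std::string, std::function<void()>> apply_map;
    apply_map["relu"] = [] { std::cout << "lower relu\n"; };
    apply_map["add"]  = [] { std::cout << "lower add\n"; };
    for(const char* name : {"relu", "add"})
        apply_map.at(name)(); // .at() throws if an op has no registered handler
    return 0;
}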
#include <migraphx/cpu/target.hpp>
#include <migraphx/cpu/lowering.hpp>
#include <migraphx/pass.hpp>
#include <migraphx/auto_contiguous.hpp>
#include <migraphx/rewrite_rnn.hpp>
#include <migraphx/dead_code_elimination.hpp>
......@@ -13,8 +14,9 @@ std::string target::name() const { return "cpu"; }
std::vector<pass> target::get_passes(migraphx::context&) const
{
return {auto_contiguous{},
rewrite_rnn{},
return {rewrite_rnn{},
dead_code_elimination{},
auto_contiguous{},
dead_code_elimination{},
lowering{},
dead_code_elimination{}};
......
......@@ -26,11 +26,14 @@ add_library(migraphx_device
device/atan.cpp
device/add_relu.cpp
device/contiguous.cpp
device/logsoftmax.cpp
device/convert.cpp
device/mul.cpp
device/concat.cpp
device/pad.cpp
device/gather.cpp
device/sub.cpp
device/clip.cpp
)
set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
rocm_clang_tidy_check(migraphx_device)
......@@ -48,6 +51,7 @@ add_library(migraphx_gpu
pooling.cpp
convolution.cpp
softmax.cpp
logsoftmax.cpp
contiguous.cpp
concat.cpp
relu.cpp
......@@ -62,6 +66,9 @@ add_library(migraphx_gpu
pad.cpp
gather.cpp
lrn.cpp
schedule_model.cpp
adjust_allocation.cpp
clip.cpp
)
set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
rocm_clang_tidy_check(migraphx_gpu)
......
#include <migraphx/gpu/abs.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <utility>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -10,8 +7,8 @@ namespace gpu {
shape miopen_abs::compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(2).not_broadcasted();
return inputs.at(1);
check_shapes{inputs, *this}.has(2).packed();
return inputs.at(0);
}
argument miopen_abs::compute(context& ctx,
......
#include <migraphx/gpu/adjust_allocation.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/program.hpp>
#include <migraphx/iterator_for.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
void adjust_allocation::apply(program& p) const
{
for(auto ins : iterator_for(p))
{
        // skip instructions that have no inputs
if(ins->inputs().empty())
continue;
if(ins->name() == "load")
continue;
auto alias_ins = instruction::get_output_alias(ins, true);
if(alias_ins->name() == "hip::allocate")
{
            // the allocated shape differs from the instruction's actual
            // shape; reallocate and replace the previous allocation
if(alias_ins->get_shape() != ins->get_shape())
{
auto alloc_ins = p.insert_instruction(ins, hip_allocate{ins->get_shape()});
p.replace_instruction(alias_ins, alloc_ins);
}
}
}
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
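// Illustrative sketch, not part of this commit: adjust_allocation walks the
// program and, when an instruction's shape no longer matches the buffer its
// output aliases, inserts an allocation of the right shape and swaps it in.
// A toy standalone analogue (the node type and its fields are hypothetical):
#include <cassert>
#include <cstddef>
#include <vector>

struct node
{
    std::size_t needed_bytes; // what the instruction's shape now requires
    std::size_t alloc_bytes;  // what its allocation was created with
};

void adjust(std::vector<node>& nodes)
{
    for(auto& n : nodes)
        if(n.alloc_bytes != n.needed_bytes)
            n.alloc_bytes = n.needed_bytes; // stands in for insert + replace_instruction
}

int main()
{
    std::vector<node> ns = {{64, 64}, {128, 64}};
    adjust(ns);
    assert(ns[1].alloc_bytes == 128);
}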
#include <migraphx/gpu/batchnorm.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <utility>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......
#include <migraphx/gpu/clip.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/clip.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape hip_clip::compute_shape(std::vector<shape> inputs) const
{
inputs.pop_back();
return op.compute_shape(inputs);
}
argument hip_clip::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
device::clip(ctx.get_stream().get(), args.back(), args.front(), op.max_val, op.min_val);
return args.back();
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
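// Illustrative sketch, not part of this commit: hip_clip clamps every element
// to [min_val, max_val] and returns the output buffer passed as the last
// argument. The same elementwise computation on the host, for reference:
#include <algorithm>
#include <cassert>
#include <vector>

inline std::vector<float> clip_host(std::vector<float> x, float max_val, float min_val)
{
    std::transform(x.begin(), x.end(), x.begin(), [=](float v) {
        return std::min(std::max(min_val, v), max_val); // mirrors device::clip's lambda
    });
    return x;
}

int main()
{
    auto y = clip_host({-2.0f, 0.5f, 3.0f}, 1.0f, -1.0f);
    assert((y == std::vector<float>{-1.0f, 0.5f, 1.0f}));
}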
#include <migraphx/gpu/concat.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/concat.hpp>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......
#include <migraphx/gpu/contiguous.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <utility>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/contiguous.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......
#include <migraphx/gpu/convolution.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <utility>
#include <migraphx/gpu/context.hpp>
#include <migraphx/generate.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -23,19 +21,21 @@ argument miopen_convolution::compute(context& ctx,
float alpha = 1;
float beta = 0;
miopenConvolutionForward(ctx.get_stream().get_miopen(),
&alpha,
x_desc.get(),
args[0].implicit(),
w_desc.get(),
args[1].implicit(),
cd.get(),
algo,
&beta,
y_desc.get(),
args[3].implicit(),
args[2].implicit(),
args[2].get_shape().bytes());
auto status = miopenConvolutionForward(ctx.get_stream().get_miopen(),
&alpha,
x_desc.get(),
args[0].implicit(),
w_desc.get(),
args[1].implicit(),
cd.get(),
algo,
&beta,
y_desc.get(),
args[3].implicit(),
args[2].implicit(),
args[2].get_shape().bytes());
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("Running convolution failed");
return args[3];
}
......@@ -91,8 +91,11 @@ void miopen_convolution::finalize(context& ctx,
{
if(handle == ctx.get_stream().get_miopen())
return;
// TODO: Check that workspace hasn't changed
compile(ctx, output_shape, std::move(inputs));
// Check that workspace hasn't changed
auto size = inputs.at(2).bytes();
auto ws = compile(ctx, output_shape, std::move(inputs));
if(ws.bytes() > size)
MIGRAPHX_THROW("Workspace has changed during finalization.");
}
} // namespace gpu
......
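// Illustrative pattern, not part of this commit: the status check added around
// miopenConvolutionForward above can be factored into a helper so every MIOpen
// call is verified the same way. CHECK_MIOPEN is a hypothetical name:
#define CHECK_MIOPEN(call)                                  \
    do                                                      \
    {                                                       \
        const miopenStatus_t check_status_ = (call);        \
        if(check_status_ != miopenStatusSuccess)            \
            MIGRAPHX_THROW("MIOpen call failed: " #call);   \
    } while(false)

// Usage: CHECK_MIOPEN(miopenConvolutionForward(handle, ...));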
#include <migraphx/gpu/device/clip.hpp>
#include <migraphx/gpu/device/nary.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void clip(hipStream_t stream,
const argument& result,
const argument& arg1,
const float max,
const float min)
{
nary(stream, result, arg1)(
[max, min](auto x) { return std::min<decltype(x)>(std::max<decltype(x)>(min, x), max); });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx