Commit 17f4ba28 authored by Paul's avatar Paul
Browse files

Merge branch 'jit-vector-reduce' into jit-vector-softmax

parents a8a8d868 c84154b8
......@@ -70,19 +70,19 @@ struct find_reshaper
match::any_of[match::outputs()](match::name(reshaper_names())));
}
void apply(module& p, const match::matcher_result& mr) const
void apply(module& m, const match::matcher_result& mr) const
{
auto ins = mr.result;
std::vector<instruction_ref> reshapes{ins};
while(is_reshaper(reshapes.back()))
{
assert(!reshapes.back()->inputs().empty());
assert(p.has_instruction(reshapes.back()->inputs().front()));
assert(m.has_instruction(reshapes.back()->inputs().front()));
auto input = reshapes.back()->inputs().front();
reshapes.push_back(input);
}
std::pair<instruction_ref, instruction_ref> r{p.end(), p.end()};
std::pair<instruction_ref, instruction_ref> r{m.end(), m.end()};
for(auto start : iterator_for(reshapes))
{
auto last = std::find_if(reshapes.rbegin(), reshapes.rend(), [&](auto&& i) {
......@@ -96,7 +96,7 @@ struct find_reshaper
}
if(r.first != r.second)
{
p.replace_instruction(r.first, r.second);
m.replace_instruction(r.first, r.second);
}
}
};
......@@ -117,10 +117,10 @@ struct find_nop_reshapes
return match::name(reshapes)(match::same_shape(match::arg(0)));
}
void apply(module& p, const match::matcher_result& mr) const
void apply(module& m, const match::matcher_result& mr) const
{
auto ins = mr.result;
p.replace_instruction(ins, ins->inputs().front());
m.replace_instruction(ins, ins->inputs().front());
}
};
......@@ -132,7 +132,7 @@ struct find_transpose
match::skip_output(match::name("contiguous"))(match::name("transpose"))));
}
void apply(module& p, const match::matcher_result& mr) const
void apply(module& m, const match::matcher_result& mr) const
{
auto ins = mr.result;
auto x = ins;
......@@ -149,11 +149,11 @@ struct find_transpose
return;
if(is_no_transpose(dims))
{
p.replace_instruction(ins, t->inputs().front());
m.replace_instruction(ins, t->inputs().front());
}
else
{
p.replace_instruction(
m.replace_instruction(
ins, make_op("transpose", {{"permutation", dims}}), t->inputs().front());
}
}
......@@ -223,7 +223,7 @@ struct find_nested_slice
return result;
}
void apply(module& p, const match::matcher_result& mr) const
void apply(module& m, const match::matcher_result& mr) const
{
auto ins = mr.result;
auto slice = ins->inputs().front();
......@@ -241,7 +241,7 @@ struct find_nested_slice
op.starts.push_back(pp.second.first);
op.ends.push_back(pp.second.second);
}
p.replace_instruction(ins, op, input);
m.replace_instruction(ins, op, input);
}
};
......@@ -252,7 +252,7 @@ struct find_concat_transpose
return match::name("concat")(match::all_of[match::inputs()](match::transpose_shape()));
}
void apply(module& p, const match::matcher_result& mr) const
void apply(module& m, const match::matcher_result& mr) const
{
auto ins = mr.result;
auto trans_inputs = ins->inputs();
......@@ -279,14 +279,14 @@ struct find_concat_transpose
std::vector<instruction_ref> inputs;
std::transform(
ins->inputs().begin(), ins->inputs().end(), std::back_inserter(inputs), [&](auto i) {
return p.insert_instruction(
return m.insert_instruction(
ins, make_op("transpose", {{"permutation", permutation}}), i);
});
auto concat = p.insert_instruction(ins, op, inputs);
auto t = p.insert_instruction(
auto concat = m.insert_instruction(ins, op, inputs);
auto t = m.insert_instruction(
ins, make_op("transpose", {{"permutation", ipermutation}}), concat);
assert(ins->get_shape().lens() == t->get_shape().lens());
p.replace_instruction(ins, t);
m.replace_instruction(ins, t);
}
};
......@@ -303,7 +303,7 @@ struct find_nested_concat
return op.axis;
}
void apply(module& p, const match::matcher_result& mr) const
void apply(module& m, const match::matcher_result& mr) const
{
auto ins = mr.result;
auto axis = get_axis(ins);
......@@ -317,7 +317,7 @@ struct find_nested_concat
args.push_back(i);
}
})(ins->inputs());
p.replace_instruction(ins, ins->get_operator(), args);
m.replace_instruction(ins, ins->get_operator(), args);
}
};
......@@ -329,7 +329,7 @@ struct find_resize
match::args(match::name("reshape").bind("data"), match::is_constant().bind("ind")));
}
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto ins_rsp = r.instructions["data"];
......@@ -417,13 +417,13 @@ struct find_resize
}
auto in_rsp = ins_rsp->inputs().front();
auto rsp_data = p.insert_instruction(
auto rsp_data = m.insert_instruction(
ins_rsp, migraphx::make_op("reshape", {{"dims", in_dims}}), in_rsp);
auto mb_rsp = p.insert_instruction(
auto mb_rsp = m.insert_instruction(
ins_rsp, migraphx::make_op("multibroadcast", {{"out_lens", out_dims}}), rsp_data);
auto std_mb = p.insert_instruction(ins, migraphx::make_op("contiguous"), mb_rsp);
auto std_mb = m.insert_instruction(ins, migraphx::make_op("contiguous"), mb_rsp);
std::vector<int64_t> rsp_dims(out_lens.begin(), out_lens.end());
p.replace_instruction(ins, migraphx::make_op("reshape", {{"dims", rsp_dims}}), std_mb);
m.replace_instruction(ins, migraphx::make_op("reshape", {{"dims", rsp_dims}}), std_mb);
}
};
......@@ -436,7 +436,7 @@ struct find_where_op
match::is_constant().bind("ind")));
}
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto concat = r.instructions["data"];
......@@ -475,11 +475,11 @@ struct find_where_op
if(val)
{
p.replace_instruction(ins, inputs.at(0));
m.replace_instruction(ins, inputs.at(0));
}
else
{
p.replace_instruction(ins, inputs.at(1));
m.replace_instruction(ins, inputs.at(1));
}
}
};
......@@ -496,7 +496,7 @@ struct find_reshape_cont
match::any()));
}
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto ins_cont = r.instructions["cont"];
......@@ -530,11 +530,11 @@ struct find_reshape_cont
else
{
inputs.push_back(
p.insert_instruction(ins, make_op("reshape", {{"dims", dims}}), in));
m.insert_instruction(ins, make_op("reshape", {{"dims", dims}}), in));
}
}
auto out = p.insert_instruction(ins, ins->get_operator(), inputs);
p.replace_instruction(ins, make_op("reshape", {{"dims", out_dims}}), out);
auto out = m.insert_instruction(ins, ins->get_operator(), inputs);
m.replace_instruction(ins, make_op("reshape", {{"dims", out_dims}}), out);
}
};
......@@ -564,25 +564,25 @@ struct find_transpose_contiguous_reshaper_unary
match::args(match_transpose_contiguous_reshaper()));
}
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto reshaper_ins = r.instructions["reshaper_ins"];
auto trans_ins = r.instructions["trans_ins"];
auto cont_ins = r.instructions["cont_ins"];
auto unary_op_name = ins->get_operator().name();
auto unary_ins = p.insert_instruction(cont_ins, make_op(unary_op_name), trans_ins);
auto new_cont_ins = p.insert_instruction(cont_ins, make_op("contiguous"), unary_ins);
auto unary_ins = m.insert_instruction(cont_ins, make_op(unary_op_name), trans_ins);
auto new_cont_ins = m.insert_instruction(cont_ins, make_op("contiguous"), unary_ins);
// older cont and reshape are removed by deadcode elimination
p.replace_instruction(ins, reshaper_ins->get_operator(), new_cont_ins);
m.replace_instruction(ins, reshaper_ins->get_operator(), new_cont_ins);
}
};
void simplify_reshapes::apply(module& p) const
void simplify_reshapes::apply(module& m) const
{
for(int i = 0; i < 2; i++)
{
match::find_matches(p,
match::find_matches(m,
find_where_op{},
find_resize{},
find_reshape_cont{},
......@@ -594,7 +594,7 @@ void simplify_reshapes::apply(module& p) const
find_nested_slice{},
find_nested_concat{},
find_transpose_contiguous_reshaper_unary{});
dead_code_elimination{}.apply(p);
dead_code_elimination{}.apply(m);
}
}
......
......@@ -352,7 +352,7 @@ struct cpu_apply
std::transform(bind_inputs.begin(),
bind_inputs.end(),
std::back_inserter(inputs),
[&](const auto& s) { return r.instructions.at(s); });
[&](const auto& s) { return r.instructions[s]; });
inputs.push_back(this->insert_allocation(ins, ins->get_shape()));
modl->replace_instruction(ins, op, inputs);
});
......
......@@ -131,6 +131,7 @@ add_library(migraphx_gpu
clip.cpp
code_object_op.cpp
compile_ops.cpp
compile_gen.cpp
compile_hip.cpp
compile_hip_code_object.cpp
compiler.cpp
......@@ -158,6 +159,7 @@ add_library(migraphx_gpu
nonzero.cpp
pack_args.cpp
pack_int8_args.cpp
prefuse_ops.cpp
pad.cpp
pooling.cpp
quant_convolution.cpp
......
......@@ -28,30 +28,30 @@ struct hip_stream_model
bool is_wait(migraphx::instruction_ref ins) const { return ins->name() == "gpu::wait_event"; }
};
stream_model make_stream_model(const module& p)
stream_model make_stream_model(const module& m)
{
hip_stream_model m;
hip_stream_model hsm;
std::size_t stream = 0;
for(auto ins : iterator_for(p))
for(auto ins : iterator_for(m))
{
if(ins->name() == "gpu::set_stream")
{
auto v = ins->get_operator().to_value();
stream = v["stream"].to<std::size_t>();
m.max_stream = std::max(stream, m.max_stream);
auto v = ins->get_operator().to_value();
stream = v["stream"].to<std::size_t>();
hsm.max_stream = std::max(stream, hsm.max_stream);
}
if(ins->get_operator().is_context_free())
continue;
if(contains({"hip::hip_allocate_memory", "hip::hip_copy_literal", "@param"}, ins->name()))
continue;
m.ins2stream[ins] = stream;
hsm.ins2stream[ins] = stream;
}
return m;
return hsm;
}
std::vector<stream_race> analyze_streams(const module& p)
std::vector<stream_race> analyze_streams(const module& m)
{
return migraphx::analyze_streams(p, make_stream_model(p));
return migraphx::analyze_streams(m, make_stream_model(m));
}
} // namespace gpu
......
#include <migraphx/gpu/compile_gen.hpp>
#include <migraphx/shape.hpp>
#include <migraphx/permutation.hpp>
#include <migraphx/stringutils.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace gen {
static std::vector<std::size_t> vector_sizes(const std::vector<shape>& inputs)
{
// If all inputs is half then only use half2
if(std::all_of(inputs.begin(), inputs.end(), [](const auto& s) {
return s.type() == shape::half_type;
}))
return {2};
return {4, 2};
}
vectorize vectorize::elements(std::size_t axis, const std::vector<shape>& inputs)
{
auto sizes = vector_sizes(inputs);
std::vector<std::size_t> max_vec_size;
std::transform(inputs.begin(),
inputs.end(),
std::back_inserter(max_vec_size),
[&](const auto& input) -> std::size_t {
auto stride = input.strides()[axis];
auto len = input.lens()[axis];
if(stride != 0 and stride != 1)
return 1;
if(len == 1 and input.elements() > sizes.front())
return sizes.front();
auto it = std::find_if(
sizes.begin(), sizes.end(), [&](auto i) { return (len % i) == 0; });
if(it != sizes.end())
return *it;
return 1;
});
return {*std::min_element(max_vec_size.begin(), max_vec_size.end()), axis};
}
std::string vectorize::str() const
{
return "vectorize<" + to_string(size) + ", " + to_string(axis) + ">()";
}
preload preload::broadcasts(std::size_t axis, const std::vector<shape>& inputs)
{
const std::size_t max_lds_bytes = 4096;
std::vector<bool> result;
std::transform(inputs.begin(),
inputs.end(),
std::back_inserter(result),
[&](const shape& input) { return input.strides()[axis] == 0; });
auto bytes = std::inner_product(inputs.begin(),
inputs.end(),
result.begin(),
std::size_t{0},
std::plus<>{},
[](const shape& s, bool b) -> std::size_t {
if(b)
return s.bytes();
return 0;
});
if(bytes < max_lds_bytes)
return {result};
// TODO: Try to partially preload items
std::fill(result.begin(), result.end(), false);
return {result};
}
std::string preload::str() const
{
std::vector<std::string> bool_strs;
std::transform(args.begin(), std::prev(args.end()), std::back_inserter(bool_strs), [](bool b) {
if(b)
return "true";
return "false";
});
return "auto_preload<false, " + join_strings(bool_strs, ", ") + ">(idx)";
}
bool preload::is_preloading() const
{
return std::accumulate(args.begin(), args.end(), false, std::logical_or<>{});
}
std::size_t find_fast_axis(const std::vector<shape>& inputs)
{
auto permutation = find_permutation(inputs);
auto it = std::max_element(permutation.begin(), permutation.end());
return it - permutation.begin();
}
std::string make_transformer_args(std::vector<std::string> transformers)
{
return join_strings(std::move(transformers), ", ");
}
} // namespace gen
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -119,8 +119,21 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over)
};
}
std::size_t compute_block_size(std::size_t n, std::size_t max_block_size)
{
size_t block_size = 128;
while(block_size <= max_block_size and block_size <= n)
block_size *= 2;
return block_size / 2;
}
operation compile_hip_code_object(const std::string& content, hip_compile_options options)
{
assert(options.global > 0);
assert(options.local > 0);
assert(not options.inputs.empty());
assert(options.inputs.size() == options.virtual_inputs.size() or
options.virtual_inputs.empty());
std::vector<src_file> srcs;
std::transform(migraphx_kernels().begin(),
migraphx_kernels().end(),
......
......@@ -11,11 +11,11 @@ namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
void eliminate_workspace::apply(module& p) const
void eliminate_workspace::apply(module& m) const
{
std::size_t n = 0;
std::vector<instruction_ref> allocs;
for(auto ins : iterator_for(p))
for(auto ins : iterator_for(m))
{
if(ins->outputs().size() != 1)
continue;
......@@ -30,11 +30,11 @@ void eliminate_workspace::apply(module& p) const
}
if(n > 0)
{
auto ws = p.add_parameter("workspace", shape{shape::int8_type, {n}});
auto ws = m.add_parameter("workspace", shape{shape::int8_type, {n}});
for(auto&& a : allocs)
{
p.replace_instruction(a, ws);
p.remove_instruction(a);
m.replace_instruction(a, ws);
m.remove_instruction(a);
}
}
}
......
......@@ -316,7 +316,7 @@ struct find_layernorm
{
auto matcher() const { return match::layernorm(&gpu_name); }
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto x_ins = r.instructions["x"];
......@@ -331,7 +331,7 @@ struct find_layernorm
if(relements > 1024 or (relements % 4 != 0 and relements > 256))
return;
p.replace_instruction(ins, hip_layernorm{}, x_ins, args.back());
m.replace_instruction(ins, hip_layernorm{}, x_ins, args.back());
}
};
......@@ -343,11 +343,11 @@ struct find_triadd_layernorm
match::used_once(), match::all_of[match::inputs()](match::standard_shape()))));
}
void apply(module& p, const match::matcher_result& r) const
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto triadd = ins->inputs().front();
p.replace_instruction(ins, hip_triadd_layernorm{}, triadd->inputs());
m.replace_instruction(ins, hip_triadd_layernorm{}, triadd->inputs());
}
};
......@@ -355,13 +355,13 @@ struct find_gelu
{
auto matcher() const { return match::gelu_erf(&gpu_name); }
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto x_ins = r.instructions["x"];
auto args = ins->inputs();
p.replace_instruction(ins, hip_gelu{}, x_ins, args.back());
m.replace_instruction(ins, hip_gelu{}, x_ins, args.back());
}
};
......@@ -372,7 +372,7 @@ struct find_add_gelu
return match::name("gpu::gelu")(match::arg(0)(match::name("gpu::add").bind("add")));
}
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto add_ins = r.instructions["add"];
auto ins = r.result;
......@@ -381,7 +381,7 @@ struct find_add_gelu
move_broadcasted_back(args);
args.back() = ins->inputs().back();
p.replace_instruction(ins, hip_add_gelu{}, args);
m.replace_instruction(ins, hip_add_gelu{}, args);
}
};
......@@ -391,16 +391,16 @@ struct find_gelu_new
auto matcher() const { return match::gelu_tanh(&gpu_name); }
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto x_ins = r.instructions["x"];
auto args = ins->inputs();
if(fast_math)
p.replace_instruction(ins, hip_gelu{}, x_ins, args.back());
m.replace_instruction(ins, hip_gelu{}, x_ins, args.back());
else
p.replace_instruction(ins, hip_gelu_new{}, x_ins, args.back());
m.replace_instruction(ins, hip_gelu_new{}, x_ins, args.back());
}
};
......@@ -411,7 +411,7 @@ struct find_add_gelu_new
return match::name("gpu::gelu_new")(match::arg(0)(match::name("gpu::add").bind("add")));
}
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto add_ins = r.instructions["add"];
auto ins = r.result;
......@@ -420,7 +420,7 @@ struct find_add_gelu_new
move_broadcasted_back(args);
args.back() = ins->inputs().back();
p.replace_instruction(ins, hip_add_gelu_new{}, args);
m.replace_instruction(ins, hip_add_gelu_new{}, args);
}
};
......@@ -435,7 +435,7 @@ struct find_add_clip
.bind("add")));
}
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto add_ins = r.instructions["add"];
auto ins = r.result;
......@@ -448,9 +448,9 @@ struct find_add_clip
add_args.pop_back();
add_args.insert(add_args.end(), std::next(ins_args.begin()), ins_args.end());
if(add_ins->name() == "gpu::add")
p.replace_instruction(ins, hip_add_clip{}, add_args);
m.replace_instruction(ins, hip_add_clip{}, add_args);
else if(add_ins->name() == "gpu::triadd")
p.replace_instruction(ins, hip_triadd_clip{}, add_args);
m.replace_instruction(ins, hip_triadd_clip{}, add_args);
}
};
......@@ -470,7 +470,7 @@ struct find_add_unary
.bind("add")));
}
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto add_ins = r.instructions["add"];
auto ins = r.result;
......@@ -481,9 +481,9 @@ struct find_add_unary
// Use the allocation from the relu operator
args.back() = ins->inputs().back();
if(add_ins->name() == "gpu::add")
p.replace_instruction(ins, binary_add_op, args);
m.replace_instruction(ins, binary_add_op, args);
else if(add_ins->name() == "gpu::triadd")
p.replace_instruction(ins, ternary_add_op, args);
m.replace_instruction(ins, ternary_add_op, args);
}
};
......@@ -498,7 +498,7 @@ struct find_triadd
.bind("input")));
}
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto add_ins = r.instructions["add"];
auto input_ins = r.instructions["input"];
......@@ -513,7 +513,7 @@ struct find_triadd
move_broadcasted_back(args);
args.back() = ins->inputs().back();
p.replace_instruction(ins, hip_triadd{}, args);
m.replace_instruction(ins, hip_triadd{}, args);
}
};
......@@ -525,7 +525,7 @@ struct find_mul_add
match::name("gpu::mul")(match::used_once()).bind("mul"), match::any().bind("b")));
}
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto mul_ins = r.instructions["mul"];
auto b_ins = r.instructions["b"];
......@@ -538,7 +538,7 @@ struct find_mul_add
args.insert(std::prev(args.end()), b_ins);
args.back() = ins->inputs().back();
p.replace_instruction(ins, hip_mul_add{}, args);
m.replace_instruction(ins, hip_mul_add{}, args);
}
};
......@@ -550,7 +550,7 @@ struct find_mul_add_relu
match::arg(0)(match::name("gpu::mul_add")(match::used_once()).bind("mul_add")));
}
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto mul_add_ins = r.instructions["mul_add"];
auto ins = r.result;
......@@ -558,7 +558,7 @@ struct find_mul_add_relu
// Use the allocation from the relu operator
args.back() = ins->inputs().back();
p.replace_instruction(ins, hip_mul_add_relu{}, args);
m.replace_instruction(ins, hip_mul_add_relu{}, args);
}
};
......@@ -783,7 +783,7 @@ auto conv_bias(Ms... ms)
}
template <class Op>
void apply_conv_bias(context& ctx, module& p, match::matcher_result r)
void apply_conv_bias(context& ctx, module& m, const match::matcher_result& r)
{
auto conv_ins = r.instructions["conv"];
auto bias_ins = r.instructions["bias"];
......@@ -798,7 +798,7 @@ void apply_conv_bias(context& ctx, module& p, match::matcher_result r)
// TODO: Insert ws allocation
auto ws = cb.get_workspace(ctx);
(void)ws;
p.replace_instruction(ins, cb, input_ins, weights_ins, old_ws_ins, bias_ins, alloc_ins);
m.replace_instruction(ins, cb, input_ins, weights_ins, old_ws_ins, bias_ins, alloc_ins);
}
inline auto precompile_name(std::string s) // NOLINT
......@@ -829,9 +829,9 @@ struct find_conv_bias
match::output(match::name(std::unordered_set<std::string>{"gpu::relu"}))));
}
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
apply_conv_bias<miopen_conv_bias>(*ctx, p, std::move(r));
apply_conv_bias<miopen_conv_bias>(*ctx, m, r);
}
};
......@@ -840,9 +840,9 @@ struct find_conv_bias_relu
context* ctx = nullptr;
auto matcher() const { return match::name("gpu::relu")(match::arg(0)(conv_bias())); }
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
apply_conv_bias<miopen_conv_bias_relu>(*ctx, p, std::move(r));
apply_conv_bias<miopen_conv_bias_relu>(*ctx, m, r);
}
};
......@@ -857,7 +857,7 @@ struct find_conv_pointwise
fusable_conv(match::used_once()).bind("conv")));
}
void apply(module& m, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto conv_ins = r.instructions["conv"];
auto bias_ins = r.instructions["bias"];
......@@ -896,7 +896,7 @@ struct find_gemm_add
match::name("gpu::gemm")(match::nargs(3)).bind("gemm")));
}
void apply(module& p, match::matcher_result r) const
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto gemm_ins = r.instructions["gemm"];
......@@ -919,15 +919,15 @@ struct find_gemm_add
auto copy_ins = c_ins;
// Insert copy
if(ins == p.end() or c_ins->outputs().size() > 1 or c_ins->inputs().empty())
if(ins == m.end() or c_ins->outputs().size() > 1 or c_ins->inputs().empty())
{
copy_ins = p.insert_instruction(ins, hip_copy{}, c_ins, ins->inputs().back());
copy_ins = m.insert_instruction(ins, hip_copy{}, c_ins, ins->inputs().back());
}
inputs.push_back(copy_ins);
inputs.push_back(copy_ins);
gemm.beta = 1;
p.replace_instruction(ins, gemm, inputs);
m.replace_instruction(ins, gemm, inputs);
}
};
......@@ -938,22 +938,22 @@ struct find_commutative_broadcast
return match::name("gpu::add", "gpu::mul")(match::arg(1)(match::broadcast_shape()));
}
void apply(module& p, const match::matcher_result& r) const
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto args = ins->inputs();
move_broadcasted_back(args);
p.replace_instruction(ins, ins->get_operator(), args);
m.replace_instruction(ins, ins->get_operator(), args);
}
};
void fuse_ops::apply(module& p) const
void fuse_ops::apply(module& m) const
{
match::find_matches(p, find_gelu{}, find_gelu_new{fast_math});
run_passes(p, {dead_code_elimination{}});
match::find_matches(p, find_triadd{});
match::find_matches(p,
match::find_matches(m, find_gelu{}, find_gelu_new{fast_math});
run_passes(m, {dead_code_elimination{}});
match::find_matches(m, find_triadd{});
match::find_matches(m,
find_layernorm{},
find_conv_pointwise{ctx},
find_conv_bias_relu{ctx},
......@@ -966,8 +966,8 @@ void fuse_ops::apply(module& p) const
find_add_unary{"gpu::sigmoid", hip_add_sigmoid{}, hip_triadd_sigmoid{}},
find_add_unary{"gpu::tanh", hip_add_tanh{}, hip_triadd_tanh{}},
find_add_clip{});
run_passes(p, {dead_code_elimination{}});
match::find_matches(p, find_triadd_layernorm{}, find_gemm_add{}, find_commutative_broadcast{});
run_passes(m, {dead_code_elimination{}});
match::find_matches(m, find_triadd_layernorm{}, find_gemm_add{}, find_commutative_broadcast{});
}
} // namespace gpu
......
......@@ -11,7 +11,7 @@ struct module;
namespace gpu {
std::vector<stream_race> analyze_streams(const module& p);
std::vector<stream_race> analyze_streams(const module& m);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
......
#ifndef MIGRAPHX_GUARD_GPU_COMPILE_GEN_HPP
#define MIGRAPHX_GUARD_GPU_COMPILE_GEN_HPP
#include <migraphx/config.hpp>
#include <string>
#include <unordered_map>
#include <vector>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct shape;
namespace gpu {
namespace gen {
struct vectorize
{
std::size_t size = 1;
std::size_t axis = 0;
static vectorize elements(std::size_t axis, const std::vector<shape>& inputs);
std::string str() const;
};
struct preload
{
std::vector<bool> args = {};
static preload broadcasts(std::size_t axis, const std::vector<shape>& inputs);
bool is_preloading() const;
std::string str() const;
};
std::size_t find_fast_axis(const std::vector<shape>& inputs);
std::string make_transformer_args(std::vector<std::string> transformers);
template <class... Ts>
std::string make_transformer_args(Ts... xs)
{
return make_transformer_args({xs.str()...});
}
} // namespace gen
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILE_GEN_HPP
......@@ -46,6 +46,8 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over = 1);
operation compile_hip_code_object(const std::string& content, hip_compile_options options);
std::size_t compute_block_size(std::size_t n, std::size_t max_block_size = 1024);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......
......@@ -14,7 +14,7 @@ namespace gpu {
struct eliminate_workspace
{
std::string name() const { return "eliminate_workspace"; }
void apply(module& p) const;
void apply(module& m) const;
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
......
......@@ -16,7 +16,7 @@ struct fuse_ops
context* ctx = nullptr;
bool fast_math = true;
std::string name() const { return "gpu::fuse_ops"; }
void apply(module& p) const;
void apply(module& m) const;
};
} // namespace gpu
......
#ifndef MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP
#define MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP
#include <migraphx/config.hpp>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct module;
namespace gpu {
struct prefuse_ops
{
std::string name() const { return "gpu::prefuse_ops"; }
void apply(module& m) const;
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP
......@@ -17,9 +17,9 @@ struct schedule_model
{
std::size_t streams = 0;
std::size_t concurrency() const;
void sched(module& p, instruction_ref ins, std::size_t n) const;
void wait(module& p, instruction_ref ins, std::size_t wait_id) const;
void record(module& p, instruction_ref ins, std::size_t wait_id) const;
void sched(module& m, instruction_ref ins, std::size_t n) const;
void wait(module& m, instruction_ref ins, std::size_t wait_id) const;
void record(module& m, instruction_ref ins, std::size_t wait_id) const;
std::size_t weight(const operation& op) const;
};
......
......@@ -15,7 +15,7 @@ namespace gpu {
struct sync_device
{
std::string name() const { return "sync_device"; }
void apply(module& p) const;
void apply(module& m) const;
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
......
......@@ -14,7 +14,7 @@ struct write_literals
context* ctx = nullptr;
std::string name() const { return "gpu::write_literals"; }
void apply(module& p) const;
void apply(module& m) const;
};
} // namespace gpu
......
......@@ -2,6 +2,7 @@
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/gpu/compile_gen.hpp>
#include <migraphx/cpp_generator.hpp>
#include <migraphx/ranges.hpp>
......@@ -17,6 +18,8 @@ namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
using namespace migraphx::gpu::gen; // NOLINT
static const char* const pointwise_kernel = R"__migraphx__(
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/pointwise.hpp>
......@@ -30,7 +33,7 @@ extern "C" {
__global__ void kernel(${params})
{
auto idx = make_index();
pointwise(idx, auto_preload<${preloads}>(idx), vectorize<${vec_size}, ${axis}>())(${lambda}, ${args});
pointwise(idx, ${transformers})(${lambda}, ${args});
}
}
......@@ -50,75 +53,6 @@ struct pointwise_compiler : compiler<pointwise_compiler>
else
return 1;
}
static std::size_t find_fast_axis(const std::vector<shape>& inputs)
{
auto permutation = find_permutation(inputs);
auto it = std::max_element(permutation.begin(), permutation.end());
return it - permutation.begin();
}
static std::vector<bool> preload(std::size_t axis, const std::vector<shape>& inputs)
{
const std::size_t max_lds_bytes = 4096;
std::vector<bool> result;
std::transform(inputs.begin(),
inputs.end(),
std::back_inserter(result),
[&](const shape& input) { return input.strides()[axis] == 0; });
auto bytes = std::inner_product(inputs.begin(),
inputs.end(),
result.begin(),
std::size_t{0},
std::plus<>{},
[](const shape& s, bool b) -> std::size_t {
if(b)
return s.bytes();
return 0;
});
if(bytes < max_lds_bytes)
return result;
// TODO: Try to partially preload items
std::fill(result.begin(), result.end(), false);
return result;
}
static std::string preload_str(const std::vector<bool>& bs)
{
std::vector<std::string> bool_strs;
std::transform(bs.begin(), std::prev(bs.end()), std::back_inserter(bool_strs), [](bool b) {
if(b)
return "true";
return "false";
});
return "false, " + join_strings(bool_strs, ", ");
}
static std::vector<std::size_t> vector_sizes(const std::vector<shape>& inputs)
{
// If all inputs is half then only use half2
if(std::all_of(inputs.begin(), inputs.end(), [](const auto& s) {
return s.type() == shape::half_type;
}))
return {2};
return {4, 2};
}
static auto vectorize_elements(std::size_t axis, const std::vector<shape>& inputs)
{
auto sizes = vector_sizes(inputs);
std::vector<std::size_t> max_vec_size;
std::transform(inputs.begin(),
inputs.end(),
std::back_inserter(max_vec_size),
[&](const auto& input) -> std::size_t {
auto stride = input.strides()[axis];
auto len = input.lens()[axis];
if(stride != 0 and stride != 1)
return 1;
auto it = std::find_if(
sizes.begin(), sizes.end(), [&](auto i) { return (len % i) == 0; });
if(it != sizes.end())
return *it;
return 1;
});
return *std::min_element(max_vec_size.begin(), max_vec_size.end());
}
operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
{
hip_compile_options options;
......@@ -127,21 +61,18 @@ struct pointwise_compiler : compiler<pointwise_compiler>
options.virtual_inputs = reduce_dims(inputs);
options.params = "-Wno-float-equal";
auto axis = find_fast_axis(options.virtual_inputs);
auto vec_size = vectorize_elements(axis, options.virtual_inputs);
auto preloads = preload(axis, options.virtual_inputs);
auto is_preloading =
std::accumulate(preloads.begin(), preloads.end(), false, std::logical_or<>{});
options.set_launch_params(v,
compute_global_for(ctx,
options.output.elements() / vec_size,
oversubscribe_if(not is_preloading)));
auto vec = vectorize::elements(axis, options.virtual_inputs);
auto preloads = preload::broadcasts(axis, options.virtual_inputs);
options.set_launch_params(
v,
compute_global_for(ctx,
options.output.elements() / vec.size,
oversubscribe_if(not preloads.is_preloading())));
auto src = interpolate_string(pointwise_kernel,
{{"params", enum_params(inputs.size(), "void * private_p")},
{"args", enum_params(inputs.size(), "private_p")},
{"lambda", v.at("lambda").to<std::string>()},
{"vec_size", std::to_string(vec_size)},
{"axis", std::to_string(axis)},
{"preloads", preload_str(preloads)},
{"transformers", make_transformer_args(preloads, vec)},
{"preamble", v.get("preamble", std::string{})}});
return compile_hip_code_object(src, options);
}
......
......@@ -2,6 +2,7 @@
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/gpu/compile_gen.hpp>
#include <migraphx/cpp_generator.hpp>
#include <migraphx/ranges.hpp>
......@@ -16,9 +17,12 @@ namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
using namespace migraphx::gpu::gen; // NOLINT
static const char* const simple_reduce_kernel = R"__migraphx__(
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/reduce.hpp>
#include <migraphx/kernels/vectorize.hpp>
#include <args.hpp>
namespace migraphx {
......@@ -26,9 +30,10 @@ namespace migraphx {
${preamble}
extern "C" {
__global__ void kernel(void* input_p, void* output_p)
__global__ void reduce_kernel(void* input_p, void* output_p)
{
make_tensors()(input_p, output_p)([](auto input, auto output) {
transform_args(make_tensors(), ${transformers})(input_p, output_p)([](auto input, auto output) {
simple_reduce<reduce::${algo}>(${reduction}, ${init}, input, output, ${read}, ${write});
});
......@@ -40,14 +45,6 @@ __global__ void kernel(void* input_p, void* output_p)
)__migraphx__";
constexpr std::size_t compute_block_size(std::size_t n, std::size_t max_block_size = 1024)
{
size_t block_size = 128;
while(block_size <= max_block_size and block_size <= n)
block_size *= 2;
return block_size / 2;
}
static std::size_t get_reduce_elements(const std::vector<shape>& inputs)
{
return inputs.front().elements() / inputs.back().elements();
......@@ -101,32 +98,42 @@ struct reduce_compiler : compiler<reduce_compiler>
operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
{
hip_compile_options options;
auto reduce_elements = get_reduce_elements(inputs);
auto algo = v.get("algo", get_reduce_algo(inputs));
options.inputs = inputs;
options.output = inputs.back();
options.virtual_inputs = reduce_dims(inputs);
auto faxis = find_fast_axis({options.virtual_inputs.front()});
vectorize vec{};
// Vectorize if the axis is a reduction axis
if(options.virtual_inputs.back().lens()[faxis] == 1)
{
vec = vectorize::elements(faxis, options.virtual_inputs);
}
auto relements = get_reduce_elements(options.virtual_inputs) / vec.size;
auto nelements = options.virtual_inputs.back().elements();
auto algo = v.get("algo", get_reduce_algo(options.virtual_inputs));
if(algo == "block")
{
auto block_size = compute_block_size(reduce_elements, 256);
auto block_size = compute_block_size(relements, 256);
options.set_launch_params(
v, compute_global_for(ctx, inputs.back().elements() * block_size, 256), block_size);
v, compute_global_for(ctx, nelements * block_size, 256), block_size);
}
else if(algo == "lane")
{
options.set_launch_params(v, compute_global_for(ctx, inputs.back().elements(), 256));
options.set_launch_params(v, compute_global_for(ctx, nelements, 256));
}
else
{
MIGRAPHX_THROW("Unknown reduce algo: " + algo);
}
options.inputs = inputs;
options.output = inputs.back();
options.virtual_inputs = reduce_dims(inputs);
std::string identity = "[](auto x) { return x; }";
auto src = interpolate_string(simple_reduce_kernel,
options.kernel_name = "reduce_kernel";
std::string identity = "[](auto x) { return x; }";
auto src = interpolate_string(simple_reduce_kernel,
{{"reduction", v.at("reduction").to<std::string>()},
{"init", v.get("init", std::string{"0"})},
{"read", v.get("read", identity)},
{"write", v.get("write", identity)},
{"algo", algo},
{"transformers", make_transformer_args(vec)},
{"preamble", v.get("preamble", std::string{})}});
options.params += "-Wno-float-equal";
return compile_hip_code_object(src, options);
......
......@@ -59,6 +59,8 @@ void launch_kernel(hipFunction_t fun,
void* kernargs,
std::size_t size)
{
assert(global > 0);
assert(local > 0);
void* config[] = {
// HIP_LAUNCH_PARAM_* are macros that do horrible things
#ifdef MIGRAPHX_USE_CLANG_TIDY
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment