Commit e7f7ea10 authored by Alan Turner

Merge remote-tracking branch 'origin/optimize' into ck-gsg

parents 9c6ba1ed ae13eb93
...@@ -827,7 +827,7 @@ MIGRAPHX_PRED_MATCHER(horiz_conv_dot, instruction_ref ins)
};
auto dots = std::count_if(ins->outputs().begin(), ins->outputs().end(), pred("dot"));
auto convs = std::count_if(ins->outputs().begin(), ins->outputs().end(), pred("convolution"));
return (dots >= 2 or convs >= 2);
}
struct find_conv_dot_horiz_fusion
...@@ -933,6 +933,73 @@ struct find_div_const
}
};
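// Fold identity operations: x*1, x/1, x+0, and x-0 all simplify to x.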
struct find_unit_ops
{
auto matcher() const
{
auto mul_1 = match::name("mul")(
match::either_arg(0, 1)(match::has_value(1.0f), match::any().bind("x")));
auto div_1 =
match::name("div")(match::args(match::any().bind("x"), match::has_value(1.0f)));
auto add_0 = match::name("add")(
match::either_arg(0, 1)(match::has_value(0.0f, 1e-12), match::any().bind("x")));
auto sub_0 =
match::name("sub")(match::args(match::any().bind("x"), match::has_value(0.0f)));
return match::any_of(mul_1, div_1, add_0, sub_0);
}
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto c_in = r.instructions["x"];
m.replace_instruction(ins, c_in);
}
};
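// Fold negative-unit operations: x*-1, x/-1, and 0-x are rewritten as neg(x).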
struct find_neg_unit_ops
{
auto matcher() const
{
auto mul_neg_1 = match::name("mul")(
match::either_arg(0, 1)(match::has_value(-1.0f), match::any().bind("x")));
auto div_neg_1 =
match::name("div")(match::args(match::any().bind("x"), match::has_value(-1.0f)));
auto sub_0 =
match::name("sub")(match::args(match::has_value(0.0f), match::any().bind("x")));
return match::any_of(mul_neg_1, div_neg_1, sub_0);
}
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto c_in = r.instructions["x"];
auto neg = m.add_instruction(make_op("neg"), c_in);
m.replace_instruction(ins, neg);
}
};
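// Fold zero operations: multiplication by zero and a zero numerator (0/x) are replaced by the zero operand.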
struct find_zero_ops
{
auto matcher() const
{
auto mul_zero = match::name("mul")(
match::either_arg(0, 1)(match::has_value(0.0f).bind("x"), match::any()));
auto div_zero =
match::name("div")(match::args(match::has_value(0.0f).bind("x"), match::any()));
return match::any_of(mul_zero, div_zero);
}
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto zero_ins = r.instructions["x"];
m.replace_instruction(ins, zero_ins);
}
};
struct find_sub_const
{
auto matcher() const
...@@ -1149,6 +1216,9 @@ void simplify_algebra::apply(module& m) const
find_mul_conv{},
find_mul_slice_conv{},
find_mul_add{},
find_unit_ops{},
find_neg_unit_ops{},
find_zero_ops{},
find_dot_add{},
find_div_const{},
find_sub_const{},
...
...@@ -51,7 +51,18 @@ struct dnnl_binary : dnnl_op<dnnl_binary, dnnl::binary>
auto r = s0;
if(s0 != s1 or not s0.packed())
{
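// Pick an output layout compatible with both inputs: prefer the packed input,
// avoid broadcast strides, and otherwise fall back to a standard shape.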
if(s0.packed() != s1.packed())
{
r = s0.packed() ? s0 : s1;
}
else if(s0.broadcasted() != s1.broadcasted())
{
r = s0.broadcasted() ? s1.with_lens(s0.lens()) : s0.with_lens(s0.lens());
}
else
{
r = {s0.type(), s0.lens()};
}
}
// Call to get_primitive to make sure an algo is available
this->get_primitive(this->to_memory_desc(r, inputs));
...
...@@ -43,9 +43,9 @@ struct dnnl_convolution
return {MIGRAPHX_DNNL_PREFIX(ARG_SRC), MIGRAPHX_DNNL_PREFIX(ARG_WEIGHTS)};
}
shape adjust_shape(const shape& x, int i, const shape& output) const
{
auto s = base_adjust_shape(x, output);
if(i == 1 and op.group > 1)
{
// TODO: Add support for transposed weights
...
...@@ -37,9 +37,9 @@ struct dnnl_deconvolution
return {MIGRAPHX_DNNL_PREFIX(ARG_SRC), MIGRAPHX_DNNL_PREFIX(ARG_WEIGHTS)};
}
shape adjust_shape(const shape& x, int i, const shape& output) const
{
auto s = base_adjust_shape(x, output);
if(i == 1)
{
// The input and output channels are flipped for dnnl
...
...@@ -167,7 +167,7 @@ struct dnnl_op : auto_register_op<Derived>
std::iota(result.begin(), result.end(), MIGRAPHX_DNNL_PREFIX(ARG_SRC_0));
return result;
}
shape base_adjust_shape(const shape& s, const shape& output) const
{
if(s.broadcasted())
{
...@@ -183,7 +183,8 @@ struct dnnl_op : auto_register_op<Derived>
else
return len;
});
// Use the permutation of the output
return output.with_lens(s.type(), lens);
}
return s;
}
...@@ -204,7 +205,10 @@ struct dnnl_op : auto_register_op<Derived>
i++;
}
}
shape adjust_shape(const shape& s, int, const shape& output) const
{
return base_adjust_shape(s, output);
}
std::vector<int> create_arg_map(std::size_t input_size) const
{
const auto& self = static_cast<const Derived&>(*this);
...@@ -224,12 +228,12 @@ struct dnnl_op : auto_register_op<Derived>
const auto& self = static_cast<const Derived&>(*this);
std::unordered_map<int, dnnl::memory::desc> result;
result[MIGRAPHX_DNNL_PREFIX(ARG_DST)] =
to_dnnl_memory_desc(self.adjust_shape(output_shape, inputs.size(), output_shape));
auto m = create_arg_map(inputs.size());
assert(m.size() >= inputs.size());
for(int i = 0; i < inputs.size(); i++)
{
result[m[i]] = to_dnnl_memory_desc(self.adjust_shape(inputs[i], i, output_shape));
}
return result;
}
...
...@@ -26,7 +26,6 @@
#include <migraphx/instruction.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/op/identity.hpp>
#include <migraphx/op/batch_norm_inference.hpp>
#include <migraphx/op/convolution.hpp>
#include <migraphx/op/deconvolution.hpp>
#include <migraphx/op/quant_convolution.hpp>
...@@ -216,55 +215,6 @@ struct cpu_pad
};
MIGRAPHX_REGISTER_OP(cpu_pad)
struct leaky_relu_op
{
op::leaky_relu op;
std::string name() const { return "cpu::leaky_relu"; }
auto fcn() const
{
auto a = op.alpha;
return [a](auto x) { return x > 0 ? x : x * a; };
}
};
template <typename Op>
struct cpu_unary2 : auto_register_op<cpu_unary2<Op>>
{
cpu_unary2() = default;
template <class T>
cpu_unary2(T pop) : op(Op{std::move(pop)})
{
}
Op op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op.op, f);
}
std::string name() const { return op.name(); }
shape compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(1);
const auto& s = inputs.at(0);
return {s.type(), s.lens()};
}
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0])([&](auto output, auto input) {
assert(input.get_shape().standard());
std::transform(input.begin(), input.end(), output.begin(), op.fcn());
});
return result;
}
};
template struct cpu_unary2<leaky_relu_op>;
struct cpu_rnn_var_sl_last_output
{
op::rnn_var_sl_last_output op;
...
...@@ -32,7 +32,7 @@ struct dnnl_reorder : dnnl_op<dnnl_reorder, dnnl::reorder>
{
std::string name() const { return "dnnl::reorder"; }
shape adjust_shape(const shape& x, int, const shape&) const { return x; }
shape compute_shape(const std::vector<shape>& inputs) const
{
...
...@@ -33,16 +33,15 @@
#include <migraphx/eliminate_data_type.hpp>
#include <migraphx/eliminate_identity.hpp>
#include <migraphx/eliminate_pad.hpp>
#include <migraphx/layout_nhwc.hpp>
#include <migraphx/memory_coloring.hpp>
#include <migraphx/propagate_constant.hpp>
#include <migraphx/register_target.hpp>
#include <migraphx/replace_allocate.hpp>
#include <migraphx/rewrite_batchnorm.hpp>
#include <migraphx/rewrite_pooling.hpp>
#include <migraphx/rewrite_quantization.hpp>
#include <migraphx/rewrite_rnn.hpp>
#include <migraphx/schedule.hpp>
#include <migraphx/memory_coloring.hpp>
#include <migraphx/simplify_algebra.hpp>
#include <migraphx/simplify_qdq.hpp>
#include <migraphx/simplify_reshapes.hpp>
...@@ -78,14 +77,15 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
eliminate_identity{},
eliminate_pad{},
dead_code_elimination{},
rewrite_batchnorm{},
dead_code_elimination{},
rewrite_rnn{},
dead_code_elimination{},
eliminate_common_subexpression{},
dead_code_elimination{},
simplify_algebra{},
simplify_reshapes{},
layout_nhwc{},
dead_code_elimination{},
simplify_reshapes{},
simplify_algebra{},
auto_contiguous{},
simplify_reshapes{},
...
...@@ -78,15 +78,13 @@ add_library(migraphx_gpu
allocation_model.cpp
argmax.cpp
argmin.cpp
batch_norm_inference.cpp
code_object_op.cpp
compile_ops.cpp
compile_gen.cpp
compile_hip.cpp
compile_hip_code_object.cpp
compile_miopen.cpp
compiler.cpp
convolution.cpp
deconvolution.cpp
device_name.cpp
elu.cpp
fuse_ck.cpp
...@@ -103,7 +101,6 @@ add_library(migraphx_gpu
logsoftmax.cpp
loop.cpp
lrn.cpp
leaky_relu.cpp
mlir.cpp
multinomial.cpp
nonzero.cpp
...@@ -113,7 +110,6 @@ add_library(migraphx_gpu
pad.cpp
perfdb.cpp
pooling.cpp
quant_convolution.cpp
reverse.cpp
rnn_variable_seq_lens.cpp
rocblas.cpp
...@@ -148,16 +144,10 @@ register_migraphx_gpu_ops(hip_
)
register_migraphx_gpu_ops(miopen_
abs
batch_norm_inference
contiguous
convolution
deconvolution
elu
int8_conv_pack
leaky_relu
lrn
pooling
quant_convolution
)
register_op(migraphx_gpu
HEADER migraphx/gpu/rnn_variable_seq_lens.hpp
...@@ -171,6 +161,9 @@ register_op(migraphx_gpu
HEADER migraphx/gpu/gemm.hpp
OPERATORS gpu::rocblas_gemm<op::dot> gpu::rocblas_gemm<op::quant_dot>
INCLUDES migraphx/gpu/context.hpp)
register_op(migraphx_gpu HEADER migraphx/gpu/convolution.hpp
OPERATORS gpu::miopen_convolution<op::convolution> gpu::miopen_convolution<op::deconvolution> gpu::miopen_convolution<op::quant_convolution>
INCLUDES migraphx/gpu/context.hpp)
rocm_set_soversion(migraphx_gpu ${MIGRAPHX_SO_VERSION})
rocm_clang_tidy_check(migraphx_gpu)
...@@ -245,14 +238,14 @@ endif()
include(CheckLibraryExists)
get_target_property(MIOPEN_LOCATION MIOpen LOCATION)
check_library_exists(MIOpen "miopenHiddenSetConvolutionFindMode" "${MIOPEN_LOCATION}" HAS_FIND_MODE_API)
# check_library_exists(MIOpen "miopenFindSolutions" "${MIOPEN_LOCATION}" HAS_FIND_2_API)
# if(HAS_FIND_2_API)
# target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API)
# message(STATUS "MIGraphx is using Find-2.0 API of MIOpen")
# else()
# message(STATUS "MIOpen does not have Find-2.0 API")
# endif()
if(HAS_FIND_MODE_API)
target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_MODE_API)
...
...@@ -145,8 +145,7 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over)
std::size_t compute_block_size(std::size_t n, std::size_t max_block_size)
{
const std::size_t min_block_size = 64;
auto block_size = (((n - 1) / min_block_size + 1)) * min_block_size;
auto block_size = (((n - 1) / base_block_size + 1)) * base_block_size;
return std::min(std::max(min_block_size, block_size), max_block_size);
}
...
...@@ -21,47 +21,81 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/compile_miopen.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/module.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/op/identity.hpp>
#include <migraphx/gpu/rocblas.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct miopen_op
struct miopen_deconvolution
{
operation op = op::identity{};
shared<convolution_descriptor> cd;
miopenConvFwdAlgorithm_t algo{};
uint64_t solution_id = 0;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.op, "op"));
}
std::string name() const { return "gpu::miopen_op"; }
shape compute_shape(std::vector<shape> inputs) const
{
inputs.push_back(inputs.back());
return op.compute_shape(inputs);
} }
std::string name() const { return "gpu::deconv"; }
shape compute_shape(const std::vector<shape>& inputs) const;
argument
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
shape find(context& ctx, const shape& output_shape, std::vector<shape> inputs);
void finalize(context& ctx, const shape& output_shape, std::vector<shape> inputs);
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
MIGRAPHX_REGISTER_OP(miopen_op);
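// Compile the wrapped MIOpen operator with the requested int8_x4 format and report the workspace size it needs.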
std::size_t compile_miopen::compile(operation& op, instruction_ref ins, bool format) const
{
op.from_value({{"int8_x4_format", format}});
auto v = op.compile(*ctx, ins->get_shape(), to_shapes(ins->inputs()));
return v.get<std::size_t>("workspace", 0);
}
void compile_miopen::apply(module& m) const
{
assert(ctx);
const bool int8_x4_format = get_int8_x4_format(any_cast<migraphx::gpu::context>(*ctx));
for(auto ins : iterator_for(m))
{
if(ins->name() != "gpu::miopen_op")
continue;
auto op = any_cast<miopen_op>(ins->get_operator()).op;
std::size_t ws = 0;
try
{
// for the regular convolution and deconvolution, this try would always succeed
ws = compile(op, ins, int8_x4_format);
}
catch(migraphx::exception&)
{
// In case no solver supports the default format, retry using the other format.
ws = compile(op, ins, not int8_x4_format);
}
auto inputs = ins->inputs();
auto alloc = m.insert_instruction(
ins, make_op("allocate", {{"shape", to_value(shape{shape::int8_type, {ws}})}}));
inputs.insert(std::prev(inputs.end()), alloc);
m.replace_instruction(ins, op, inputs);
}
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
...@@ -40,18 +40,25 @@ MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_COMPILE_PARALLEL);
struct precompile_op
{
operation op = op::identity{};
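// additional_args: trailing arguments (typically output allocations) dropped when computing the shape;
// ignore_modules: compute the shape from the wrapped op without passing its submodules.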
std::size_t additional_args = 1;
bool ignore_modules = false;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.op, "op"),
f(self.additional_args, "additional_args"),
f(self.ignore_modules, "ignore_modules"));
}
std::string name() const { return "gpu::precompile_op"; }
shape compute_shape(std::vector<shape> inputs, const std::vector<module_ref>& mods) const
{
// Pop off additional args
inputs.resize(inputs.size() - additional_args);
if(ignore_modules)
return op.compute_shape(inputs);
return op.compute_shape(inputs, mods);
}
...
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/convolution.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/generate.hpp>
#include <miopen/miopen.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape miopen_convolution::compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(4).standard();
std::vector<shape> conv_inputs(inputs.begin(), inputs.begin() + 2);
check_shapes{conv_inputs, *this}.max_ndims(5);
return op.normalize_compute_shape(conv_inputs);
}
inline shape reshape_if_1d(const shape& input)
{
shape new_shape{input};
auto dims = new_shape.lens();
if(dims.size() == 3)
{
std::vector<size_t> new_dims = dims;
new_dims.insert(new_dims.begin() + 2, 1);
new_shape = shape{input.type(), new_dims};
}
return new_shape;
}
argument miopen_convolution::compute(context& ctx,
const shape& output_shape,
const std::vector<argument>& args) const
{
auto x_desc = make_tensor(reshape_if_1d(args[0].get_shape()));
auto w_desc = make_tensor(reshape_if_1d(args[1].get_shape()));
auto y_desc = make_tensor(reshape_if_1d(output_shape));
auto* miopen_stream_handle = ctx.get_stream().get_miopen();
auto workspace_size = args[2].get_shape().bytes();
#ifdef MIGRAPHX_HAS_FIND_2_API
{
const miopenTensorArgument_t tensor_args[3] = {
{miopenTensorConvolutionX, nullptr, args[0].implicit()},
{miopenTensorConvolutionW, nullptr, args[1].implicit()},
{miopenTensorConvolutionY, nullptr, args[3].implicit()},
};
if(solution_ptr.get() == nullptr)
MIGRAPHX_THROW("MIOpen Convolution : Load MIOpen Solution before running it");
auto status = miopenRunSolution(miopen_stream_handle,
solution_ptr.get(),
3,
tensor_args,
args[2].implicit(),
workspace_size);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Convolution: running convolution using find_2.0 failed");
return args[3];
}
#else
// else use immediate mode
if(solution_id == 0)
MIGRAPHX_THROW("MIOpen Convolution: invalid solution ID");
auto status = miopenConvolutionForwardImmediate(miopen_stream_handle,
w_desc.get(),
args[1].implicit(),
x_desc.get(),
args[0].implicit(),
cd.get(),
y_desc.get(),
args[3].implicit(),
args[2].implicit(),
workspace_size,
solution_id);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Convolution: running convolution failed");
return args[3];
#endif
}
shape miopen_convolution::find(context& ctx, const shape& output_shape, std::vector<shape> inputs)
{
shape workspace_shape{};
auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
auto y_desc = make_tensor(reshape_if_1d(output_shape));
std::size_t workspace_size = 0;
#ifdef MIGRAPHX_HAS_FIND_2_API
{
auto conv_problem = make_obj<miopen_problem>(
&miopenCreateConvProblem, cd.get(), miopenProblemDirectionForward);
set_tensor_descriptor(miopenTensorConvolutionX, x_desc, conv_problem);
set_tensor_descriptor(miopenTensorConvolutionW, w_desc, conv_problem);
set_tensor_descriptor(miopenTensorConvolutionY, y_desc, conv_problem);
auto* miopen_stream_handle = ctx.get_stream().get_miopen();
solution_ptr = find_solution(miopen_stream_handle, conv_problem.get());
auto status = miopenGetSolutionWorkspaceSize(solution_ptr.get(), &workspace_size);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Convolution : failed to get solution's workspace size");
std::size_t solution_size;
status = miopenGetSolutionSize(solution_ptr.get(), &solution_size);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Convolution: Failed to fetch solution size");
auto solution_binary = std::vector<char>{};
solution_binary.resize(solution_size);
status = miopenSaveSolution(solution_ptr.get(), solution_binary.data());
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Convolution: Saving solution failed");
solution_object = value::binary{solution_binary.data(), solution_size};
return shape{shape::int8_type, {workspace_size}};
}
#else
// else use immediate find mode
auto status = miopenConvolutionForwardGetWorkSpaceSize(ctx.get_stream().get_miopen(),
w_desc.get(),
x_desc.get(),
cd.get(),
y_desc.get(),
&workspace_size);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Convolution: Failed to get forward workspace size");
workspace_shape = shape{shape::int8_type, {workspace_size}};
auto x = to_gpu(generate_argument(inputs[0]));
auto w = to_gpu(generate_argument(inputs[1]));
auto y = allocate_gpu(output_shape);
auto workspace = allocate_gpu(workspace_shape);
int algo_count = 1;
miopenConvAlgoPerf_t perf;
status = miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(),
x_desc.get(),
x.implicit(),
w_desc.get(),
w.implicit(),
cd.get(),
y_desc.get(),
y.implicit(),
1,
&algo_count,
&perf,
workspace.implicit(),
workspace_size,
false);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Convolution: find convolution failed");
algo = perf.fwd_algo;
size_t solution_count;
status = miopenConvolutionForwardGetSolutionCount(ctx.get_stream().get_miopen(),
w_desc.get(),
x_desc.get(),
cd.get(),
y_desc.get(),
&solution_count);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Convolution: get solution count failed");
std::vector<miopenConvSolution_t> solutions(solution_count);
status = miopenConvolutionForwardGetSolution(ctx.get_stream().get_miopen(),
w_desc.get(),
x_desc.get(),
cd.get(),
y_desc.get(),
solution_count,
&solution_count,
solutions.data());
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Convolution: get solution failed");
solution_id = solutions.front().solution_id;
return shape{shape::int8_type, {perf.memory}};
#endif
}
void miopen_convolution::finalize(context& ctx,
const shape& output_shape,
const std::vector<shape>& inputs)
{
#ifdef MIGRAPHX_HAS_FIND_2_API
{
(void)(ctx); // avoid warnings
(void)(output_shape);
(void)(inputs);
// load solution
if(solution_ptr == nullptr)
{
miopenSolution_t ptr;
auto status = miopenLoadSolution(&ptr,
reinterpret_cast<const char*>(solution_object.data()),
solution_object.size());
solution_ptr = miopen_solution{ptr};
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Convolution: loading convolution solution failed");
}
}
#else
// Use immediate mode API
{
if(cd == nullptr)
cd = make_conv(op);
if(solution_id == 0)
{
// Check that workspace hasn't changed
auto size = inputs.at(2).bytes();
auto ws = find(ctx, output_shape, inputs);
if(ws.bytes() > size)
MIGRAPHX_THROW("MIOpen Convolution: workspace has changed during finalization.");
}
auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
auto y_desc = make_tensor(reshape_if_1d(output_shape));
auto status = miopenConvolutionForwardCompileSolution(ctx.get_stream().get_miopen(),
w_desc.get(),
x_desc.get(),
cd.get(),
y_desc.get(),
solution_id);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Convolution: compile solution failed");
}
#endif
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/deconvolution.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/generate.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape miopen_deconvolution::compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(4).standard();
std::vector<shape> conv_inputs(inputs.begin(), inputs.begin() + 2);
check_shapes{conv_inputs, *this}.max_ndims(5);
return op.compute_shape(conv_inputs);
}
inline shape reshape_if_1d(const shape& input)
{
shape new_shape{input};
auto dims = new_shape.lens();
if(dims.size() == 3)
{
std::vector<size_t> new_dims = dims;
new_dims.insert(new_dims.begin() + 2, 1);
new_shape = shape{input.type(), new_dims};
}
return new_shape;
}
argument miopen_deconvolution::compute(context& ctx,
const shape& output_shape,
const std::vector<argument>& args) const
{
auto x_desc = make_tensor(reshape_if_1d(args[0].get_shape()));
auto w_desc = make_tensor(reshape_if_1d(args[1].get_shape()));
auto y_desc = make_tensor(reshape_if_1d(output_shape));
if(solution_id == 0)
MIGRAPHX_THROW("MIOpen Deconvolution: invalid solution ID");
auto status = miopenConvolutionForwardImmediate(ctx.get_stream().get_miopen(),
w_desc.get(),
args[1].implicit(),
x_desc.get(),
args[0].implicit(),
cd.get(),
y_desc.get(),
args[3].implicit(),
args[2].implicit(),
args[2].get_shape().bytes(),
solution_id);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Deconvolution: running convolution failed");
return args[3];
}
shape miopen_deconvolution::find(context& ctx, const shape& output_shape, std::vector<shape> inputs)
{
shape workspace_shape{};
auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
auto y_desc = make_tensor(reshape_if_1d(output_shape));
std::size_t workspace_size = 0;
miopenConvolutionForwardGetWorkSpaceSize(ctx.get_stream().get_miopen(),
w_desc.get(),
x_desc.get(),
cd.get(),
y_desc.get(),
&workspace_size);
workspace_shape = shape{shape::int8_type, {workspace_size}};
auto x = to_gpu(generate_argument(inputs[0]));
auto w = to_gpu(generate_argument(inputs[1]));
auto y = allocate_gpu(output_shape);
auto workspace = allocate_gpu(workspace_shape);
int algo_count = 1;
miopenConvAlgoPerf_t perf;
auto status = miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(),
x_desc.get(),
x.implicit(),
w_desc.get(),
w.implicit(),
cd.get(),
y_desc.get(),
y.implicit(),
1,
&algo_count,
&perf,
workspace.implicit(),
workspace_size,
false);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Deconvolution: find convolution failed");
algo = perf.fwd_algo;
size_t solution_count;
status = miopenConvolutionForwardGetSolutionCount(ctx.get_stream().get_miopen(),
w_desc.get(),
x_desc.get(),
cd.get(),
y_desc.get(),
&solution_count);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Deconvolution: get solution count failed");
std::vector<miopenConvSolution_t> solutions(solution_count);
status = miopenConvolutionForwardGetSolution(ctx.get_stream().get_miopen(),
w_desc.get(),
x_desc.get(),
cd.get(),
y_desc.get(),
solution_count,
&solution_count,
solutions.data());
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Deconvolution: get solution failed");
solution_id = solutions.front().solution_id;
return shape{shape::int8_type, {perf.memory}};
}
void miopen_deconvolution::finalize(context& ctx,
const shape& output_shape,
std::vector<shape> inputs)
{
if(cd == nullptr)
cd = make_deconv(op);
if(solution_id == 0)
{
// Check that workspace hasn't changed
auto size = inputs.at(2).bytes();
auto ws = find(ctx, output_shape, inputs);
if(ws.bytes() > size)
MIGRAPHX_THROW("MIOpen Deconvolution: workspace has changed during finalization.");
}
auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
auto y_desc = make_tensor(reshape_if_1d(output_shape));
auto status = miopenConvolutionForwardCompileSolution(ctx.get_stream().get_miopen(),
w_desc.get(),
x_desc.get(),
cd.get(),
y_desc.get(),
solution_id);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Deconvolution: compile solution failed");
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/elu.hpp>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape miopen_elu::compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(2).not_broadcasted();
return inputs.at(1);
}
argument miopen_elu::compute(context& ctx,
const shape& output_shape,
const std::vector<argument>& args) const
{
float alpha = 1;
float beta = 0;
auto x_desc = make_tensor(args[0].get_shape());
auto y_desc = make_tensor(output_shape);
miopenActivationForward(ctx.get_stream().get_miopen(),
ad.get(),
&alpha,
x_desc.get(),
args[0].implicit(),
&beta,
y_desc.get(),
args[1].implicit());
return args[1];
}
void miopen_elu::finalize(context&, const shape&, const std::vector<shape>&)
{
ad = make_elu(op.alpha);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
...@@ -49,7 +49,7 @@ struct mlir_conv
std::string name() const { return "gpu::mlir_conv"; }
shape compute_shape(std::vector<shape> inputs, const std::vector<module_ref>& mods) const
{
check_shapes{inputs, *this}.packed_or_broadcasted();
if(mods.size() != 1)
MIGRAPHX_THROW("should have one submodule.");
if(inputs.size() < 2)
...@@ -70,6 +70,9 @@ MIGRAPHX_PRED_MATCHER(is_mlir_conv, instruction_ref ins)
auto group = v.at("group").to<int>();
if(group != 1)
return false;
// Avoid MLIR assertion: Index < Length && "Invalid index!"
if(ins->get_shape().lens().size() != 4)
return false;
return true;
}
...@@ -96,9 +99,10 @@ struct find_conv_pointwise
i.name());
}))
return;
// Only fuse with fp32/fp16
if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [&](auto i) {
return not contains({shape::type_t::float_type, shape::type_t::half_type},
i->get_shape().type());
}))
return;
std::sort(names.begin(), names.end());
...
...@@ -26,7 +26,6 @@
#include <migraphx/gpu/fuse_ops.hpp>
#include <migraphx/matcher.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <migraphx/gpu/convolution.hpp>
#include <migraphx/gpu/device_name.hpp>
#include <migraphx/gpu/oper.hpp>
#include <migraphx/gpu/gemm.hpp>
...@@ -190,10 +189,12 @@ MIGRAPHX_PRED_MATCHER(fusable_conv, instruction_ref ins)
return false;
auto wei = ins->inputs().at(1)->get_shape();
assert(wei.lens().size() == 4);
auto miopen_conv_op = ins->get_operator().to_value();
auto algo = miopen_conv_op.at("algo").to<miopenConvFwdAlgorithm_t>();
auto conv_op = from_value<op::convolution>(miopen_conv_op["op"]);
if(conv_op.group > 1)
return false;
if(wei.lens()[1] > 512 and algo != miopenConvolutionFwdAlgoWinograd)
return false;
// Do not fuse non-symmetric input
...@@ -201,13 +202,12 @@ MIGRAPHX_PRED_MATCHER(fusable_conv, instruction_ref ins)
if(input_lens[2] != input_lens[3] or wei.lens()[2] != wei.lens()[3])
return false;
auto op = conv.op;
// Don't fuse winograd for non-3x3s since there is no fused winograd for those configs
if(algo == miopenConvolutionFwdAlgoWinograd and wei.lens()[2] != 3 and wei.lens()[3] != 3 and
contains({{1, 1}}, conv_op.stride))
return false;
return contains({{0, 0, 0, 0}, {1, 1, 1, 1}, {2, 2, 2, 2}}, conv_op.padding) and
contains({{0, 0}, {1, 1}}, conv_op.stride) and contains({{1, 1}}, conv_op.dilation);
}
void move_broadcasted_back(std::vector<instruction_ref>& args)
...@@ -462,7 +462,7 @@ void apply_conv_bias(context& ctx, module& m, const match::matcher_result& r)
auto ins = r.result;
auto input_ins = conv_ins->inputs().at(0);
auto weights_ins = conv_ins->inputs().at(1);
auto conv_op = from_value<op::convolution>((conv_ins->get_operator()).to_value()["op"]);
auto alloc_ins = ins->inputs().back();
auto old_ws_ins = conv_ins->inputs().at(2);
...@@ -528,7 +528,7 @@ struct find_conv_pointwise
auto ins = r.result;
auto input_ins = conv_ins->inputs().at(0);
auto weights_ins = conv_ins->inputs().at(1);
auto conv_op = from_value<op::convolution>(conv_ins->get_operator().to_value()["op"]);
auto alloc_ins = ins->inputs().back();
module_ref pm = ins->module_inputs().front();
...@@ -553,11 +553,13 @@ struct find_gemm_pointwise
{
auto matcher() const
{
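// Match a pointwise module applied directly to a gemm result: either a binary op whose other
// operand is a constant or standard-shape input, or a unary op handled later by update_gemm.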
return precompile_name("pointwise")( auto gemm_op = match::name("gpu::gemm")(match::nargs(3), match::used_once()).bind("gemm");
auto binary_op = match::all_of(
match::nargs(3),
match::either_arg(0, 1)(
match::any_of(match::standard_shape(), match::is_constant()).bind("c"), gemm_op));
auto unary_op = match::all_of(match::nargs(2), match::arg(0)(gemm_op));
return precompile_name("pointwise")(match::any_of(binary_op, unary_op));
}
// TODO: Move to matcher.hpp
...@@ -589,15 +591,29 @@ struct find_gemm_pointwise
return match::name("@return")(match::args(match::any_of(add, mul_add, add_mul)));
}
static auto match_mul(const std::string& input)
{
auto mul = match_mul_const(match_param(input), "alpha");
return match::name("@return")(match::args(mul));
}
static float get_float(instruction_ref ins) { return ins->get_literal().at<float>(); }
template <class Gemm>
static bool update_gemm(Gemm& gemm, module_ref pm, unsigned input)
{
auto names = pm->get_parameter_names();
if(names.size() != 2)
return false;
std::sort(names.begin(), names.end());
if(names.size() == 1)
{
auto mr = match::match_instruction(*pm, std::prev(pm->end()), match_mul(names[input]));
if(mr.result == pm->end())
return false;
gemm.alpha *= get_float(mr.instructions["alpha"]);
return true;
}
else if(names.size() == 2)
{
unsigned output = input == 0 ? 1 : 0;
auto mr = match::match_instruction(
*pm, std::prev(pm->end()), match_add(names[input], names[output]));
...@@ -614,24 +630,35 @@ struct find_gemm_pointwise
}
return true;
}
else
{
return false;
}
}
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto gemm_ins = r.instructions["gemm"];
auto c_ins = r.instructions["c"];
auto gemm = any_cast<rocblas_gemm<op::dot>>(gemm_ins->get_operator());
// Already fused gemm
if(not float_equal(gemm.beta, 0))
return;
if(ins->inputs().size() == 3)
gemm.beta = 1;
if(not update_gemm(
gemm, ins->module_inputs().front(), ins->inputs().front() == gemm_ins ? 0 : 1))
return;
auto inputs = gemm_ins->inputs();
inputs.pop_back();
if(ins->inputs().size() == 3)
{
auto c_ins = r.instructions["c"];
// const-fold input if not standard shape since rocblas can't handle it
if(not c_ins->get_shape().standard())
{
...@@ -639,11 +666,9 @@ struct find_gemm_pointwise
auto l = c.compute(c.compute_shape({c_ins->get_shape()}), {c_ins->eval()});
c_ins = m.add_literal(l.get_shape(), l.data());
}
auto inputs = gemm_ins->inputs();
inputs.pop_back();
inputs.push_back(c_ins);
}
inputs.push_back(ins->inputs().back());
m.replace_instruction(ins, gemm, inputs);
...@@ -772,11 +797,9 @@ struct find_layernorm_pointwise
{
auto ins = r.result;
auto layernorm = r.instructions["layernorm"];
auto* pm = ins->module_inputs().front();
if(not layernorm->module_inputs().empty())
return;
auto* pm = ins->module_inputs().front();
auto inputs = layernorm->inputs();
inputs.pop_back();
inputs.insert(inputs.end(), ins->inputs().begin() + 1, ins->inputs().end());
...@@ -785,6 +808,37 @@ struct find_layernorm_pointwise
}
};
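// Fuse a pointwise module into a preceding precompiled concat; skipped when the
// output types differ, since type conversions are not handled yet.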
struct find_concat_pointwise
{
auto matcher() const
{
return precompile_name("pointwise")(
match::arg(0)(precompile_name("concat").bind("concat")));
}
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto concat = r.instructions["concat"];
if(not concat->module_inputs().empty())
return;
// TODO: Handle type conversions
if(ins->get_shape().type() != concat->get_shape().type())
return;
auto* pm = ins->module_inputs().front();
auto inputs = concat->inputs();
inputs.pop_back();
inputs.insert(inputs.end(), ins->inputs().begin() + 1, ins->inputs().end());
auto op = concat->get_operator();
op.from_value({{"additional_args", ins->inputs().size() - 1}, {"ignore_modules", true}});
m.replace_instruction(ins, op, inputs, {pm});
}
};
void fuse_ops::apply(module& m) const
{
match::find_matches(m, find_contiguous_pointwise{});
...@@ -793,6 +847,7 @@ void fuse_ops::apply(module& m) const
run_passes(m, {dead_code_elimination{}});
match::find_matches(m,
find_layernorm_pointwise{},
find_concat_pointwise{},
find_gemm_pointwise{},
find_contiguous_tranpose_gemm{},
find_commutative_broadcast{});
...
...@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <rocblas/rocblas.h>
#include <migraphx/gpu/gemm_impl.hpp>
#include <migraphx/reduce_dims.hpp>
#include <migraphx/permutation.hpp>
...
...@@ -183,8 +183,8 @@ argument register_on_gpu(const argument& arg)
{
auto arg_shared = arg.share();
auto p = register_on_gpu(arg_shared.data(), arg_shared.get_shape().bytes());
auto s = arg_shared.get_shape();
return {s, [p, a = std::move(arg_shared)]() mutable { return get_device_ptr(p.get()); }};
}
argument to_gpu(const argument& arg, bool host)
...
...@@ -21,41 +21,31 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_GPU_COMPILE_MIOPEN_HPP
#define MIGRAPHX_GUARD_GPU_COMPILE_MIOPEN_HPP
#include <migraphx/config.hpp>
#include <migraphx/instruction_ref.hpp>
#include <string>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct module;
struct context;
struct operation;
namespace gpu {
{
op::batch_norm_inference op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "gpu::batch_norm_inference"; } struct compile_miopen
shape compute_shape(const std::vector<shape>& inputs) const; {
argument context* ctx = nullptr;
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const; std::string name() const { return "gpu::compile_miopen"; }
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const void apply(module& m) const;
{ std::size_t compile(operation& op, instruction_ref ins, bool format) const;
return shapes.size() - 1;
}
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILE_MIOPEN_HPP
#endif