Commit 2fc6b715 authored by Paul
Browse files

Merge

parents 5967d68d 118e05c7
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
#include <migraphx/config.hpp> #include <migraphx/config.hpp>
#include <migraphx/filesystem.hpp> #include <migraphx/filesystem.hpp>
#include <migraphx/compile_src.hpp> #include <migraphx/compile_src.hpp>
#include <migraphx/functional.hpp>
#include <string> #include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
...@@ -35,6 +36,26 @@ namespace migraphx { ...@@ -35,6 +36,26 @@ namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS { inline namespace MIGRAPHX_INLINE_NS {
namespace gpu { namespace gpu {
// Serializable mirror of src_file used to hand HIP sources to the hiprtc
// compiler: the filesystem path and the (begin, end) character range of the
// source contents are flattened into owned strings.
struct hiprtc_src_file
{
    hiprtc_src_file() = default;
    // NOTE(review): single-arg ctor is non-explicit, presumably to allow
    // transparent conversion from src_file — confirm that is intended.
    hiprtc_src_file(const src_file& s)
        : path(s.path.string()), content(s.content.first, s.content.second)
    {
    }
    std::string path;
    std::string content;
    // Reflection over (path, content) hooks this struct into MIGraphX's
    // generic serialization machinery.
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return pack(f(self.path, "path"), f(self.content, "content"));
    }
};
std::vector<std::vector<char>> compile_hip_src_with_hiprtc(std::vector<hiprtc_src_file> srcs,
std::string params,
const std::string& arch);
std::vector<std::vector<char>> std::vector<std::vector<char>>
compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std::string& arch); compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std::string& arch);
......
...@@ -71,6 +71,8 @@ operation compile_hip_code_object(const std::string& content, hip_compile_option ...@@ -71,6 +71,8 @@ operation compile_hip_code_object(const std::string& content, hip_compile_option
std::size_t compute_block_size(std::size_t n, std::size_t max_block_size = 1024); std::size_t compute_block_size(std::size_t n, std::size_t max_block_size = 1024);
std::string generate_make_shape(const shape& s);
} // namespace gpu } // namespace gpu
} // namespace MIGRAPHX_INLINE_NS } // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx } // namespace migraphx
......
...@@ -21,13 +21,12 @@ ...@@ -21,13 +21,12 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE. * THE SOFTWARE.
*/ */
#ifndef MIGRAPHX_GUARD_RTGLIB_CONVOLUTION_HPP #ifndef MIGRAPHX_GUARD_RTGLIB_GPU_CONVOLUTION_HPP
#define MIGRAPHX_GUARD_RTGLIB_CONVOLUTION_HPP #define MIGRAPHX_GUARD_RTGLIB_GPU_CONVOLUTION_HPP
#include <migraphx/shape.hpp> #include <migraphx/shape.hpp>
#include <migraphx/generate.hpp> #include <migraphx/generate.hpp>
#include <migraphx/operation.hpp> #include <migraphx/operation.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/gpu/miopen.hpp> #include <migraphx/gpu/miopen.hpp>
#include <migraphx/op/identity.hpp> #include <migraphx/op/identity.hpp>
#include <migraphx/op/convolution.hpp> #include <migraphx/op/convolution.hpp>
......
...@@ -98,6 +98,13 @@ struct hip_sync_stream ...@@ -98,6 +98,13 @@ struct hip_sync_stream
return {}; return {};
return args.front(); return args.front();
} }
// The sync op passes its first argument straight through, so the output
// aliases input 0; -1 signals "no alias" when there are no arguments.
std::ptrdiff_t output_alias(const std::vector<shape>& args) const
{
    return args.empty() ? -1 : 0;
}
}; };
struct hip_copy_to_gpu struct hip_copy_to_gpu
......
...@@ -56,7 +56,6 @@ struct oper ...@@ -56,7 +56,6 @@ struct oper
return name.substr(pos_ns + 2); return name.substr(pos_ns + 2);
} }
} }
return "unknown_operator_name"; return "unknown_operator_name";
} }
}; };
......
...@@ -31,7 +31,6 @@ ...@@ -31,7 +31,6 @@
#include <migraphx/argument.hpp> #include <migraphx/argument.hpp>
#include <migraphx/config.hpp> #include <migraphx/config.hpp>
#include <migraphx/reduce_dims.hpp> #include <migraphx/reduce_dims.hpp>
#include <migraphx/type_name.hpp>
#include <utility> #include <utility>
#include <iostream> #include <iostream>
......
...@@ -33,7 +33,6 @@ ...@@ -33,7 +33,6 @@
#include <migraphx/shape.hpp> #include <migraphx/shape.hpp>
#include <migraphx/argument.hpp> #include <migraphx/argument.hpp>
#include <migraphx/config.hpp> #include <migraphx/config.hpp>
#include <migraphx/type_name.hpp>
#include <utility> #include <utility>
#include <iostream> #include <iostream>
......
...@@ -31,7 +31,6 @@ ...@@ -31,7 +31,6 @@
#include <migraphx/shape.hpp> #include <migraphx/shape.hpp>
#include <migraphx/argument.hpp> #include <migraphx/argument.hpp>
#include <migraphx/config.hpp> #include <migraphx/config.hpp>
#include <migraphx/type_name.hpp>
#include <utility> #include <utility>
#include <iostream> #include <iostream>
......
...@@ -37,7 +37,6 @@ struct target ...@@ -37,7 +37,6 @@ struct target
std::string name() const; std::string name() const;
std::vector<pass> get_passes(migraphx::context& gctx, const compile_options& options) const; std::vector<pass> get_passes(migraphx::context& gctx, const compile_options& options) const;
migraphx::context get_context() const; migraphx::context get_context() const;
argument copy_to(const argument& arg) const; argument copy_to(const argument& arg) const;
argument copy_from(const argument& arg) const; argument copy_from(const argument& arg) const;
argument allocate(const shape& s) const; argument allocate(const shape& s) const;
......
...@@ -78,7 +78,9 @@ struct concat_compiler : compiler<concat_compiler> ...@@ -78,7 +78,9 @@ struct concat_compiler : compiler<concat_compiler>
options.params = "-Wno-float-equal"; options.params = "-Wno-float-equal";
options.kernel_name = v.get("kernel", "concat_kernel"); options.kernel_name = v.get("kernel", "concat_kernel");
auto axis = find_fast_axis(options.inputs); auto axis = find_fast_axis(options.inputs);
auto vec = vectorize::elements(ctx, axis, options.inputs); vectorize vec{};
if(axis != v.at("axis").to<std::size_t>())
vec = vectorize::elements(ctx, axis, options.inputs);
options.set_launch_params( options.set_launch_params(
v, compute_global_for(ctx, get_concat_elements(options.inputs) / vec.size, 256)); v, compute_global_for(ctx, get_concat_elements(options.inputs) / vec.size, 256));
auto src = interpolate_string( auto src = interpolate_string(
......
...@@ -32,7 +32,7 @@ namespace gpu { ...@@ -32,7 +32,7 @@ namespace gpu {
struct mlir_compiler : compiler<mlir_compiler> struct mlir_compiler : compiler<mlir_compiler>
{ {
std::vector<std::string> names() const { return {"gpu::mlir_conv"}; } std::vector<std::string> names() const { return {"gpu::mlir_op"}; }
operation compile_op(context&, const std::vector<shape>&, const value&) const { return {}; } operation compile_op(context&, const std::vector<shape>&, const value&) const { return {}; }
......
...@@ -60,15 +60,6 @@ __global__ void reduce_kernel(void* input_p, void* output_p) ...@@ -60,15 +60,6 @@ __global__ void reduce_kernel(void* input_p, void* output_p)
)__migraphx__"; )__migraphx__";
static std::size_t get_reduce_elements(const std::vector<shape>& inputs)
{
return inputs.front().elements() / inputs.back().elements();
}
static std::size_t get_reduce_elements(const std::vector<instruction_ref>& inputs)
{
return get_reduce_elements(to_shapes(inputs));
}
static std::vector<std::size_t> get_reduce_lens(const std::vector<std::size_t>& input_lens, static std::vector<std::size_t> get_reduce_lens(const std::vector<std::size_t>& input_lens,
const std::vector<std::size_t>& output_lens) const std::vector<std::size_t>& output_lens)
{ {
...@@ -86,9 +77,28 @@ static std::vector<std::size_t> get_reduce_lens(const std::vector<std::size_t>& ...@@ -86,9 +77,28 @@ static std::vector<std::size_t> get_reduce_lens(const std::vector<std::size_t>&
return reduce_lens; return reduce_lens;
} }
static std::string get_reduce_algo(const std::vector<shape>& inputs) template <class T>
static shape get_reduced_shape(const shape& s, const std::vector<T>& axes)
{
auto lens = s.lens();
std::fill(lens.begin(), lens.end(), 1);
for(const auto& axis : axes)
lens[axis] = s.lens()[axis];
return shape{s.type(), lens};
}
// Builds the reduction's output shape: dims listed in `axes` collapse to 1,
// all remaining dims keep their original size.
template <class T>
static shape get_output_shape(const shape& s, const std::vector<T>& axes)
{
    auto dims = s.lens();
    for(const auto& ax : axes)
        dims[ax] = 1;
    return shape{s.type(), dims};
}
template <class ReduceLens>
static std::string get_reduce_algo(const std::vector<shape>& inputs, ReduceLens rlens)
{ {
auto rlens = get_reduce_lens(inputs.front().lens(), inputs.back().lens());
const auto init = std::numeric_limits<std::size_t>::max(); const auto init = std::numeric_limits<std::size_t>::max();
// The minimum stride // The minimum stride
auto min_stride = std::inner_product( auto min_stride = std::inner_product(
...@@ -103,11 +113,27 @@ static std::string get_reduce_algo(const std::vector<shape>& inputs) ...@@ -103,11 +113,27 @@ static std::string get_reduce_algo(const std::vector<shape>& inputs)
return "block"; return "block";
} }
struct reduce_compiler : compiler<reduce_compiler> static std::string get_reduce_algo(const std::vector<shape>& inputs)
{
auto rlens = get_reduce_lens(inputs.front().lens(), inputs.back().lens());
return get_reduce_algo(inputs, rlens);
}
struct simple_reduce_compiler : compiler<simple_reduce_compiler>
{ {
std::vector<std::string> names() const std::vector<std::string> names() const
{ {
return {"reduce", "reduce_sum", "reduce_mean", "reduce_max", "reduce_min", "reduce_prod"}; return {"simple_reduce",
"reduce_sum",
"reduce_mean",
"reduce_max",
"reduce_min",
"reduce_prod"};
}
static std::size_t get_reduce_elements(const std::vector<shape>& inputs)
{
return inputs.front().elements() / inputs.back().elements();
} }
operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
...@@ -127,7 +153,7 @@ struct reduce_compiler : compiler<reduce_compiler> ...@@ -127,7 +153,7 @@ struct reduce_compiler : compiler<reduce_compiler>
vec = vectorize::elements(ctx, faxis, options.virtual_inputs); vec = vectorize::elements(ctx, faxis, options.virtual_inputs);
auto relements = get_reduce_elements(options.virtual_inputs) / vec.size; auto relements = get_reduce_elements(options.virtual_inputs) / vec.size;
auto block_size = compute_block_size(relements, 256); auto block_size = compute_block_size(relements, 256);
if(relements > block_size * 256) if(relements >= block_size * 256)
algo = "block_large"; algo = "block_large";
options.set_launch_params( options.set_launch_params(
v, compute_global_for(ctx, nelements * block_size, 256), block_size); v, compute_global_for(ctx, nelements * block_size, 256), block_size);
...@@ -157,44 +183,108 @@ struct reduce_compiler : compiler<reduce_compiler> ...@@ -157,44 +183,108 @@ struct reduce_compiler : compiler<reduce_compiler>
compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
{ {
value v = value::object{}; value v = value::object{};
if(op.name() == "reduce_sum") reduce_op r{};
{ r.set(ins, op);
v["reduction"] = "op::sum{}"; v["reduction"] = r.reduction;
} v["read"] = r.read;
else if(op.name() == "reduce_mean") v["write"] = r.write;
{ v["init"] = r.init;
auto reduce_elements = get_reduce_elements(ins->inputs()); return replace(compile_op(ctx, to_shapes(ins->inputs()), v));
auto reduce_type = ins->inputs().front()->get_shape().type(); }
v["reduction"] = "op::sum{}"; };
std::string mean = "op::mean<" + std::to_string(reduce_elements) + ">{}";
// Use float accumulator when reduction size is too large for half static const char* const fused_reduce_kernel = R"__migraphx__(
if(reduce_type == shape::half_type and reduce_elements > 16384) #include <migraphx/kernels/index.hpp>
v["read"] = "compose(" + mean + ", op::convert_to<float>{})"; #include <migraphx/kernels/reduce.hpp>
else if(contains({shape::float_type, shape::half_type, shape::double_type}, #include <migraphx/kernels/pointwise.hpp>
reduce_type)) #include <migraphx/kernels/vectorize.hpp>
v["read"] = mean; #include <args.hpp>
else
v["write"] = mean; namespace migraphx {
}
else if(op.name() == "reduce_max") ${preamble}
{
v["reduction"] = "op::max{}"; extern "C" {
v["init"] = "lowest{}"; MIGRAPHX_GLOBAL void ${kernel}(${params})
} {
else if(op.name() == "reduce_min") transform_args(make_tensors(), rotate_last(), ${transformers})(${args})([](auto y, auto... xs) {
fused_reduce<reduce::${algo}, ${reduced}>(y, partial(${lambda})(xs...));
});
}
}
} // namespace migraphx
)__migraphx__";
struct fused_reduce_compiler : compiler<fused_reduce_compiler>
{
std::vector<std::string> names() const { return {"fused_reduce"}; }
operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
{
auto axes = v.at("axes").to_vector<std::size_t>();
auto virtual_inputs = inputs;
virtual_inputs.push_back(get_reduced_shape(inputs.front(), axes));
virtual_inputs.push_back(get_output_shape(inputs.front(), axes));
virtual_inputs = reduce_dims(virtual_inputs);
auto reduce_output_shape = virtual_inputs.back();
virtual_inputs.pop_back();
auto reduction_shape = virtual_inputs.back();
virtual_inputs.pop_back();
hip_compile_options options;
options.inputs = inputs;
options.output = inputs.back();
options.virtual_inputs = virtual_inputs;
auto faxis = find_fast_axis({options.virtual_inputs.front()});
vectorize vec{};
auto nelements = reduce_output_shape.elements();
auto algo = v.get("algo", get_reduce_algo(options.virtual_inputs, reduction_shape.lens()));
if(algo == "block")
{ {
v["reduction"] = "op::min{}"; // Vectorize if the axis is a reduction axis
v["init"] = "highest{}"; if(reduce_output_shape.lens()[faxis] == 1)
vec = vectorize::elements(ctx, faxis, options.virtual_inputs);
auto relements = reduction_shape.elements() / vec.size;
auto block_size = compute_block_size(relements, 256);
if(relements >= block_size * 256)
algo = "block_large";
options.set_launch_params(
v, compute_global_for(ctx, nelements * block_size, 256), block_size);
} }
else if(op.name() == "reduce_prod") else if(algo == "lane")
{ {
v["reduction"] = "op::product{}"; options.set_launch_params(v, compute_global_for(ctx, nelements, 256));
v["init"] = "1";
} }
else else
{ {
MIGRAPHX_THROW("Unsupported reduce"); MIGRAPHX_THROW("Unknown reduce algo: " + algo);
} }
options.kernel_name = v.get("kernel", "reduce_kernel");
auto src = interpolate_string(
fused_reduce_kernel,
{{"kernel", options.kernel_name},
{"params", enum_params(inputs.size(), "void * private_p")},
{"args", enum_params(inputs.size(), "private_p")},
{"algo", algo},
{"reduced", "decltype(" + generate_make_shape(reduce_output_shape) + ")"},
{"lambda", v.at("lambda").to<std::string>()},
{"transformers", make_transformer_args(vec)},
{"preamble", v.get("preamble", std::string{})}});
options.params += "-Wno-float-equal";
return compile_hip_code_object(src, options);
}
compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
{
assert(not ins->module_inputs().empty());
auto v = op.to_value();
auto* rm = ins->module_inputs().front();
v["preamble"] = generate_reduce(*rm, "fused_reduce_op");
v["lambda"] = "MIGRAPHX_LIFT(fused_reduce_op)";
v["kernel"] = generate_name_from_ops(*rm) + "_kernel";
return replace(compile_op(ctx, to_shapes(ins->inputs()), v)); return replace(compile_op(ctx, to_shapes(ins->inputs()), v));
} }
}; };
......
...@@ -195,6 +195,14 @@ constexpr auto compose(Fs... fs) ...@@ -195,6 +195,14 @@ constexpr auto compose(Fs... fs)
})(fs...); })(fs...);
} }
// Curried partial application: partial(f)(front...) yields a callable that,
// when invoked with the remaining arguments, calls f with both groups.
// static_cast<decltype(back)>(back) forwards each trailing argument with its
// original value category (a std::forward equivalent usable in this header).
template <class F>
constexpr auto partial(F f)
{
    return [=](auto... front) {
        return [=](auto&&... back) {
            return f(front..., static_cast<decltype(back)>(back)...);
        };
    };
}
template <class... Ts> template <class... Ts>
constexpr auto pack(Ts... xs) constexpr auto pack(Ts... xs)
{ {
......
...@@ -233,6 +233,12 @@ struct index ...@@ -233,6 +233,12 @@ struct index
} }
}; };
// MIGRAPHX_GLOBAL marks a function as a GPU kernel entry point. When the
// block size is known at compile time (MIGRAPHX_NLOCAL defined), the flat
// workgroup size is pinned to exactly that value, letting the compiler
// tailor register allocation to the actual launch configuration.
#ifdef MIGRAPHX_NLOCAL
#define MIGRAPHX_GLOBAL \
    __global__ __attribute__((amdgpu_flat_work_group_size(MIGRAPHX_NLOCAL, MIGRAPHX_NLOCAL)))
#else
#define MIGRAPHX_GLOBAL __global__
#endif
inline __device__ __attribute__((const)) index make_index() inline __device__ __attribute__((const)) index make_index()
{ {
return index{blockIdx.x * blockDim.x + threadIdx.x, threadIdx.x, blockIdx.x}; // NOLINT return index{blockIdx.x * blockDim.x + threadIdx.x, threadIdx.x, blockIdx.x}; // NOLINT
......
...@@ -174,6 +174,25 @@ struct inner_storage_tag ...@@ -174,6 +174,25 @@ struct inner_storage_tag
template <class T> template <class T>
using is_inner_storage = is_base_of<inner_storage_tag, remove_cv_t<remove_reference_t<T>>>; using is_inner_storage = is_base_of<inner_storage_tag, remove_cv_t<remove_reference_t<T>>>;
// Lazily-evaluated inner storage: instead of materializing the reduction
// slice, it stores a callable `f` that computes element (j, d) on demand.
// Deriving from inner_storage_tag makes is_inner_storage detect it.
template <class Size, class F>
struct lazy_inner_storage : inner_storage_tag
{
    // Element type produced by invoking f
    using type = remove_reference_t<decltype(declval<F>()(0, _c<0>))>;
    F f;
    // The size is encoded in the Size type; the runtime value is empty
    constexpr Size rsize() const { return {}; }
    template <class U, class V>
    constexpr auto operator()(U j, V d) const
    {
        return f(j, d);
    }
};

// Factory deducing Size and F; the Size argument is used only for deduction.
template <class Size, class F>
constexpr lazy_inner_storage<Size, F> make_lazy_inner_storage(Size, F f)
{
    return {{}, f};
}
template <class R, class F> template <class R, class F>
struct storage_access : F struct storage_access : F
{ {
...@@ -278,6 +297,14 @@ struct reducer_base ...@@ -278,6 +297,14 @@ struct reducer_base
}); });
} }
// Builds a lazy inner storage over the sliced inputs: reduction elements are
// not materialized; each (j, d) element is produced on demand by applying f
// to the corresponding elements of the inputs.
template <class F>
__device__ auto lazy_inner(F f) const
{
    return this->inner_sliced([=](auto n, auto&&... xs) {
        return make_lazy_inner_storage(n, [=](auto j, auto d) { return f(xs(j, d)...); });
    });
}
template <class Op, class T, class Read> template <class Op, class T, class Read>
__device__ auto reduce(Op op, T init, Read read) const __device__ auto reduce(Op op, T init, Read read) const
{ {
...@@ -396,25 +423,6 @@ struct block_large ...@@ -396,25 +423,6 @@ struct block_large
index idx; index idx;
Slicer slice; Slicer slice;
template <class Size, class F>
struct inner_storage : inner_storage_tag
{
using type = remove_reference_t<decltype(declval<F>()(0, _c<0>))>;
F f;
constexpr Size rsize() const { return {}; }
template <class U, class V>
constexpr auto operator()(U j, V d) const
{
return f(j, d);
}
};
template <class Size, class F>
constexpr inner_storage<Size, F> make_inner_storage(Size, F f)
{
return {f};
}
template <class Op, class T, class Read, class N, class... Ts> template <class Op, class T, class Read, class N, class... Ts>
__device__ auto reduce_impl(Op op, T init, Read read, N n, Ts&&... xs) const __device__ auto reduce_impl(Op op, T init, Read read, N n, Ts&&... xs) const
{ {
...@@ -439,7 +447,7 @@ struct block_large ...@@ -439,7 +447,7 @@ struct block_large
template <class R, class F, class N, class... Ts> template <class R, class F, class N, class... Ts>
__device__ auto inner_impl(F f, N n, Ts&&... xs) const __device__ auto inner_impl(F f, N n, Ts&&... xs) const
{ {
return make_inner_storage(n, [=](auto j, auto d) { return f(xs(j, d)...); }); return make_lazy_inner_storage(n, [=](auto j, auto d) { return f(xs(j, d)...); });
} }
}; };
...@@ -469,25 +477,6 @@ struct lane ...@@ -469,25 +477,6 @@ struct lane
index idx; index idx;
Slicer slice; Slicer slice;
template <class Size, class F>
struct inner_storage : inner_storage_tag
{
using type = remove_reference_t<decltype(declval<F>()(0, _c<0>))>;
F f;
constexpr Size rsize() const { return {}; }
template <class U, class V>
constexpr auto operator()(U j, V d) const
{
return f(j, d);
}
};
template <class Size, class F>
constexpr inner_storage<Size, F> make_inner_storage(Size, F f)
{
return {f};
}
template <class Op, class T, class Read, class N, class U, class... Us> template <class Op, class T, class Read, class N, class U, class... Us>
__device__ auto reduce_impl(Op op, T init, Read read, N n, U&& x, Us&&... xs) const __device__ auto reduce_impl(Op op, T init, Read read, N n, U&& x, Us&&... xs) const
{ {
...@@ -518,7 +507,7 @@ struct lane ...@@ -518,7 +507,7 @@ struct lane
template <class R, class F, class N, class... Ts> template <class R, class F, class N, class... Ts>
__device__ auto inner_impl(F f, N n, Ts&&... xs) const __device__ auto inner_impl(F f, N n, Ts&&... xs) const
{ {
return make_inner_storage(n, [=](auto j, auto d) { return f(xs(j, d)...); }); return make_lazy_inner_storage(n, [=](auto j, auto d) { return f(xs(j, d)...); });
} }
}; };
template <class Slicer> template <class Slicer>
...@@ -577,5 +566,21 @@ simple_reduce(Op op, T init, Input input, Output output, ReadInput read, WriteOu ...@@ -577,5 +566,21 @@ simple_reduce(Op op, T init, Input input, Output output, ReadInput read, WriteOu
}); });
} }
// Kernel entry point for fused reductions. Algo selects the reduction
// strategy (e.g. reduce::block / reduce::lane), Reduced is the shape type
// describing the reduced output, and f is the fused body (built with
// partial(), so it may return either a scalar or an inner storage).
template <class Algo, class Reduced, class Output, class F>
__device__ void fused_reduce(Output output, F f)
{
    Algo::template run<Reduced>([&](auto out_idx, auto r) {
        auto result = f(r);
        if constexpr(reduce::is_inner_storage<decltype(result)>{})
        {
            // Result still has per-element (inner) extent: write it out
            // element-wise through the reducer.
            r.inner([&](auto& y, auto x) { y = x; })(output, result);
        }
        else
        {
            // Result is a single reduced value: one write per output index.
            r.outer([&] { output[out_idx] = implicit_conversion(result); });
        }
    });
}
} // namespace migraphx } // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_REDUCE_HPP #endif // MIGRAPHX_GUARD_KERNELS_REDUCE_HPP
...@@ -361,29 +361,16 @@ struct miopen_apply ...@@ -361,29 +361,16 @@ struct miopen_apply
} }
/** /**
* Turns on use_local_alloc in the select_module submodules. * Adds dynamic allocation for submodule output parameter.
* Changes the submodule returns to a hip::sync_stream.
*/ */
void add_select_module_op() void add_select_module_op()
{ {
apply_map.emplace("select_module", [=](instruction_ref ins) { apply_map.emplace("select_module", [=](instruction_ref ins) {
auto s = ins->get_shape();
auto output = insert_allocation(ins, s);
std::vector<instruction_ref> inputs = ins->inputs(); std::vector<instruction_ref> inputs = ins->inputs();
auto mod_args = ins->module_inputs(); inputs.push_back(output);
for(auto* smod : mod_args) return mod->replace_instruction(ins, ins->get_operator(), inputs, ins->module_inputs());
{
smod->use_local_alloc = true;
auto last_ins = std::prev(smod->end());
if(last_ins->name() == "@return")
{
for(auto out_ins : last_ins->inputs())
{
auto sync_out = smod->insert_instruction(
last_ins, make_op("hip::sync_stream"), out_ins);
smod->replace_return({sync_out});
}
}
}
return ins;
}); });
} }
}; };
......
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
#include <mlir-c/BuiltinTypes.h> #include <mlir-c/BuiltinTypes.h>
#include <mlir-c/Diagnostics.h> #include <mlir-c/Diagnostics.h>
#include <mlir-c/Dialect/MIGraphX.h> #include <mlir-c/Dialect/MIGraphX.h>
#include <mlir-c/Dialect/Rock.h>
#include <mlir-c/IntegerSet.h> #include <mlir-c/IntegerSet.h>
#include <mlir-c/Pass.h> #include <mlir-c/Pass.h>
#include <mutex> #include <mutex>
...@@ -55,12 +56,16 @@ ...@@ -55,12 +56,16 @@
#include <migraphx/permutation.hpp> #include <migraphx/permutation.hpp>
#include <deque> #include <deque>
#include <variant> #include <variant>
#include <fstream>
#include <sstream>
namespace migraphx { namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS { inline namespace MIGRAPHX_INLINE_NS {
namespace gpu { namespace gpu {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_MLIR); MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_MLIR);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_MLIR_TUNING_DB);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_MLIR_TUNING_CFG);
#ifdef MIGRAPHX_MLIR #ifdef MIGRAPHX_MLIR
template <class T, class F, F f> // NOLINT template <class T, class F, F f> // NOLINT
...@@ -124,6 +129,8 @@ using mlir_op_printing_flags = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirOpPrintingFlags, ...@@ -124,6 +129,8 @@ using mlir_op_printing_flags = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirOpPrintingFlags,
using mlir_region = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirRegion, mlirRegionDestroy); using mlir_region = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirRegion, mlirRegionDestroy);
using mlir_block = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirBlock, mlirBlockDestroy); using mlir_block = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirBlock, mlirBlockDestroy);
using mlir_pass_manager = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirPassManager, mlirPassManagerDestroy); using mlir_pass_manager = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirPassManager, mlirPassManagerDestroy);
using mlir_tuning_table = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirRockTuningTable,
mlirRockTuningTableDestroy);
std::string_view to_string_view(MlirStringRef s) { return {s.data, s.length}; } std::string_view to_string_view(MlirStringRef s) { return {s.data, s.length}; }
...@@ -455,7 +462,7 @@ struct mlir_program ...@@ -455,7 +462,7 @@ struct mlir_program
auto ops = create_operation_state("func.func"); auto ops = create_operation_state("func.func");
ops.add_attributes({{"function_type", make_function_type(inputs, outputs)}, ops.add_attributes({{"function_type", make_function_type(inputs, outputs)},
{"sym_name", std::string("main")}, {"sym_name", sym_name},
{"kernel", std::string("mixr")}, {"kernel", std::string("mixr")},
{"arch", target_arch}}); {"arch", target_arch}});
ops.add_region(std::move(region)); ops.add_region(std::move(region));
...@@ -498,11 +505,25 @@ struct mlir_program ...@@ -498,11 +505,25 @@ struct mlir_program
return ins->get_shape(); return ins->get_shape();
} }
// Derive the exported kernel symbol name from the module contents:
// "mlir_convolution" / "mlir_dot" when such an op is present, otherwise the
// generic fallback "main".
static std::string get_symbol_name(const module& m)
{
    for(auto ins : iterator_for(m))
    {
        const auto& op = ins->name();
        if(op == "convolution" or op == "dot")
            return "mlir_" + op;
    }
    return "main";
}
void parse(const module& m) void parse(const module& m)
{ {
sym_name = get_symbol_name(m);
auto mbody = mlirModuleGetBody(mmodule.get()); auto mbody = mlirModuleGetBody(mmodule.get());
std::unordered_map<instruction_ref, MlirValue> ins_map; std::unordered_map<instruction_ref, MlirValue> ins_map;
auto fbody = insert(mbody, m, ins_map); auto fbody = insert(mbody, m, ins_map);
for(auto ins : iterator_for(m)) for(auto ins : iterator_for(m))
{ {
if(ins->name() == "@param") if(ins->name() == "@param")
...@@ -512,16 +533,13 @@ struct mlir_program ...@@ -512,16 +533,13 @@ struct mlir_program
ops.add_attribute_value(get_operator_value(ins->get_operator())); ops.add_attribute_value(get_operator_value(ins->get_operator()));
if(ins->name() != "@return") if(ins->name() != "@return")
ops.add_results({get_shape(ins)}); ops.add_results({get_shape(ins)});
if(ins->name() == "convolution") if(ins->name() == "convolution" or ins->name() == "dot")
{ {
pp = pp =
problem_params{ins->get_operator(), to_shapes(ins->inputs()), ins->get_shape()}; problem_params{ins->get_operator(), to_shapes(ins->inputs()), ins->get_shape()};
// check if HW supports xdlops // check if HW supports xdlops
auto target_chip = trim(split_string(target_arch, ':').front()); auto target_chip = trim(split_string(target_arch, ':').front());
bool xdlops = contains(get_xdlops_archs(), target_chip); bool xdlops = contains(get_xdlops_archs(), target_chip);
std::string tuned = get_tune_params(xdlops);
if(not tuned.empty())
ops.add_attributes({{"perf_config", tuned}});
if(xdlops) if(xdlops)
ops.add_attributes({{"xdlopsV2", true}}); ops.add_attributes({{"xdlopsV2", true}});
} }
...@@ -542,15 +560,19 @@ struct mlir_program ...@@ -542,15 +560,19 @@ struct mlir_program
code_object_op compile() MIGRAPHX_TIDY_CONST code_object_op compile() MIGRAPHX_TIDY_CONST
{ {
mlir_pass_manager pm{mlirPassManagerCreate(ctx.get())}; mlir_pass_manager pm_front{mlirPassManagerCreate(ctx.get())};
mlir_pass_manager pm_back{mlirPassManagerCreate(ctx.get())};
// 1st pipeline to call // 1st pipeline to call
mlirMIGraphXAddHighLevelPipeline(pm.get()); mlirMIGraphXAddHighLevelPipeline(pm_front.get());
mlirPassManagerRun(pm_front.get(), mmodule.get());
// 2nd pipeline to call // 2nd pipeline to call
mlirMIGraphXAddBackendPipeline(pm.get(), target_arch.c_str()); get_module_tuned();
mlirPassManagerRun(pm.get(), mmodule.get()); mlirMIGraphXAddBackendPipeline(pm_back.get(), target_arch.c_str());
mlirPassManagerRun(pm_back.get(), mmodule.get());
code_object_op op{}; code_object_op op{};
op.symbol_name = "main"; op.symbol_name = sym_name;
op.code_object = get_binary(); op.code_object = get_binary();
std::tie(op.global, op.local) = get_launch_params(); std::tie(op.global, op.local) = get_launch_params();
return op; return op;
...@@ -578,7 +600,74 @@ struct mlir_program ...@@ -578,7 +600,74 @@ struct mlir_program
MIGRAPHX_THROW("Failed to compile mlir program"); MIGRAPHX_THROW("Failed to compile mlir program");
} }
std::string get_tune_params(bool xdlops) { return get_mlir_perf_for_conv(pp, xdlops); } std::string get_tune_params(bool xdlops) const { return get_mlir_perf_for_conv(pp, xdlops); }
// Appends this problem's tuning key to a cfg file (split into .conv / .gemm
// variants) that can later be fed to rocMLIR's tuning scripts.
// No-op unless MIGRAPHX_MLIR_TUNING_CFG is set.
void dump_tuning_cfg(const char* prob_config) const
{
    std::string tuning_cfg_path = string_value_of(MIGRAPHX_MLIR_TUNING_CFG{});
    if(prob_config == nullptr or tuning_cfg_path.empty())
        return;
    std::vector<std::string> tokens = split_string(prob_config, '\t');
    // Expected key format: "<arch>\t<problem>"; skip malformed keys instead
    // of indexing out of bounds.
    if(tokens.size() < 2)
        return;
    const std::string& prob = tokens[1];
    if(starts_with(prob, "conv"))
        tuning_cfg_path += ".conv";
    else
        tuning_cfg_path += ".gemm";
    std::ofstream tuning_cfg(tuning_cfg_path, std::ios::app);
    tuning_cfg << prob << std::endl;
}
// Builds the process-wide tuning table, seeding it from the TSV database
// pointed to by MIGRAPHX_MLIR_TUNING_DB (rows: "<arch>\t<problem>\t<perf>").
// Warns (once, at first use) when no database is configured.
static mlir_tuning_table create_tuning_table()
{
    mlir_tuning_table tuning_table{mlirRockTuningTableCreate()};
    std::string tuning_db_path = string_value_of(MIGRAPHX_MLIR_TUNING_DB{});
    if(tuning_db_path.empty())
    {
        std::cerr
            << "WARNING: MLIR tuning db not found. Please set MIGRAPHX_MLIR_TUNING_DB for "
               "optimal performance."
            << std::endl;
        return tuning_table;
    }
    std::ifstream tuning_db_tsv(tuning_db_path);
    if(tuning_db_tsv)
    {
        std::string line;
        while(std::getline(tuning_db_tsv, line))
        {
            std::vector<std::string> tokens = split_string(line, '\t');
            // Skip malformed rows rather than indexing out of bounds
            if(tokens.size() < 3)
                continue;
            // Table key is "<arch>\t<problem>"
            std::string key = tokens[0] + "\t" + tokens[1];
            mlirRockTuningUpdateTable(tuning_table.get(), key.c_str(), tokens[2].c_str(), 1.0);
        }
    }
    return tuning_table;
}
// Applies stored tuning parameters to the module from the lazily-built
// tuning table. On a miss, logs the problem key and appends it to the
// tuning cfg for offline tuning. Returns whether tuning was applied.
bool get_module_tuned() const
{
    static mlir_tuning_table tuning_table = create_tuning_table();
    if(mlirRockTuningSetFromTable(tuning_table.get(), mmodule.get()))
        return true;
    const char* prob_config = mlirRockTuningGetKey(tuning_table.get(), mmodule.get());
    // Original message ran "on" into the key; also removed an unused
    // std::stringstream local that was constructed and never read.
    std::cerr << "fails to set param on " << prob_config << std::endl;
    dump_tuning_cfg(prob_config);
    return false;
}
mlir_context ctx; mlir_context ctx;
MlirLocation location; MlirLocation location;
...@@ -586,6 +675,7 @@ struct mlir_program ...@@ -586,6 +675,7 @@ struct mlir_program
problem_params pp; problem_params pp;
std::deque<std::string> strings{}; std::deque<std::string> strings{};
std::string target_arch; std::string target_arch;
std::string sym_name;
}; };
std::string dump_mlir(const module& m) std::string dump_mlir(const module& m)
......
...@@ -26,13 +26,13 @@ ...@@ -26,13 +26,13 @@
#include <migraphx/check_context.hpp> #include <migraphx/check_context.hpp>
#include <migraphx/dead_code_elimination.hpp> #include <migraphx/dead_code_elimination.hpp>
#include <migraphx/eliminate_allocation.hpp> #include <migraphx/eliminate_allocation.hpp>
#include <migraphx/eliminate_common_subexpression.hpp>
#include <migraphx/eliminate_concat.hpp> #include <migraphx/eliminate_concat.hpp>
#include <migraphx/eliminate_contiguous.hpp> #include <migraphx/eliminate_contiguous.hpp>
#include <migraphx/eliminate_data_type.hpp> #include <migraphx/eliminate_data_type.hpp>
#include <migraphx/eliminate_identity.hpp> #include <migraphx/eliminate_identity.hpp>
#include <migraphx/eliminate_pad.hpp> #include <migraphx/eliminate_pad.hpp>
#include <migraphx/fuse_pointwise.hpp> #include <migraphx/fuse_pointwise.hpp>
#include <migraphx/fuse_reduce.hpp>
#include <migraphx/inline_module.hpp> #include <migraphx/inline_module.hpp>
#include <migraphx/insert_pad.hpp> #include <migraphx/insert_pad.hpp>
#include <migraphx/layout_nhwc.hpp> #include <migraphx/layout_nhwc.hpp>
...@@ -40,7 +40,7 @@ ...@@ -40,7 +40,7 @@
#include <migraphx/normalize_ops.hpp> #include <migraphx/normalize_ops.hpp>
#include <migraphx/optimize_module.hpp> #include <migraphx/optimize_module.hpp>
#include <migraphx/preallocate_param.hpp> #include <migraphx/preallocate_param.hpp>
#include <migraphx/propagate_constant.hpp> #include <migraphx/promote_literals.hpp>
#include <migraphx/register_target.hpp> #include <migraphx/register_target.hpp>
#include <migraphx/replace_allocate.hpp> #include <migraphx/replace_allocate.hpp>
#include <migraphx/rewrite_gelu.hpp> #include <migraphx/rewrite_gelu.hpp>
...@@ -48,9 +48,9 @@ ...@@ -48,9 +48,9 @@
#include <migraphx/rewrite_quantization.hpp> #include <migraphx/rewrite_quantization.hpp>
#include <migraphx/rewrite_rnn.hpp> #include <migraphx/rewrite_rnn.hpp>
#include <migraphx/schedule.hpp> #include <migraphx/schedule.hpp>
#include <migraphx/simplify_algebra.hpp>
#include <migraphx/simplify_qdq.hpp> #include <migraphx/simplify_qdq.hpp>
#include <migraphx/simplify_reshapes.hpp> #include <migraphx/simplify_reshapes.hpp>
#include <migraphx/split_single_dyn_dim.hpp>
#include <migraphx/gpu/allocation_model.hpp> #include <migraphx/gpu/allocation_model.hpp>
#include <migraphx/gpu/compile_miopen.hpp> #include <migraphx/gpu/compile_miopen.hpp>
#include <migraphx/gpu/compile_ops.hpp> #include <migraphx/gpu/compile_ops.hpp>
...@@ -73,8 +73,8 @@ namespace gpu { ...@@ -73,8 +73,8 @@ namespace gpu {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_SCHEDULE_PASS) MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_SCHEDULE_PASS)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_POINTWISE_FUSION) MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_POINTWISE_FUSION)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_REDUCE_FUSION)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_NHWC) MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_NHWC)
struct id_pass struct id_pass
{ {
std::string name() const { return "id"; } std::string name() const { return "id"; }
...@@ -102,6 +102,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti ...@@ -102,6 +102,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
// clang-format off // clang-format off
return return
{ {
enable_pass(options.split_single_dyn_dim, split_single_dyn_dim{}),
enable_pass(options.split_single_dyn_dim, dead_code_elimination{}),
normalize_ops{}, normalize_ops{},
dead_code_elimination{}, dead_code_elimination{},
simplify_qdq{}, simplify_qdq{},
...@@ -129,6 +131,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti ...@@ -129,6 +131,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
optimize_module{}, optimize_module{},
enable_pass(not enabled(MIGRAPHX_DISABLE_POINTWISE_FUSION{}), fuse_pointwise{}), enable_pass(not enabled(MIGRAPHX_DISABLE_POINTWISE_FUSION{}), fuse_pointwise{}),
dead_code_elimination{}, dead_code_elimination{},
enable_pass(not enabled(MIGRAPHX_DISABLE_REDUCE_FUSION{}), fuse_reduce{}),
dead_code_elimination{},
fuse_mlir{&ctx}, fuse_mlir{&ctx},
dead_code_elimination{}, dead_code_elimination{},
lowering{&ctx, options.offload_copy}, lowering{&ctx, options.offload_copy},
...@@ -148,6 +152,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti ...@@ -148,6 +152,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
dead_code_elimination{}, dead_code_elimination{},
compile_ops{&ctx}, compile_ops{&ctx},
dead_code_elimination{}, dead_code_elimination{},
promote_literals{},
dead_code_elimination{},
write_literals{&ctx}, write_literals{&ctx},
schedule{gpu::schedule_model{ctx.get_current_device().nstreams()}, not enabled(MIGRAPHX_DISABLE_SCHEDULE_PASS{})}, schedule{gpu::schedule_model{ctx.get_current_device().nstreams()}, not enabled(MIGRAPHX_DISABLE_SCHEDULE_PASS{})},
memory_coloring{"hip::allocate"}, memory_coloring{"hip::allocate"},
......
...@@ -31,10 +31,9 @@ set_target_properties(migraphx_ref PROPERTIES EXPORT_NAME ref) ...@@ -31,10 +31,9 @@ set_target_properties(migraphx_ref PROPERTIES EXPORT_NAME ref)
rocm_set_soversion(migraphx_ref ${MIGRAPHX_SO_VERSION}) rocm_set_soversion(migraphx_ref ${MIGRAPHX_SO_VERSION})
find_path(BLAZE_INCLUDE blaze/Blaze.h) find_path(BLAZE_INCLUDE blaze/Blaze.h)
find_package(Threads)
rocm_clang_tidy_check(migraphx_ref) rocm_clang_tidy_check(migraphx_ref)
target_link_libraries(migraphx_ref migraphx Threads::Threads) target_link_libraries(migraphx_ref PUBLIC migraphx)
target_include_directories(migraphx_ref PRIVATE ${BLAZE_INCLUDE}) target_include_directories(migraphx_ref PRIVATE ${BLAZE_INCLUDE})
target_compile_definitions(migraphx_ref PRIVATE -DBLAZE_USE_CPP_THREADS) target_compile_definitions(migraphx_ref PRIVATE -DBLAZE_USE_CPP_THREADS)
......
...@@ -46,8 +46,6 @@ struct target ...@@ -46,8 +46,6 @@ struct target
argument allocate(const shape& s) const; argument allocate(const shape& s) const;
}; };
MIGRAPHX_REGISTER_TARGET(target);
} // namespace ref } // namespace ref
} // namespace MIGRAPHX_INLINE_NS } // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx } // namespace migraphx
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment