"model/models/vscode:/vscode.git/clone" did not exist on "63a394068c44149252b061a1aded05b0530868a8"
Unverified commit 394a7495, authored by Paul Fultz II and committed by GitHub

Merge branch 'master' into renameGemmToDot

parents a3057ebf 30371001
@@ -14,9 +14,9 @@ shape hip_add::compute_shape(const std::vector<shape>& inputs) const
     return inputs.at(0);
 }
-argument hip_add::compute(context&, const shape&, const std::vector<argument>& args) const
+argument hip_add::compute(context& ctx, const shape&, const std::vector<argument>& args) const
 {
-    device::add(args[2], args[0], args[1]);
+    device::add(ctx.get_stream().get(), args[2], args[0], args[1]);
     return args[2];
 }
@@ -34,7 +34,7 @@ argument miopen_add::compute(context& ctx,
     auto a_desc = make_tensor(args[0].get_shape());
     auto b_desc = make_tensor(args[1].get_shape());
     auto c_desc = make_tensor(output_shape);
-    miopenOpTensor(ctx.handle.get(),
+    miopenOpTensor(ctx.get_stream().get_miopen(),
                    miopenTensorOpAdd,
                    &alpha,
                    a_desc.get(),
......
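The pattern repeated across the operator files in this merge: compute() now takes the context by named reference instead of an unnamed context&, and threads either the raw HIP stream or the stream's MIOpen handle into every device call rather than reading ctx.handle directly. A minimal sketch of an elementwise operator written against the new convention (hip_mul and device::mul are hypothetical names used only to illustrate the shape of the change, not part of this commit):

    // Hedged sketch: an elementwise op forwarding the context's stream to its
    // device kernel, mirroring hip_add::compute in the hunk above.
    argument hip_mul::compute(context& ctx, const shape&, const std::vector<argument>& args) const
    {
        // As in the ops in this diff, the last argument is the preallocated output buffer.
        device::mul(ctx.get_stream().get(), args[2], args[0], args[1]);
        return args[2];
    }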
@@ -23,7 +23,7 @@ argument miopen_batch_norm_inference::compute(context& ctx,
     float alpha = 1.0, beta = 0.0f;
-    miopenBatchNormalizationForwardInference(ctx.handle.get(),
+    miopenBatchNormalizationForwardInference(ctx.get_stream().get_miopen(),
                                              miopenBatchNormMode_t(op.bn_mode),
                                              &alpha,
                                              &beta,
......
@@ -14,11 +14,12 @@ shape hip_concat::compute_shape(std::vector<shape> inputs) const
     return op.compute_shape(inputs);
 }
-argument
-hip_concat::compute(context&, const shape& output_shape, const std::vector<argument>& args) const
+argument hip_concat::compute(context& ctx,
+                             const shape& output_shape,
+                             const std::vector<argument>& args) const
 {
     std::vector<std::size_t> offsets = op.compute_offsets(output_shape, args);
-    return device::concat(output_shape, args, offsets);
+    return device::concat(ctx.get_stream().get(), output_shape, args, offsets);
 }
 } // namespace gpu
......
@@ -12,13 +12,14 @@ shape miopen_contiguous::compute_shape(const std::vector<shape>& inputs) const
     check_shapes{inputs, *this}.has(2);
     return op.compute_shape({inputs.at(0)});
 }
-argument
-miopen_contiguous::compute(context&, shape output_shape, const std::vector<argument>& args) const
+argument miopen_contiguous::compute(context& ctx,
+                                    shape output_shape,
+                                    const std::vector<argument>& args) const
 {
     assert(output_shape == args[1].get_shape());
     assert(output_shape.standard());
     (void)output_shape;
-    device::contiguous(args.at(1), args.at(0));
+    device::contiguous(ctx.get_stream().get(), args.at(1), args.at(0));
     return args.at(1);
 }
......
@@ -21,7 +21,7 @@ argument miopen_convolution::compute(context& ctx,
     auto y_desc = make_tensor(output_shape);
     float alpha = 1, beta = 0;
-    miopenConvolutionForward(ctx.handle.get(),
+    miopenConvolutionForward(ctx.get_stream().get_miopen(),
                              &alpha,
                              x_desc.get(),
                              args[0].implicit(),
@@ -47,18 +47,22 @@ shape miopen_convolution::compile(context& ctx,
     auto y_desc = make_tensor(output_shape);
     std::size_t workspace_size = 0;
-    miopenConvolutionForwardGetWorkSpaceSize(
-        ctx.handle.get(), w_desc.get(), x_desc.get(), cd.get(), y_desc.get(), &workspace_size);
+    miopenConvolutionForwardGetWorkSpaceSize(ctx.get_stream().get_miopen(),
+                                             w_desc.get(),
+                                             x_desc.get(),
+                                             cd.get(),
+                                             y_desc.get(),
+                                             &workspace_size);
     workspace_shape = shape{shape::int8_type, {workspace_size}};
     auto x = to_gpu(generate_argument(inputs[0]->get_shape()));
     auto w = to_gpu(generate_argument(inputs[1]->get_shape()));
-    auto y = to_gpu(generate_argument(output_shape));
+    auto y = allocate_gpu(output_shape);
     auto workspace = allocate_gpu(workspace_shape);
     int algo_count = 1;
     miopenConvAlgoPerf_t perf;
-    miopenFindConvolutionForwardAlgorithm(ctx.handle.get(),
+    miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(),
                                           x_desc.get(),
                                           x.implicit(),
                                           w_desc.get(),
......
@@ -5,14 +5,18 @@ namespace migraph {
 namespace gpu {
 namespace device {
-void add(const argument& result, const argument& arg1, const argument& arg2)
+void add(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2)
 {
-    nary(result, arg1, arg2)([](auto x, auto y) { return x + y; });
+    nary(stream, result, arg1, arg2)([](auto x, auto y) { return x + y; });
 }
-void add(const argument& result, const argument& arg1, const argument& arg2, const argument& arg3)
+void add(hipStream_t stream,
+         const argument& result,
+         const argument& arg1,
+         const argument& arg2,
+         const argument& arg3)
 {
-    nary(result, arg1, arg2, arg3)([](auto x, auto y, auto z) { return x + y + z; });
+    nary(stream, result, arg1, arg2, arg3)([](auto x, auto y, auto z) { return x + y + z; });
 }
 } // namespace device
......
@@ -5,17 +5,22 @@ namespace migraph {
 namespace gpu {
 namespace device {
-void add_relu(const argument& result, const argument& arg1, const argument& arg2)
+void add_relu(hipStream_t stream,
+              const argument& result,
+              const argument& arg1,
+              const argument& arg2)
 {
-    nary(result, arg1, arg2)([](auto x, auto y) { return std::max<decltype(x + y)>(0, x + y); });
+    nary(stream, result, arg1, arg2)(
+        [](auto x, auto y) { return std::max<decltype(x + y)>(0, x + y); });
 }
-void add_relu(const argument& result,
+void add_relu(hipStream_t stream,
+              const argument& result,
               const argument& arg1,
               const argument& arg2,
               const argument& arg3)
 {
-    nary(result, arg1, arg2, arg3)(
+    nary(stream, result, arg1, arg2, arg3)(
         [](auto x, auto y, auto z) { return std::max<decltype(x + y + z)>(0, x + y + z); });
 }
......
@@ -8,11 +8,11 @@ namespace migraph {
 namespace gpu {
 namespace device {
-argument concat(const migraph::shape& output_shape,
+argument concat(hipStream_t stream,
+                const migraph::shape& output_shape,
                 std::vector<migraph::argument> args,
                 std::vector<std::size_t> offsets)
 {
-    // migraph::argument& result = args.back();
     for(std::size_t l = 0; l < args.size() - 1; l++)
     {
         auto argl = args[l];
@@ -23,12 +23,11 @@ argument concat(const migraph::shape& output_shape,
                 const auto* inptr = input.data();
                 hip_tensor_descriptor<ndim> desc_input(input.get_shape());
                 hip_tensor_descriptor<ndim> desc_output(output.get_shape());
-                gs_launch(nelements)(
+                gs_launch(stream, nelements)(
                     [=](auto i) { outptr[desc_output.linear(desc_input.multi(i))] = inptr[i]; });
             });
         });
     }
-    // return result;
     return args.back();
 }
......
@@ -6,9 +6,9 @@ namespace migraph {
 namespace gpu {
 namespace device {
-void contiguous(argument result, argument arg)
+void contiguous(hipStream_t stream, argument result, argument arg)
 {
-    nary_nonstandard(std::move(result), std::move(arg))([](auto x) { return x; });
+    nary_nonstandard(stream, std::move(result), std::move(arg))([](auto x) { return x; });
 }
 } // namespace device
......
@@ -21,7 +21,7 @@ __global__ void launcher(F f)
     f(idx);
 }
-inline auto launch(std::size_t global, std::size_t local)
+inline auto launch(hipStream_t stream, std::size_t global, std::size_t local)
 {
     return [=](auto f) {
         assert(local > 0);
@@ -29,17 +29,17 @@ inline auto launch(std::size_t global, std::size_t local)
         using f_type = decltype(f);
         dim3 nblocks(global / local);
         dim3 nthreads(local);
-        hipLaunchKernelGGL((launcher<f_type>), nblocks, nthreads, 0, nullptr, f);
+        hipLaunchKernelGGL((launcher<f_type>), nblocks, nthreads, 0, stream, f);
     };
 }
-inline auto gs_launch(std::size_t n, std::size_t local = 1024)
+inline auto gs_launch(hipStream_t stream, std::size_t n, std::size_t local = 1024)
 {
     std::size_t groups  = 1 + n / local;
     std::size_t nglobal = std::min<std::size_t>(256, groups) * local;
     return [=](auto f) {
-        launch(nglobal, local)([=](auto idx) {
+        launch(stream, nglobal, local)([=](auto idx) {
             for(size_t i = idx.global; i < n; i += nglobal)
             {
                 f(i);
......
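Because launch and gs_launch now take the stream as their first parameter, every kernel launch in the backend is explicitly bound to the stream it should run on instead of the default null stream. A minimal sketch of a caller, assuming a float buffer already resident on the GPU (fill_ones, data, and n are illustrative names, not part of this commit):

    // Hedged sketch: enqueue a grid-stride fill on a specific HIP stream using
    // the gs_launch helper with its new (stream, n, local) signature.
    void fill_ones(hipStream_t stream, float* data, std::size_t n)
    {
        gs_launch(stream, n)([=](auto i) { data[i] = 1.0f; });
    }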
@@ -32,7 +32,7 @@ auto pack_vec4(Ts... xs)
 }
 template <class F, class... Arguments>
-auto nary_nonstandard_impl(F f, argument result, Arguments... args)
+auto nary_nonstandard_impl(hipStream_t stream, F f, argument result, Arguments... args)
 {
     const auto& output_shape = result.get_shape();
     visit_all(result, args...)([&](auto output, auto... inputs) {
@@ -41,7 +41,7 @@ auto nary_nonstandard_impl(F f, argument result, Arguments... args)
             std::make_pair(hip_tensor_descriptor<ndim>{inputs.get_shape()}, inputs.data())...);
         hip_tensor_descriptor<ndim> out_desc(output_shape);
         auto* outp = output.data();
-        gs_launch(output_shape.elements())([=](auto i) {
+        gs_launch(stream, output_shape.elements())([=](auto i) {
             data([&](auto&&... ps) {
                 auto outidx = out_desc.multi(i);
                 outp[i]     = f(ps.second[ps.first.linear(outidx)]...);
@@ -52,8 +52,12 @@ auto nary_nonstandard_impl(F f, argument result, Arguments... args)
 }
 template <class F>
-void trinary_broadcast_vec_impl(
-    F f, const argument& result, const argument& arg1, const argument& arg2, const argument& arg3)
+void trinary_broadcast_vec_impl(hipStream_t stream,
+                                F f,
+                                const argument& result,
+                                const argument& arg1,
+                                const argument& arg2,
+                                const argument& arg3)
 {
     const auto& output_shape = result.get_shape();
     const auto& b_shape      = arg3.get_shape();
@@ -79,7 +83,7 @@ void trinary_broadcast_vec_impl(
         const std::size_t n            = output.size() / vec_size;
         const std::size_t bdim_vec_len = bdim_len / vec_size;
-        launch(nglobal, nlocal)([=](auto idx) __device__ {
+        launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
             MIGRAPH_DEVICE_SHARED vec4<type> buffer[2048 / vec_size];
             // Load bias into LDS
             for(size_t i = idx.local; i < bdim_vec_len; i += nlocal)
@@ -107,8 +111,12 @@ void trinary_broadcast_vec_impl(
 }
 template <class F>
-void trinary_broadcast_impl(
-    F f, const argument& result, const argument& arg1, const argument& arg2, const argument& arg3)
+void trinary_broadcast_impl(hipStream_t stream,
+                            F f,
+                            const argument& result,
+                            const argument& arg1,
+                            const argument& arg2,
+                            const argument& arg3)
 {
     const auto& output_shape = result.get_shape();
     const auto& b_shape      = arg3.get_shape();
@@ -132,7 +140,7 @@ void trinary_broadcast_impl(
         const std::size_t nglobal = 256 * nlocal;
         const std::size_t n       = output.size();
-        launch(nglobal, nlocal)([=](auto idx) __device__ {
+        launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
             MIGRAPH_DEVICE_SHARED type buffer[2048];
             // Load bias into LDS
             for(size_t i = idx.local; i < bdim_len; i += nlocal)
@@ -154,10 +162,8 @@ void trinary_broadcast_impl(
 }
 template <class F>
-void binary_broadcast_vec_impl(F f,
-                               const argument& result,
-                               const argument& arg1,
-                               const argument& arg2)
+void binary_broadcast_vec_impl(
+    hipStream_t stream, F f, const argument& result, const argument& arg1, const argument& arg2)
 {
     const auto& output_shape = result.get_shape();
     const auto& b_shape      = arg2.get_shape();
@@ -182,7 +188,7 @@ void binary_broadcast_vec_impl(F f,
         const std::size_t n            = output.size() / vec_size;
         const std::size_t bdim_vec_len = bdim_len / vec_size;
-        launch(nglobal, nlocal)([=](auto idx) __device__ {
+        launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
             MIGRAPH_DEVICE_SHARED vec4<type> buffer[2048 / vec_size];
             // Load bias into LDS
             for(size_t i = idx.local; i < bdim_vec_len; i += nlocal)
@@ -209,7 +215,8 @@ void binary_broadcast_vec_impl(F f,
 }
 template <class F>
-void binary_broadcast_impl(F f, const argument& result, const argument& arg1, const argument& arg2)
+void binary_broadcast_impl(
+    hipStream_t stream, F f, const argument& result, const argument& arg1, const argument& arg2)
 {
     const auto& output_shape = result.get_shape();
     const auto& b_shape      = arg2.get_shape();
@@ -232,7 +239,7 @@ void binary_broadcast_impl(F f, const argument& result, const argument& arg1, co
         const std::size_t nglobal = 256 * nlocal;
         const std::size_t n       = output.size();
-        launch(nglobal, nlocal)([=](auto idx) __device__ {
+        launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
             MIGRAPH_DEVICE_SHARED type buffer[2048];
             // Load bias into LDS
             for(size_t i = idx.local; i < bdim_len; i += nlocal)
@@ -253,7 +260,7 @@ void binary_broadcast_impl(F f, const argument& result, const argument& arg1, co
 }
 template <class F, class... Arguments>
-void nary_standard_vec_impl(F f, argument result, Arguments... args)
+void nary_standard_vec_impl(hipStream_t stream, F f, argument result, Arguments... args)
 {
     // assert(x.get_shape().elements() == y.get_shape().elements());
     const auto& output_shape = result.get_shape();
@@ -262,7 +269,7 @@ void nary_standard_vec_impl(F f, argument result, Arguments... args)
         const std::size_t vec_size = 4;
         auto data                  = pack_vec4(inputs.data()...);
         auto* outp                 = as_vec4(output.data());
-        gs_launch(output_shape.elements() / vec_size)([=](auto i) {
+        gs_launch(stream, output_shape.elements() / vec_size)([=](auto i) {
            vec4<type> out = outp[i];
            data(
                [&](auto... xs) {
@@ -278,50 +285,51 @@ void nary_standard_vec_impl(F f, argument result, Arguments... args)
 }
 template <class F, class... Arguments>
-void nary_standard_impl(F f, argument result, Arguments... args)
+void nary_standard_impl(hipStream_t stream, F f, argument result, Arguments... args)
 {
     // assert(x.get_shape().elements() == y.get_shape().elements());
     const auto& output_shape = result.get_shape();
     visit_all(result, args...)([&](auto output, auto... inputs) {
         auto data  = pack(inputs.data()...);
         auto* outp = output.data();
-        gs_launch(output_shape.elements())(
+        gs_launch(stream, output_shape.elements())(
             [=](auto i) { data([&](auto... xps) { outp[i] = f(xps[i]...); }); });
     });
 }
 template <class F, class... Arguments>
-void nary_impl(F f, argument result, Arguments... args)
+void nary_impl(hipStream_t stream, F f, argument result, Arguments... args)
 {
     bool standard = all_of({args.get_shape()...}, [](const shape& s) { return s.standard(); });
     bool packed   = all_of({args.get_shape()...}, [](const shape& s) { return s.packed(); });
     bool same_shapes =
         all_of({args.get_shape()...}, [&](const shape& s) { return s == result.get_shape(); });
     if(standard or (packed and same_shapes))
-        nary_standard_impl(f, result, args...);
+        nary_standard_impl(stream, f, result, args...);
     else
-        nary_nonstandard_impl(f, result, args...);
+        nary_nonstandard_impl(stream, f, result, args...);
 }
 template <class... Arguments>
-auto nary_nonstandard(argument result, Arguments... args)
+auto nary_nonstandard(hipStream_t stream, argument result, Arguments... args)
 {
-    return [=](auto f) { nary_nonstandard_impl(f, result, args...); };
+    return [=](auto f) { nary_nonstandard_impl(stream, f, result, args...); };
 }
 template <class... Arguments>
-auto nary_standard(argument result, Arguments... args)
+auto nary_standard(hipStream_t stream, argument result, Arguments... args)
 {
-    return [=](auto f) { nary_standard_impl(f, result, args...); };
+    return [=](auto f) { nary_standard_impl(stream, f, result, args...); };
 }
 template <class... Arguments>
-auto nary(argument result, Arguments... args)
+auto nary(hipStream_t stream, argument result, Arguments... args)
 {
-    return [=](auto f) { nary_impl(f, result, args...); };
+    return [=](auto f) { nary_impl(stream, f, result, args...); };
 }
-inline auto nary(const argument& result, const argument& arg1, const argument& arg2)
+inline auto
+nary(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2)
 {
     return [=](auto f) {
         // TODO: Check result and arg1 shape is the same
@@ -339,18 +347,21 @@ inline auto nary(const argument& result, const argument& arg1, const argument& a
                 const bool divisible_by_4 = (b_len % 4 == 0) and (b_stride % 4 == 0) and
                                             (arg1.get_shape().elements() % 4 == 0);
                 if(divisible_by_4)
-                    binary_broadcast_vec_impl(f, result, arg1, arg2);
+                    binary_broadcast_vec_impl(stream, f, result, arg1, arg2);
                 else
-                    binary_broadcast_impl(f, result, arg1, arg2);
+                    binary_broadcast_impl(stream, f, result, arg1, arg2);
                 return;
             }
         }
-        nary_impl(f, result, arg1, arg2);
+        nary_impl(stream, f, result, arg1, arg2);
     };
 }
-inline auto
-nary(const argument& result, const argument& arg1, const argument& arg2, const argument& arg3)
+inline auto nary(hipStream_t stream,
+                 const argument& result,
+                 const argument& arg1,
+                 const argument& arg2,
+                 const argument& arg3)
 {
     return [=](auto f) {
         // TODO: Check result and arg1 shape is the same
@@ -369,13 +380,13 @@ nary(const argument& result, const argument& arg1, const argument& arg2, const a
                 const bool divisible_by_4 = (b_len % 4 == 0) and (b_stride % 4 == 0) and
                                             (arg1.get_shape().elements() % 4 == 0);
                 if(divisible_by_4)
-                    trinary_broadcast_vec_impl(f, result, arg1, arg2, arg3);
+                    trinary_broadcast_vec_impl(stream, f, result, arg1, arg2, arg3);
                 else
-                    trinary_broadcast_impl(f, result, arg1, arg2, arg3);
+                    trinary_broadcast_impl(stream, f, result, arg1, arg2, arg3);
                 return;
             }
         }
-        nary_impl(f, result, arg1, arg2, arg3);
+        nary_impl(stream, f, result, arg1, arg2, arg3);
     };
 }
......
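Every nary helper in this header now takes the stream first and forwards it down to launch, so the device functions built on top of them (device::add, device::add_relu, device::contiguous, device::concat above) only need to accept and pass through a hipStream_t. A minimal sketch of another pointwise device function written against this API (mul is a hypothetical example for illustration, not part of the commit):

    // Hedged sketch mirroring device::add in this diff: a stream-aware
    // elementwise multiply built on the nary(stream, ...) helper.
    void mul(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2)
    {
        nary(stream, result, arg1, arg2)([](auto x, auto y) { return x * y; });
    }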
@@ -82,13 +82,14 @@ struct fusion
         // int algo_count = 1;
         // miopenConvFwdAlgorithm_t algo;
         // miopenFusionPlanConvolutionGetAlgo(fp.get(), 1, &algo_count, &algo);
-        // miopenFusionPlanGetWorkSpaceSize(ctx.handle.get(), fp.get(), &ws_size, algo);
+        // miopenFusionPlanGetWorkSpaceSize(ctx.get_stream().get_miopen(), fp.get(), &ws_size,
+        // algo);
         return shape{shape::int8_type, {ws_size}};
     }
     void compile(context& ctx)
     {
-        auto status = miopenCompileFusionPlan(ctx.handle.get(), fp.get());
+        auto status = miopenCompileFusionPlan(ctx.get_stream().get_miopen(), fp.get());
         if(status != miopenStatusSuccess)
             MIGRAPH_THROW("Compiling fusion plan failed");
     }
@@ -100,7 +101,7 @@ struct fusion
     {
         auto x_td   = make_tensor(x.get_shape());
         auto y_td   = make_tensor(y.get_shape());
-        auto status = miopenExecuteFusionPlan(ctx.handle.get(),
+        auto status = miopenExecuteFusionPlan(ctx.get_stream().get_miopen(),
                                               fp.get(),
                                               x_td.get(),
                                               x.implicit(),
@@ -152,9 +153,9 @@ struct hip_triadd
         check_shapes{inputs, *this}.has(4);
         return inputs.front();
     }
-    argument compute(context&, const shape&, const std::vector<argument>& args) const
+    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
     {
-        device::add(args.at(3), args.at(0), args.at(1), args.at(2));
+        device::add(ctx.get_stream().get(), args.at(3), args.at(0), args.at(1), args.at(2));
         return args.at(3);
     }
 };
@@ -167,9 +168,9 @@ struct hip_triadd_relu
         check_shapes{inputs, *this}.has(4);
         return inputs.front();
     }
-    argument compute(context&, const shape&, const std::vector<argument>& args) const
+    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
     {
-        device::add_relu(args.at(3), args.at(0), args.at(1), args.at(2));
+        device::add_relu(ctx.get_stream().get(), args.at(3), args.at(0), args.at(1), args.at(2));
         return args.at(3);
     }
 };
@@ -182,9 +183,9 @@ struct hip_add_relu
         check_shapes{inputs, *this}.has(3);
         return inputs.front();
     }
-    argument compute(context&, const shape&, const std::vector<argument>& args) const
+    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
    {
-        device::add_relu(args.at(2), args.at(0), args.at(1));
+        device::add_relu(ctx.get_stream().get(), args.at(2), args.at(0), args.at(1));
         return args.at(2);
     }
 };
......
@@ -26,7 +26,7 @@ argument miopen_gemm::compute(context& ctx,
     rocblas_int m = output_shape.lens()[0];
     rocblas_int n = output_shape.lens()[1];
     rocblas_int k = args[0].get_shape().lens()[1];
-    rocblas_sgemm(ctx.rbhandle.get(),
+    rocblas_sgemm(ctx.get_stream().get_rocblas(),
                   transb ? rocblas_operation_transpose : rocblas_operation_none,
                   transa ? rocblas_operation_transpose : rocblas_operation_none,
                   n,
......
@@ -38,14 +38,6 @@ hip_ptr allocate_gpu(std::size_t sz, bool host = false)
     return hip_ptr{result};
 }
-template <class T>
-hip_ptr write_to_gpu(const T& x)
-{
-    using type = typename T::value_type;
-    auto size  = x.size() * sizeof(type);
-    return write_to_gpu(x.data(), size);
-}
 template <class T>
 std::vector<T> read_from_gpu(const void* x, std::size_t sz)
 {
@@ -65,6 +57,14 @@ hip_ptr write_to_gpu(const void* x, std::size_t sz, bool host = false)
     return result;
 }
+template <class T>
+hip_ptr write_to_gpu(const T& x)
+{
+    using type = typename T::value_type;
+    auto size  = x.size() * sizeof(type);
+    return write_to_gpu(x.data(), size);
+}
 argument allocate_gpu(const shape& s, bool host)
 {
     auto p = share(allocate_gpu(s.bytes() + 1, host));
@@ -88,6 +88,13 @@ argument from_gpu(argument arg)
     return result;
 }
+void set_device(std::size_t id)
+{
+    auto status = hipSetDevice(id);
+    if(status != hipSuccess)
+        MIGRAPH_THROW("Error setting device");
+}
 void gpu_sync() { hipDeviceSynchronize(); }
 void copy_to_gpu(argument src, argument dst)
......
@@ -4,17 +4,114 @@
 #include <migraph/gpu/miopen.hpp>
 #include <migraph/gpu/rocblas.hpp>
 #include <migraph/gpu/hip.hpp>
+#include <migraph/env.hpp>
 namespace migraph {
 namespace gpu {
+MIGRAPH_DECLARE_ENV_VAR(MIGRAPH_DISABLE_NULL_STREAM)
+struct hip_device
+{
+    hip_device() { add_stream(); }
+    hip_device(std::size_t id) : device_id(id) { add_stream(); }
+    struct stream
+    {
+        using hip_stream_ptr = MIGRAPH_MANAGE_PTR(hipStream_t, hipStreamDestroy);
+        stream() {}
+        stream(std::size_t device_number) : id(device_number) {}
+        void setup() { set_device(id); }
+        static hip_stream_ptr create_stream()
+        {
+            hipStream_t result = nullptr;
+            auto status        = hipStreamCreate(&result);
+            if(status != hipSuccess)
+                MIGRAPH_THROW("Failed to allocate stream");
+            return hip_stream_ptr{result};
+        }
+        hipStream_t get()
+        {
+            if(enabled(MIGRAPH_DISABLE_NULL_STREAM{}))
+            {
+                setup();
+                if(s == nullptr)
+                    s = create_stream();
+                assert(s.get() != nullptr);
+                return s.get();
+            }
+            return nullptr;
+        }
+        auto create_miopen_handle()
+        {
+            if(enabled(MIGRAPH_DISABLE_NULL_STREAM{}))
+                return make_obj<miopen_handle>(&miopenCreateWithStream, get());
+            else
+                return make_obj<miopen_handle>(&miopenCreate);
+        }
+        auto get_miopen()
+        {
+            setup();
+            if(mihandle == nullptr)
+                mihandle = create_miopen_handle();
+            assert(mihandle.get() != nullptr);
+            return mihandle.get();
+        }
+        auto get_rocblas()
+        {
+            setup();
+            if(rbhandle == nullptr)
+                rbhandle = create_rocblas_handle_ptr(get());
+            assert(rbhandle.get() != nullptr);
+            return rbhandle.get();
+        }
+        private:
+        std::size_t id                      = 0;
+        shared<hip_stream_ptr> s            = nullptr;
+        shared<miopen_handle> mihandle      = nullptr;
+        shared<rocblas_handle_ptr> rbhandle = nullptr;
+    };
+    void add_stream() { streams.emplace_back(device_id); }
+    stream& get_stream() { return streams.at(current_stream); }
+    void set_stream(std::size_t n) { current_stream = n; }
+    private:
+    std::size_t device_id      = 0;
+    std::size_t current_stream = 0;
+    std::vector<stream> streams;
+};
 struct context
 {
-    shared<miopen_handle> handle;
-    shared<rocblas_handle_ptr> rbhandle;
-    argument scratch;
+    context(std::size_t n = 0) : current_device(std::make_shared<hip_device>(n)) {}
+    hip_device& get_current_device()
+    {
+        assert(current_device != nullptr);
+        return *current_device;
+    }
+    hip_device::stream& get_stream() { return get_current_device().get_stream(); }
     std::vector<argument> literals{};
     void finish() const { gpu_sync(); }
+    private:
+    // TODO: Make this a vector to support multiple devices
+    std::shared_ptr<hip_device> current_device;
 };
 } // namespace gpu
 } // namespace migraph
......
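With this refactoring the context no longer exposes handle/rbhandle members; callers go through the current device's stream, which lazily creates the hipStream_t, MIOpen handle, and rocBLAS handle on first use and falls back to the null stream unless MIGRAPH_DISABLE_NULL_STREAM is enabled. A minimal sketch of how code is expected to pull each handle out of the new context (use_context is an illustrative helper, not part of the commit):

    // Hedged sketch of the accessor pattern this diff introduces.
    void use_context(migraph::gpu::context& ctx)
    {
        hipStream_t stream = ctx.get_stream().get();         // raw stream for device::* kernels
        auto miopen        = ctx.get_stream().get_miopen();  // handle for MIOpen calls
        auto rocblas       = ctx.get_stream().get_rocblas(); // handle for rocBLAS calls
        (void)stream;
        (void)miopen;
        (void)rocblas;
    }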
@@ -3,14 +3,19 @@
 #define MIGRAPH_GUARD_RTGLIB_DEVICE_ADD_HPP
 #include <migraph/argument.hpp>
+#include <hip/hip_runtime_api.h>
 namespace migraph {
 namespace gpu {
 namespace device {
-void add(const argument& result, const argument& arg1, const argument& arg2);
-void add(const argument& result, const argument& arg1, const argument& arg2, const argument& arg3);
+void add(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2);
+void add(hipStream_t stream,
+         const argument& result,
+         const argument& arg1,
+         const argument& arg2,
+         const argument& arg3);
 } // namespace device
 } // namespace gpu
......
@@ -3,14 +3,19 @@
 #define MIGRAPH_GUARD_RTGLIB_DEVICE_ADD_RELU_HPP
 #include <migraph/argument.hpp>
+#include <hip/hip_runtime_api.h>
 namespace migraph {
 namespace gpu {
 namespace device {
-void add_relu(const argument& result, const argument& arg1, const argument& arg2);
+void add_relu(hipStream_t stream,
+              const argument& result,
+              const argument& arg1,
+              const argument& arg2);
-void add_relu(const argument& result,
+void add_relu(hipStream_t stream,
+              const argument& result,
               const argument& arg1,
               const argument& arg2,
               const argument& arg3);
......
 #ifndef MIGRAPH_GUARD_RTGLIB_DEVICE_CONCAT_HPP
 #define MIGRAPH_GUARD_RTGLIB_DEVICE_CONCAT_HPP
+#include <migraph/argument.hpp>
+#include <hip/hip_runtime_api.h>
 namespace migraph {
 namespace gpu {
 namespace device {
-argument
-concat(const shape& output_shape, std::vector<argument> args, std::vector<std::size_t> offsets);
+argument concat(hipStream_t stream,
+                const shape& output_shape,
+                std::vector<argument> args,
+                std::vector<std::size_t> offsets);
 } // namespace device
 } // namespace gpu
......
@@ -2,12 +2,13 @@
 #define MIGRAPH_GUARD_MIGRAPHLIB_KERNELS_HPP
 #include <migraph/argument.hpp>
+#include <hip/hip_runtime_api.h>
 namespace migraph {
 namespace gpu {
 namespace device {
-void contiguous(argument result, argument arg);
+void contiguous(hipStream_t stream, argument result, argument arg);
 } // namespace device
 } // namespace gpu
......
@@ -13,6 +13,8 @@ migraph::argument to_gpu(migraph::argument arg, bool host = false);
 migraph::argument from_gpu(migraph::argument arg);
+void set_device(std::size_t id);
 void gpu_sync();
 void copy_to_gpu(argument src, argument dst);
......