Unverified Commit 394a7495 authored by Paul Fultz II's avatar Paul Fultz II Committed by GitHub
Browse files

Merge branch 'master' into renameGemmToDot

parents a3057ebf 30371001
......@@ -14,9 +14,9 @@ shape hip_add::compute_shape(const std::vector<shape>& inputs) const
return inputs.at(0);
}
argument hip_add::compute(context&, const shape&, const std::vector<argument>& args) const
argument hip_add::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
device::add(args[2], args[0], args[1]);
device::add(ctx.get_stream().get(), args[2], args[0], args[1]);
return args[2];
}
......@@ -34,7 +34,7 @@ argument miopen_add::compute(context& ctx,
auto a_desc = make_tensor(args[0].get_shape());
auto b_desc = make_tensor(args[1].get_shape());
auto c_desc = make_tensor(output_shape);
miopenOpTensor(ctx.handle.get(),
miopenOpTensor(ctx.get_stream().get_miopen(),
miopenTensorOpAdd,
&alpha,
a_desc.get(),
......
......@@ -23,7 +23,7 @@ argument miopen_batch_norm_inference::compute(context& ctx,
float alpha = 1.0, beta = 0.0f;
miopenBatchNormalizationForwardInference(ctx.handle.get(),
miopenBatchNormalizationForwardInference(ctx.get_stream().get_miopen(),
miopenBatchNormMode_t(op.bn_mode),
&alpha,
&beta,
......
......@@ -14,11 +14,12 @@ shape hip_concat::compute_shape(std::vector<shape> inputs) const
return op.compute_shape(inputs);
}
argument
hip_concat::compute(context&, const shape& output_shape, const std::vector<argument>& args) const
argument hip_concat::compute(context& ctx,
const shape& output_shape,
const std::vector<argument>& args) const
{
std::vector<std::size_t> offsets = op.compute_offsets(output_shape, args);
return device::concat(output_shape, args, offsets);
return device::concat(ctx.get_stream().get(), output_shape, args, offsets);
}
} // namespace gpu
......
......@@ -12,13 +12,14 @@ shape miopen_contiguous::compute_shape(const std::vector<shape>& inputs) const
check_shapes{inputs, *this}.has(2);
return op.compute_shape({inputs.at(0)});
}
argument
miopen_contiguous::compute(context&, shape output_shape, const std::vector<argument>& args) const
argument miopen_contiguous::compute(context& ctx,
shape output_shape,
const std::vector<argument>& args) const
{
assert(output_shape == args[1].get_shape());
assert(output_shape.standard());
(void)output_shape;
device::contiguous(args.at(1), args.at(0));
device::contiguous(ctx.get_stream().get(), args.at(1), args.at(0));
return args.at(1);
}
......
......@@ -21,7 +21,7 @@ argument miopen_convolution::compute(context& ctx,
auto y_desc = make_tensor(output_shape);
float alpha = 1, beta = 0;
miopenConvolutionForward(ctx.handle.get(),
miopenConvolutionForward(ctx.get_stream().get_miopen(),
&alpha,
x_desc.get(),
args[0].implicit(),
......@@ -47,18 +47,22 @@ shape miopen_convolution::compile(context& ctx,
auto y_desc = make_tensor(output_shape);
std::size_t workspace_size = 0;
miopenConvolutionForwardGetWorkSpaceSize(
ctx.handle.get(), w_desc.get(), x_desc.get(), cd.get(), y_desc.get(), &workspace_size);
miopenConvolutionForwardGetWorkSpaceSize(ctx.get_stream().get_miopen(),
w_desc.get(),
x_desc.get(),
cd.get(),
y_desc.get(),
&workspace_size);
workspace_shape = shape{shape::int8_type, {workspace_size}};
auto x = to_gpu(generate_argument(inputs[0]->get_shape()));
auto w = to_gpu(generate_argument(inputs[1]->get_shape()));
auto y = to_gpu(generate_argument(output_shape));
auto y = allocate_gpu(output_shape);
auto workspace = allocate_gpu(workspace_shape);
int algo_count = 1;
miopenConvAlgoPerf_t perf;
miopenFindConvolutionForwardAlgorithm(ctx.handle.get(),
miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(),
x_desc.get(),
x.implicit(),
w_desc.get(),
......
......@@ -5,14 +5,18 @@ namespace migraph {
namespace gpu {
namespace device {
void add(const argument& result, const argument& arg1, const argument& arg2)
void add(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2)
{
nary(result, arg1, arg2)([](auto x, auto y) { return x + y; });
nary(stream, result, arg1, arg2)([](auto x, auto y) { return x + y; });
}
void add(const argument& result, const argument& arg1, const argument& arg2, const argument& arg3)
void add(hipStream_t stream,
const argument& result,
const argument& arg1,
const argument& arg2,
const argument& arg3)
{
nary(result, arg1, arg2, arg3)([](auto x, auto y, auto z) { return x + y + z; });
nary(stream, result, arg1, arg2, arg3)([](auto x, auto y, auto z) { return x + y + z; });
}
} // namespace device
......
......@@ -5,17 +5,22 @@ namespace migraph {
namespace gpu {
namespace device {
void add_relu(const argument& result, const argument& arg1, const argument& arg2)
void add_relu(hipStream_t stream,
const argument& result,
const argument& arg1,
const argument& arg2)
{
nary(result, arg1, arg2)([](auto x, auto y) { return std::max<decltype(x + y)>(0, x + y); });
nary(stream, result, arg1, arg2)(
[](auto x, auto y) { return std::max<decltype(x + y)>(0, x + y); });
}
void add_relu(const argument& result,
void add_relu(hipStream_t stream,
const argument& result,
const argument& arg1,
const argument& arg2,
const argument& arg3)
{
nary(result, arg1, arg2, arg3)(
nary(stream, result, arg1, arg2, arg3)(
[](auto x, auto y, auto z) { return std::max<decltype(x + y + z)>(0, x + y + z); });
}
......
......@@ -8,11 +8,11 @@ namespace migraph {
namespace gpu {
namespace device {
argument concat(const migraph::shape& output_shape,
argument concat(hipStream_t stream,
const migraph::shape& output_shape,
std::vector<migraph::argument> args,
std::vector<std::size_t> offsets)
{
// migraph::argument& result = args.back();
for(std::size_t l = 0; l < args.size() - 1; l++)
{
auto argl = args[l];
......@@ -23,12 +23,11 @@ argument concat(const migraph::shape& output_shape,
const auto* inptr = input.data();
hip_tensor_descriptor<ndim> desc_input(input.get_shape());
hip_tensor_descriptor<ndim> desc_output(output.get_shape());
gs_launch(nelements)(
gs_launch(stream, nelements)(
[=](auto i) { outptr[desc_output.linear(desc_input.multi(i))] = inptr[i]; });
});
});
}
// return result;
return args.back();
}
......
......@@ -6,9 +6,9 @@ namespace migraph {
namespace gpu {
namespace device {
void contiguous(argument result, argument arg)
void contiguous(hipStream_t stream, argument result, argument arg)
{
nary_nonstandard(std::move(result), std::move(arg))([](auto x) { return x; });
nary_nonstandard(stream, std::move(result), std::move(arg))([](auto x) { return x; });
}
} // namespace device
......
......@@ -21,7 +21,7 @@ __global__ void launcher(F f)
f(idx);
}
inline auto launch(std::size_t global, std::size_t local)
inline auto launch(hipStream_t stream, std::size_t global, std::size_t local)
{
return [=](auto f) {
assert(local > 0);
......@@ -29,17 +29,17 @@ inline auto launch(std::size_t global, std::size_t local)
using f_type = decltype(f);
dim3 nblocks(global / local);
dim3 nthreads(local);
hipLaunchKernelGGL((launcher<f_type>), nblocks, nthreads, 0, nullptr, f);
hipLaunchKernelGGL((launcher<f_type>), nblocks, nthreads, 0, stream, f);
};
}
inline auto gs_launch(std::size_t n, std::size_t local = 1024)
inline auto gs_launch(hipStream_t stream, std::size_t n, std::size_t local = 1024)
{
std::size_t groups = 1 + n / local;
std::size_t nglobal = std::min<std::size_t>(256, groups) * local;
return [=](auto f) {
launch(nglobal, local)([=](auto idx) {
launch(stream, nglobal, local)([=](auto idx) {
for(size_t i = idx.global; i < n; i += nglobal)
{
f(i);
......
......@@ -32,7 +32,7 @@ auto pack_vec4(Ts... xs)
}
template <class F, class... Arguments>
auto nary_nonstandard_impl(F f, argument result, Arguments... args)
auto nary_nonstandard_impl(hipStream_t stream, F f, argument result, Arguments... args)
{
const auto& output_shape = result.get_shape();
visit_all(result, args...)([&](auto output, auto... inputs) {
......@@ -41,7 +41,7 @@ auto nary_nonstandard_impl(F f, argument result, Arguments... args)
std::make_pair(hip_tensor_descriptor<ndim>{inputs.get_shape()}, inputs.data())...);
hip_tensor_descriptor<ndim> out_desc(output_shape);
auto* outp = output.data();
gs_launch(output_shape.elements())([=](auto i) {
gs_launch(stream, output_shape.elements())([=](auto i) {
data([&](auto&&... ps) {
auto outidx = out_desc.multi(i);
outp[i] = f(ps.second[ps.first.linear(outidx)]...);
......@@ -52,8 +52,12 @@ auto nary_nonstandard_impl(F f, argument result, Arguments... args)
}
template <class F>
void trinary_broadcast_vec_impl(
F f, const argument& result, const argument& arg1, const argument& arg2, const argument& arg3)
void trinary_broadcast_vec_impl(hipStream_t stream,
F f,
const argument& result,
const argument& arg1,
const argument& arg2,
const argument& arg3)
{
const auto& output_shape = result.get_shape();
const auto& b_shape = arg3.get_shape();
......@@ -79,7 +83,7 @@ void trinary_broadcast_vec_impl(
const std::size_t n = output.size() / vec_size;
const std::size_t bdim_vec_len = bdim_len / vec_size;
launch(nglobal, nlocal)([=](auto idx) __device__ {
launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
MIGRAPH_DEVICE_SHARED vec4<type> buffer[2048 / vec_size];
// Load bias into LDS
for(size_t i = idx.local; i < bdim_vec_len; i += nlocal)
......@@ -107,8 +111,12 @@ void trinary_broadcast_vec_impl(
}
template <class F>
void trinary_broadcast_impl(
F f, const argument& result, const argument& arg1, const argument& arg2, const argument& arg3)
void trinary_broadcast_impl(hipStream_t stream,
F f,
const argument& result,
const argument& arg1,
const argument& arg2,
const argument& arg3)
{
const auto& output_shape = result.get_shape();
const auto& b_shape = arg3.get_shape();
......@@ -132,7 +140,7 @@ void trinary_broadcast_impl(
const std::size_t nglobal = 256 * nlocal;
const std::size_t n = output.size();
launch(nglobal, nlocal)([=](auto idx) __device__ {
launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
MIGRAPH_DEVICE_SHARED type buffer[2048];
// Load bias into LDS
for(size_t i = idx.local; i < bdim_len; i += nlocal)
......@@ -154,10 +162,8 @@ void trinary_broadcast_impl(
}
template <class F>
void binary_broadcast_vec_impl(F f,
const argument& result,
const argument& arg1,
const argument& arg2)
void binary_broadcast_vec_impl(
hipStream_t stream, F f, const argument& result, const argument& arg1, const argument& arg2)
{
const auto& output_shape = result.get_shape();
const auto& b_shape = arg2.get_shape();
......@@ -182,7 +188,7 @@ void binary_broadcast_vec_impl(F f,
const std::size_t n = output.size() / vec_size;
const std::size_t bdim_vec_len = bdim_len / vec_size;
launch(nglobal, nlocal)([=](auto idx) __device__ {
launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
MIGRAPH_DEVICE_SHARED vec4<type> buffer[2048 / vec_size];
// Load bias into LDS
for(size_t i = idx.local; i < bdim_vec_len; i += nlocal)
......@@ -209,7 +215,8 @@ void binary_broadcast_vec_impl(F f,
}
template <class F>
void binary_broadcast_impl(F f, const argument& result, const argument& arg1, const argument& arg2)
void binary_broadcast_impl(
hipStream_t stream, F f, const argument& result, const argument& arg1, const argument& arg2)
{
const auto& output_shape = result.get_shape();
const auto& b_shape = arg2.get_shape();
......@@ -232,7 +239,7 @@ void binary_broadcast_impl(F f, const argument& result, const argument& arg1, co
const std::size_t nglobal = 256 * nlocal;
const std::size_t n = output.size();
launch(nglobal, nlocal)([=](auto idx) __device__ {
launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
MIGRAPH_DEVICE_SHARED type buffer[2048];
// Load bias into LDS
for(size_t i = idx.local; i < bdim_len; i += nlocal)
......@@ -253,7 +260,7 @@ void binary_broadcast_impl(F f, const argument& result, const argument& arg1, co
}
template <class F, class... Arguments>
void nary_standard_vec_impl(F f, argument result, Arguments... args)
void nary_standard_vec_impl(hipStream_t stream, F f, argument result, Arguments... args)
{
// assert(x.get_shape().elements() == y.get_shape().elements());
const auto& output_shape = result.get_shape();
......@@ -262,7 +269,7 @@ void nary_standard_vec_impl(F f, argument result, Arguments... args)
const std::size_t vec_size = 4;
auto data = pack_vec4(inputs.data()...);
auto* outp = as_vec4(output.data());
gs_launch(output_shape.elements() / vec_size)([=](auto i) {
gs_launch(stream, output_shape.elements() / vec_size)([=](auto i) {
vec4<type> out = outp[i];
data(
[&](auto... xs) {
......@@ -278,50 +285,51 @@ void nary_standard_vec_impl(F f, argument result, Arguments... args)
}
template <class F, class... Arguments>
void nary_standard_impl(F f, argument result, Arguments... args)
void nary_standard_impl(hipStream_t stream, F f, argument result, Arguments... args)
{
// assert(x.get_shape().elements() == y.get_shape().elements());
const auto& output_shape = result.get_shape();
visit_all(result, args...)([&](auto output, auto... inputs) {
auto data = pack(inputs.data()...);
auto* outp = output.data();
gs_launch(output_shape.elements())(
gs_launch(stream, output_shape.elements())(
[=](auto i) { data([&](auto... xps) { outp[i] = f(xps[i]...); }); });
});
}
template <class F, class... Arguments>
void nary_impl(F f, argument result, Arguments... args)
void nary_impl(hipStream_t stream, F f, argument result, Arguments... args)
{
bool standard = all_of({args.get_shape()...}, [](const shape& s) { return s.standard(); });
bool packed = all_of({args.get_shape()...}, [](const shape& s) { return s.packed(); });
bool same_shapes =
all_of({args.get_shape()...}, [&](const shape& s) { return s == result.get_shape(); });
if(standard or (packed and same_shapes))
nary_standard_impl(f, result, args...);
nary_standard_impl(stream, f, result, args...);
else
nary_nonstandard_impl(f, result, args...);
nary_nonstandard_impl(stream, f, result, args...);
}
template <class... Arguments>
auto nary_nonstandard(argument result, Arguments... args)
auto nary_nonstandard(hipStream_t stream, argument result, Arguments... args)
{
return [=](auto f) { nary_nonstandard_impl(f, result, args...); };
return [=](auto f) { nary_nonstandard_impl(stream, f, result, args...); };
}
template <class... Arguments>
auto nary_standard(argument result, Arguments... args)
auto nary_standard(hipStream_t stream, argument result, Arguments... args)
{
return [=](auto f) { nary_standard_impl(f, result, args...); };
return [=](auto f) { nary_standard_impl(stream, f, result, args...); };
}
template <class... Arguments>
auto nary(argument result, Arguments... args)
auto nary(hipStream_t stream, argument result, Arguments... args)
{
return [=](auto f) { nary_impl(f, result, args...); };
return [=](auto f) { nary_impl(stream, f, result, args...); };
}
inline auto nary(const argument& result, const argument& arg1, const argument& arg2)
inline auto
nary(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2)
{
return [=](auto f) {
// TODO: Check result and arg1 shape is the same
......@@ -339,18 +347,21 @@ inline auto nary(const argument& result, const argument& arg1, const argument& a
const bool divisible_by_4 = (b_len % 4 == 0) and (b_stride % 4 == 0) and
(arg1.get_shape().elements() % 4 == 0);
if(divisible_by_4)
binary_broadcast_vec_impl(f, result, arg1, arg2);
binary_broadcast_vec_impl(stream, f, result, arg1, arg2);
else
binary_broadcast_impl(f, result, arg1, arg2);
binary_broadcast_impl(stream, f, result, arg1, arg2);
return;
}
}
nary_impl(f, result, arg1, arg2);
nary_impl(stream, f, result, arg1, arg2);
};
}
inline auto
nary(const argument& result, const argument& arg1, const argument& arg2, const argument& arg3)
inline auto nary(hipStream_t stream,
const argument& result,
const argument& arg1,
const argument& arg2,
const argument& arg3)
{
return [=](auto f) {
// TODO: Check result and arg1 shape is the same
......@@ -369,13 +380,13 @@ nary(const argument& result, const argument& arg1, const argument& arg2, const a
const bool divisible_by_4 = (b_len % 4 == 0) and (b_stride % 4 == 0) and
(arg1.get_shape().elements() % 4 == 0);
if(divisible_by_4)
trinary_broadcast_vec_impl(f, result, arg1, arg2, arg3);
trinary_broadcast_vec_impl(stream, f, result, arg1, arg2, arg3);
else
trinary_broadcast_impl(f, result, arg1, arg2, arg3);
trinary_broadcast_impl(stream, f, result, arg1, arg2, arg3);
return;
}
}
nary_impl(f, result, arg1, arg2, arg3);
nary_impl(stream, f, result, arg1, arg2, arg3);
};
}
......
......@@ -82,13 +82,14 @@ struct fusion
// int algo_count = 1;
// miopenConvFwdAlgorithm_t algo;
// miopenFusionPlanConvolutionGetAlgo(fp.get(), 1, &algo_count, &algo);
// miopenFusionPlanGetWorkSpaceSize(ctx.handle.get(), fp.get(), &ws_size, algo);
// miopenFusionPlanGetWorkSpaceSize(ctx.get_stream().get_miopen(), fp.get(), &ws_size,
// algo);
return shape{shape::int8_type, {ws_size}};
}
void compile(context& ctx)
{
auto status = miopenCompileFusionPlan(ctx.handle.get(), fp.get());
auto status = miopenCompileFusionPlan(ctx.get_stream().get_miopen(), fp.get());
if(status != miopenStatusSuccess)
MIGRAPH_THROW("Compiling fusion plan failed");
}
......@@ -100,7 +101,7 @@ struct fusion
{
auto x_td = make_tensor(x.get_shape());
auto y_td = make_tensor(y.get_shape());
auto status = miopenExecuteFusionPlan(ctx.handle.get(),
auto status = miopenExecuteFusionPlan(ctx.get_stream().get_miopen(),
fp.get(),
x_td.get(),
x.implicit(),
......@@ -152,9 +153,9 @@ struct hip_triadd
check_shapes{inputs, *this}.has(4);
return inputs.front();
}
argument compute(context&, const shape&, const std::vector<argument>& args) const
argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
device::add(args.at(3), args.at(0), args.at(1), args.at(2));
device::add(ctx.get_stream().get(), args.at(3), args.at(0), args.at(1), args.at(2));
return args.at(3);
}
};
......@@ -167,9 +168,9 @@ struct hip_triadd_relu
check_shapes{inputs, *this}.has(4);
return inputs.front();
}
argument compute(context&, const shape&, const std::vector<argument>& args) const
argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
device::add_relu(args.at(3), args.at(0), args.at(1), args.at(2));
device::add_relu(ctx.get_stream().get(), args.at(3), args.at(0), args.at(1), args.at(2));
return args.at(3);
}
};
......@@ -182,9 +183,9 @@ struct hip_add_relu
check_shapes{inputs, *this}.has(3);
return inputs.front();
}
argument compute(context&, const shape&, const std::vector<argument>& args) const
argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
device::add_relu(args.at(2), args.at(0), args.at(1));
device::add_relu(ctx.get_stream().get(), args.at(2), args.at(0), args.at(1));
return args.at(2);
}
};
......
......@@ -26,7 +26,7 @@ argument miopen_gemm::compute(context& ctx,
rocblas_int m = output_shape.lens()[0];
rocblas_int n = output_shape.lens()[1];
rocblas_int k = args[0].get_shape().lens()[1];
rocblas_sgemm(ctx.rbhandle.get(),
rocblas_sgemm(ctx.get_stream().get_rocblas(),
transb ? rocblas_operation_transpose : rocblas_operation_none,
transa ? rocblas_operation_transpose : rocblas_operation_none,
n,
......
......@@ -38,14 +38,6 @@ hip_ptr allocate_gpu(std::size_t sz, bool host = false)
return hip_ptr{result};
}
template <class T>
hip_ptr write_to_gpu(const T& x)
{
using type = typename T::value_type;
auto size = x.size() * sizeof(type);
return write_to_gpu(x.data(), size);
}
template <class T>
std::vector<T> read_from_gpu(const void* x, std::size_t sz)
{
......@@ -65,6 +57,14 @@ hip_ptr write_to_gpu(const void* x, std::size_t sz, bool host = false)
return result;
}
template <class T>
hip_ptr write_to_gpu(const T& x)
{
using type = typename T::value_type;
auto size = x.size() * sizeof(type);
return write_to_gpu(x.data(), size);
}
argument allocate_gpu(const shape& s, bool host)
{
auto p = share(allocate_gpu(s.bytes() + 1, host));
......@@ -88,6 +88,13 @@ argument from_gpu(argument arg)
return result;
}
void set_device(std::size_t id)
{
auto status = hipSetDevice(id);
if(status != hipSuccess)
MIGRAPH_THROW("Error setting device");
}
void gpu_sync() { hipDeviceSynchronize(); }
void copy_to_gpu(argument src, argument dst)
......
......@@ -4,17 +4,114 @@
#include <migraph/gpu/miopen.hpp>
#include <migraph/gpu/rocblas.hpp>
#include <migraph/gpu/hip.hpp>
#include <migraph/env.hpp>
namespace migraph {
namespace gpu {
MIGRAPH_DECLARE_ENV_VAR(MIGRAPH_DISABLE_NULL_STREAM)
struct hip_device
{
hip_device() { add_stream(); }
hip_device(std::size_t id) : device_id(id) { add_stream(); }
struct stream
{
using hip_stream_ptr = MIGRAPH_MANAGE_PTR(hipStream_t, hipStreamDestroy);
stream() {}
stream(std::size_t device_number) : id(device_number) {}
void setup() { set_device(id); }
static hip_stream_ptr create_stream()
{
hipStream_t result = nullptr;
auto status = hipStreamCreate(&result);
if(status != hipSuccess)
MIGRAPH_THROW("Failed to allocate stream");
return hip_stream_ptr{result};
}
hipStream_t get()
{
if(enabled(MIGRAPH_DISABLE_NULL_STREAM{}))
{
setup();
if(s == nullptr)
s = create_stream();
assert(s.get() != nullptr);
return s.get();
}
return nullptr;
}
auto create_miopen_handle()
{
if(enabled(MIGRAPH_DISABLE_NULL_STREAM{}))
return make_obj<miopen_handle>(&miopenCreateWithStream, get());
else
return make_obj<miopen_handle>(&miopenCreate);
}
auto get_miopen()
{
setup();
if(mihandle == nullptr)
mihandle = create_miopen_handle();
assert(mihandle.get() != nullptr);
return mihandle.get();
}
auto get_rocblas()
{
setup();
if(rbhandle == nullptr)
rbhandle = create_rocblas_handle_ptr(get());
assert(rbhandle.get() != nullptr);
return rbhandle.get();
}
private:
std::size_t id = 0;
shared<hip_stream_ptr> s = nullptr;
shared<miopen_handle> mihandle = nullptr;
shared<rocblas_handle_ptr> rbhandle = nullptr;
};
void add_stream() { streams.emplace_back(device_id); }
stream& get_stream() { return streams.at(current_stream); }
void set_stream(std::size_t n) { current_stream = n; }
private:
std::size_t device_id = 0;
std::size_t current_stream = 0;
std::vector<stream> streams;
};
struct context
{
shared<miopen_handle> handle;
shared<rocblas_handle_ptr> rbhandle;
argument scratch;
context(std::size_t n = 0) : current_device(std::make_shared<hip_device>(n)) {}
hip_device& get_current_device()
{
assert(current_device != nullptr);
return *current_device;
}
hip_device::stream& get_stream() { return get_current_device().get_stream(); }
std::vector<argument> literals{};
void finish() const { gpu_sync(); }
private:
// TODO: Make this a vector to support multiple devices
std::shared_ptr<hip_device> current_device;
};
} // namespace gpu
} // namespace migraph
......
......@@ -3,14 +3,19 @@
#define MIGRAPH_GUARD_RTGLIB_DEVICE_ADD_HPP
#include <migraph/argument.hpp>
#include <hip/hip_runtime_api.h>
namespace migraph {
namespace gpu {
namespace device {
void add(const argument& result, const argument& arg1, const argument& arg2);
void add(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2);
void add(const argument& result, const argument& arg1, const argument& arg2, const argument& arg3);
void add(hipStream_t stream,
const argument& result,
const argument& arg1,
const argument& arg2,
const argument& arg3);
} // namespace device
} // namespace gpu
......
......@@ -3,14 +3,19 @@
#define MIGRAPH_GUARD_RTGLIB_DEVICE_ADD_RELU_HPP
#include <migraph/argument.hpp>
#include <hip/hip_runtime_api.h>
namespace migraph {
namespace gpu {
namespace device {
void add_relu(const argument& result, const argument& arg1, const argument& arg2);
void add_relu(hipStream_t stream,
const argument& result,
const argument& arg1,
const argument& arg2);
void add_relu(const argument& result,
void add_relu(hipStream_t stream,
const argument& result,
const argument& arg1,
const argument& arg2,
const argument& arg3);
......
#ifndef MIGRAPH_GUARD_RTGLIB_DEVICE_CONCAT_HPP
#define MIGRAPH_GUARD_RTGLIB_DEVICE_CONCAT_HPP
#include <migraph/argument.hpp>
#include <hip/hip_runtime_api.h>
namespace migraph {
namespace gpu {
namespace device {
argument
concat(const shape& output_shape, std::vector<argument> args, std::vector<std::size_t> offsets);
argument concat(hipStream_t stream,
const shape& output_shape,
std::vector<argument> args,
std::vector<std::size_t> offsets);
} // namespace device
} // namespace gpu
......
......@@ -2,12 +2,13 @@
#define MIGRAPH_GUARD_MIGRAPHLIB_KERNELS_HPP
#include <migraph/argument.hpp>
#include <hip/hip_runtime_api.h>
namespace migraph {
namespace gpu {
namespace device {
void contiguous(argument result, argument arg);
void contiguous(hipStream_t stream, argument result, argument arg);
} // namespace device
} // namespace gpu
......
......@@ -13,6 +13,8 @@ migraph::argument to_gpu(migraph::argument arg, bool host = false);
migraph::argument from_gpu(migraph::argument arg);
void set_device(std::size_t id);
void gpu_sync();
void copy_to_gpu(argument src, argument dst);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment