gaoqiong / MIGraphX · Commits

Unverified commit 394a7495, authored Oct 29, 2018 by Paul Fultz II, committed via GitHub on Oct 29, 2018.

    Merge branch 'master' into renameGemmToDot

Parents: a3057ebf, 30371001
Changes: 29 · Showing 20 changed files with 246 additions and 98 deletions (+246 -98)
src/targets/gpu/add.cpp                                        +3   -3
src/targets/gpu/batchnorm.cpp                                  +1   -1
src/targets/gpu/concat.cpp                                     +4   -3
src/targets/gpu/contiguous.cpp                                 +4   -3
src/targets/gpu/convolution.cpp                                +9   -5
src/targets/gpu/device/add.cpp                                 +8   -4
src/targets/gpu/device/add_relu.cpp                            +9   -4
src/targets/gpu/device/concat.cpp                              +3   -4
src/targets/gpu/device/contiguous.cpp                          +2   -2
src/targets/gpu/device/include/migraph/gpu/device/launch.hpp   +4   -4
src/targets/gpu/device/include/migraph/gpu/device/nary.hpp     +48  -37
src/targets/gpu/fuse_ops.cpp                                   +10  -9
src/targets/gpu/gemm.cpp                                       +1   -1
src/targets/gpu/hip.cpp                                        +15  -8
src/targets/gpu/include/migraph/gpu/context.hpp                +100 -3
src/targets/gpu/include/migraph/gpu/device/add.hpp             +7   -2
src/targets/gpu/include/migraph/gpu/device/add_relu.hpp        +7   -2
src/targets/gpu/include/migraph/gpu/device/concat.hpp          +7   -2
src/targets/gpu/include/migraph/gpu/device/contiguous.hpp      +2   -1
src/targets/gpu/include/migraph/gpu/hip.hpp                    +2   -0
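The common thread through these files is that GPU work is no longer tied to the default (null)
stream: the hand-written device kernels gain a leading hipStream_t parameter, and the operators
obtain that stream, and the MIOpen/rocBLAS handles bound to it, from the context's per-device
stream object instead of from the old ctx.handle / ctx.rbhandle members. A minimal sketch of the
new calling convention, using only members that appear in the hunks below (the wrapper function
run_add is hypothetical, written purely for illustration):

#include <vector>
#include <migraph/gpu/context.hpp>
#include <migraph/gpu/device/add.hpp>

// Illustrative only: shows where the stream and library handles now come from.
migraph::argument run_add(migraph::gpu::context& ctx, const std::vector<migraph::argument>& args)
{
    // Raw HIP stream for the hand-written device kernels.
    hipStream_t stream = ctx.get_stream().get();
    migraph::gpu::device::add(stream, args[2], args[0], args[1]);

    // MIOpen and rocBLAS calls use handles created on the same stream:
    //   ctx.get_stream().get_miopen()   (was ctx.handle.get())
    //   ctx.get_stream().get_rocblas()  (was ctx.rbhandle.get())
    return args[2];
}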
src/targets/gpu/add.cpp

@@ -14,9 +14,9 @@ shape hip_add::compute_shape(const std::vector<shape>& inputs) const
     return inputs.at(0);
 }
 
-argument hip_add::compute(context&, const shape&, const std::vector<argument>& args) const
+argument hip_add::compute(context& ctx, const shape&, const std::vector<argument>& args) const
 {
-    device::add(args[2], args[0], args[1]);
+    device::add(ctx.get_stream().get(), args[2], args[0], args[1]);
     return args[2];
 }
@@ -34,7 +34,7 @@ argument miopen_add::compute(context& ctx,
     auto a_desc = make_tensor(args[0].get_shape());
     auto b_desc = make_tensor(args[1].get_shape());
     auto c_desc = make_tensor(output_shape);
-    miopenOpTensor(ctx.handle.get(),
+    miopenOpTensor(ctx.get_stream().get_miopen(),
                    miopenTensorOpAdd,
                    &alpha,
                    a_desc.get(),
src/targets/gpu/batchnorm.cpp

@@ -23,7 +23,7 @@ argument miopen_batch_norm_inference::compute(context& ctx,
     float alpha = 1.0, beta = 0.0f;
-    miopenBatchNormalizationForwardInference(ctx.handle.get(),
+    miopenBatchNormalizationForwardInference(ctx.get_stream().get_miopen(),
                                              miopenBatchNormMode_t(op.bn_mode),
                                              &alpha,
                                              &beta,
src/targets/gpu/concat.cpp

@@ -14,11 +14,12 @@ shape hip_concat::compute_shape(std::vector<shape> inputs) const
     return op.compute_shape(inputs);
 }
 
 argument
-hip_concat::compute(context&,
+hip_concat::compute(context& ctx,
                     const shape& output_shape,
                     const std::vector<argument>& args) const
 {
     std::vector<std::size_t> offsets = op.compute_offsets(output_shape, args);
-    return device::concat(output_shape, args, offsets);
+    return device::concat(ctx.get_stream().get(), output_shape, args, offsets);
 }
 
 } // namespace gpu
src/targets/gpu/contiguous.cpp

@@ -12,13 +12,14 @@ shape miopen_contiguous::compute_shape(const std::vector<shape>& inputs) const
     check_shapes{inputs, *this}.has(2);
     return op.compute_shape({inputs.at(0)});
 }
 
 argument
-miopen_contiguous::compute(context&,
+miopen_contiguous::compute(context& ctx,
                            shape output_shape,
                            const std::vector<argument>& args) const
 {
     assert(output_shape == args[1].get_shape());
     assert(output_shape.standard());
     (void)output_shape;
-    device::contiguous(args.at(1), args.at(0));
+    device::contiguous(ctx.get_stream().get(), args.at(1), args.at(0));
     return args.at(1);
 }
src/targets/gpu/convolution.cpp

@@ -21,7 +21,7 @@ argument miopen_convolution::compute(context& ctx,
     auto y_desc = make_tensor(output_shape);
     float alpha = 1, beta = 0;
-    miopenConvolutionForward(ctx.handle.get(),
+    miopenConvolutionForward(ctx.get_stream().get_miopen(),
                              &alpha,
                              x_desc.get(),
                              args[0].implicit(),
@@ -47,18 +47,22 @@ shape miopen_convolution::compile(context& ctx,
     auto y_desc = make_tensor(output_shape);
 
     std::size_t workspace_size = 0;
-    miopenConvolutionForwardGetWorkSpaceSize(ctx.handle.get(),
-        w_desc.get(), x_desc.get(), cd.get(), y_desc.get(), &workspace_size);
+    miopenConvolutionForwardGetWorkSpaceSize(ctx.get_stream().get_miopen(),
+                                             w_desc.get(),
+                                             x_desc.get(),
+                                             cd.get(),
+                                             y_desc.get(),
+                                             &workspace_size);
     workspace_shape = shape{shape::int8_type, {workspace_size}};
 
     auto x         = to_gpu(generate_argument(inputs[0]->get_shape()));
     auto w         = to_gpu(generate_argument(inputs[1]->get_shape()));
-    auto y         = to_gpu(generate_argument(output_shape));
+    auto y         = allocate_gpu(output_shape);
     auto workspace = allocate_gpu(workspace_shape);
 
     int algo_count = 1;
     miopenConvAlgoPerf_t perf;
-    miopenFindConvolutionForwardAlgorithm(ctx.handle.get(),
+    miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(),
                                           x_desc.get(),
                                           x.implicit(),
                                           w_desc.get(),
src/targets/gpu/device/add.cpp

@@ -5,14 +5,18 @@ namespace migraph {
 namespace gpu {
 namespace device {
 
-void add(const argument& result, const argument& arg1, const argument& arg2)
+void add(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2)
 {
-    nary(result, arg1, arg2)([](auto x, auto y) { return x + y; });
+    nary(stream, result, arg1, arg2)([](auto x, auto y) { return x + y; });
 }
 
-void add(const argument& result, const argument& arg1, const argument& arg2, const argument& arg3)
+void add(hipStream_t stream,
+         const argument& result,
+         const argument& arg1,
+         const argument& arg2,
+         const argument& arg3)
 {
-    nary(result, arg1, arg2, arg3)([](auto x, auto y, auto z) { return x + y + z; });
+    nary(stream, result, arg1, arg2, arg3)([](auto x, auto y, auto z) { return x + y + z; });
 }
 
 } // namespace device
src/targets/gpu/device/add_relu.cpp

@@ -5,17 +5,22 @@ namespace migraph {
 namespace gpu {
 namespace device {
 
-void add_relu(const argument& result, const argument& arg1, const argument& arg2)
+void add_relu(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2)
 {
-    nary(result, arg1, arg2)([](auto x, auto y) { return std::max<decltype(x + y)>(0, x + y); });
+    nary(stream, result, arg1, arg2)(
+        [](auto x, auto y) { return std::max<decltype(x + y)>(0, x + y); });
 }
 
-void add_relu(const argument& result,
+void add_relu(hipStream_t stream,
+              const argument& result,
               const argument& arg1,
               const argument& arg2,
               const argument& arg3)
 {
-    nary(result, arg1, arg2, arg3)(
+    nary(stream, result, arg1, arg2, arg3)(
         [](auto x, auto y, auto z) { return std::max<decltype(x + y + z)>(0, x + y + z); });
 }
src/targets/gpu/device/concat.cpp

@@ -8,11 +8,11 @@ namespace migraph {
 namespace gpu {
 namespace device {
 
-argument concat(const migraph::shape& output_shape,
+argument concat(hipStream_t stream,
+                const migraph::shape& output_shape,
                 std::vector<migraph::argument> args,
                 std::vector<std::size_t> offsets)
 {
-    // migraph::argument& result = args.back();
     for(std::size_t l = 0; l < args.size() - 1; l++)
     {
         auto argl = args[l];
@@ -23,12 +23,11 @@ argument concat(const migraph::shape& output_shape,
             const auto* inptr = input.data();
             hip_tensor_descriptor<ndim> desc_input(input.get_shape());
             hip_tensor_descriptor<ndim> desc_output(output.get_shape());
-            gs_launch(nelements)(
+            gs_launch(stream, nelements)(
                 [=](auto i) { outptr[desc_output.linear(desc_input.multi(i))] = inptr[i]; });
             });
         });
     }
-    // return result;
     return args.back();
 }
src/targets/gpu/device/contiguous.cpp

@@ -6,9 +6,9 @@ namespace migraph {
 namespace gpu {
 namespace device {
 
-void contiguous(argument result, argument arg)
+void contiguous(hipStream_t stream, argument result, argument arg)
 {
-    nary_nonstandard(std::move(result), std::move(arg))([](auto x) { return x; });
+    nary_nonstandard(stream, std::move(result), std::move(arg))([](auto x) { return x; });
 }
 
 } // namespace device
src/targets/gpu/device/include/migraph/gpu/device/launch.hpp

@@ -21,7 +21,7 @@ __global__ void launcher(F f)
     f(idx);
 }
 
-inline auto launch(std::size_t global, std::size_t local)
+inline auto launch(hipStream_t stream, std::size_t global, std::size_t local)
 {
     return [=](auto f) {
         assert(local > 0);
@@ -29,17 +29,17 @@ inline auto launch(std::size_t global, std::size_t local)
         using f_type = decltype(f);
         dim3 nblocks(global / local);
         dim3 nthreads(local);
-        hipLaunchKernelGGL((launcher<f_type>), nblocks, nthreads, 0, nullptr, f);
+        hipLaunchKernelGGL((launcher<f_type>), nblocks, nthreads, 0, stream, f);
     };
 }
 
-inline auto gs_launch(std::size_t n, std::size_t local = 1024)
+inline auto gs_launch(hipStream_t stream, std::size_t n, std::size_t local = 1024)
 {
     std::size_t groups  = 1 + n / local;
     std::size_t nglobal = std::min<std::size_t>(256, groups) * local;
 
     return [=](auto f) {
-        launch(nglobal, local)([=](auto idx) {
+        launch(stream, nglobal, local)([=](auto idx) {
             for(size_t i = idx.global; i < n; i += nglobal)
             {
                 f(i);
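With the stream threaded through launch and gs_launch, a grid-stride kernel can be enqueued on a
caller-chosen stream rather than implicitly on the null stream (the old code passed nullptr to
hipLaunchKernelGGL). A hedged sketch of how the updated helper is used, mirroring the
gs_launch(stream, nelements)(...) calls in device/concat.cpp and nary.hpp below; the
scale_on_stream helper and its device pointer are hypothetical, and the namespace is assumed from
the include path:

#include <migraph/gpu/device/launch.hpp>

namespace migraph {
namespace gpu {
namespace device {

// Scale a device buffer in place on the given stream (illustrative helper,
// not part of this commit); x is assumed to point to n floats on the GPU.
inline void scale_on_stream(hipStream_t stream, float* x, std::size_t n, float alpha)
{
    gs_launch(stream, n)([=](auto i) { x[i] = alpha * x[i]; });
}

} // namespace device
} // namespace gpu
} // namespace migraph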
src/targets/gpu/device/include/migraph/gpu/device/nary.hpp

@@ -32,7 +32,7 @@ auto pack_vec4(Ts... xs)
 }
 
 template <class F, class... Arguments>
-auto nary_nonstandard_impl(F f, argument result, Arguments... args)
+auto nary_nonstandard_impl(hipStream_t stream, F f, argument result, Arguments... args)
 {
     const auto& output_shape = result.get_shape();
     visit_all(result, args...)([&](auto output, auto... inputs) {
@@ -41,7 +41,7 @@ auto nary_nonstandard_impl(F f, argument result, Arguments... args)
             std::make_pair(hip_tensor_descriptor<ndim>{inputs.get_shape()}, inputs.data())...);
         hip_tensor_descriptor<ndim> out_desc(output_shape);
         auto* outp = output.data();
-        gs_launch(output_shape.elements())([=](auto i) {
+        gs_launch(stream, output_shape.elements())([=](auto i) {
             data([&](auto&&... ps) {
                 auto outidx = out_desc.multi(i);
                 outp[i]     = f(ps.second[ps.first.linear(outidx)]...);
@@ -52,8 +52,12 @@ auto nary_nonstandard_impl(F f, argument result, Arguments... args)
 }
 
 template <class F>
-void trinary_broadcast_vec_impl(
-    F f, const argument& result, const argument& arg1, const argument& arg2, const argument& arg3)
+void trinary_broadcast_vec_impl(hipStream_t stream,
+                                F f,
+                                const argument& result,
+                                const argument& arg1,
+                                const argument& arg2,
+                                const argument& arg3)
 {
     const auto& output_shape = result.get_shape();
     const auto& b_shape      = arg3.get_shape();
@@ -79,7 +83,7 @@ void trinary_broadcast_vec_impl(
     const std::size_t n            = output.size() / vec_size;
     const std::size_t bdim_vec_len = bdim_len / vec_size;
-    launch(nglobal, nlocal)([=](auto idx) __device__ {
+    launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
         MIGRAPH_DEVICE_SHARED vec4<type> buffer[2048 / vec_size];
         // Load bias into LDS
         for(size_t i = idx.local; i < bdim_vec_len; i += nlocal)
@@ -107,8 +111,12 @@ void trinary_broadcast_vec_impl(
 }
 
 template <class F>
-void trinary_broadcast_impl(
-    F f, const argument& result, const argument& arg1, const argument& arg2, const argument& arg3)
+void trinary_broadcast_impl(hipStream_t stream,
+                            F f,
+                            const argument& result,
+                            const argument& arg1,
+                            const argument& arg2,
+                            const argument& arg3)
 {
     const auto& output_shape = result.get_shape();
     const auto& b_shape      = arg3.get_shape();
@@ -132,7 +140,7 @@ void trinary_broadcast_impl(
     const std::size_t nglobal = 256 * nlocal;
     const std::size_t n       = output.size();
-    launch(nglobal, nlocal)([=](auto idx) __device__ {
+    launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
         MIGRAPH_DEVICE_SHARED type buffer[2048];
         // Load bias into LDS
         for(size_t i = idx.local; i < bdim_len; i += nlocal)
@@ -154,10 +162,8 @@ void trinary_broadcast_impl(
 }
 
 template <class F>
-void binary_broadcast_vec_impl(F f,
-                               const argument& result,
-                               const argument& arg1,
-                               const argument& arg2)
+void binary_broadcast_vec_impl(
+    hipStream_t stream, F f, const argument& result, const argument& arg1, const argument& arg2)
 {
     const auto& output_shape = result.get_shape();
     const auto& b_shape      = arg2.get_shape();
@@ -182,7 +188,7 @@ void binary_broadcast_vec_impl(F f,
     const std::size_t n            = output.size() / vec_size;
     const std::size_t bdim_vec_len = bdim_len / vec_size;
-    launch(nglobal, nlocal)([=](auto idx) __device__ {
+    launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
         MIGRAPH_DEVICE_SHARED vec4<type> buffer[2048 / vec_size];
         // Load bias into LDS
         for(size_t i = idx.local; i < bdim_vec_len; i += nlocal)
@@ -209,7 +215,8 @@ void binary_broadcast_vec_impl(F f,
 }
 
 template <class F>
-void binary_broadcast_impl(F f, const argument& result, const argument& arg1, const argument& arg2)
+void binary_broadcast_impl(
+    hipStream_t stream, F f, const argument& result, const argument& arg1, const argument& arg2)
 {
     const auto& output_shape = result.get_shape();
     const auto& b_shape      = arg2.get_shape();
@@ -232,7 +239,7 @@ void binary_broadcast_impl(F f, const argument& result, const argument& arg1, co
     const std::size_t nglobal = 256 * nlocal;
     const std::size_t n       = output.size();
-    launch(nglobal, nlocal)([=](auto idx) __device__ {
+    launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
         MIGRAPH_DEVICE_SHARED type buffer[2048];
         // Load bias into LDS
         for(size_t i = idx.local; i < bdim_len; i += nlocal)
@@ -253,7 +260,7 @@ void binary_broadcast_impl(F f, const argument& result, const argument& arg1, co
 }
 
 template <class F, class... Arguments>
-void nary_standard_vec_impl(F f, argument result, Arguments... args)
+void nary_standard_vec_impl(hipStream_t stream, F f, argument result, Arguments... args)
 {
     // assert(x.get_shape().elements() == y.get_shape().elements());
     const auto& output_shape = result.get_shape();
@@ -262,7 +269,7 @@ void nary_standard_vec_impl(F f, argument result, Arguments... args)
         const std::size_t vec_size = 4;
         auto data  = pack_vec4(inputs.data()...);
         auto* outp = as_vec4(output.data());
-        gs_launch(output_shape.elements() / vec_size)([=](auto i) {
+        gs_launch(stream, output_shape.elements() / vec_size)([=](auto i) {
             vec4<type> out = outp[i];
             data(
                 [&](auto... xs) {
@@ -278,50 +285,51 @@ void nary_standard_vec_impl(F f, argument result, Arguments... args)
 }
 
 template <class F, class... Arguments>
-void nary_standard_impl(F f, argument result, Arguments... args)
+void nary_standard_impl(hipStream_t stream, F f, argument result, Arguments... args)
 {
     // assert(x.get_shape().elements() == y.get_shape().elements());
     const auto& output_shape = result.get_shape();
     visit_all(result, args...)([&](auto output, auto... inputs) {
         auto data  = pack(inputs.data()...);
         auto* outp = output.data();
-        gs_launch(output_shape.elements())(
+        gs_launch(stream, output_shape.elements())(
             [=](auto i) { data([&](auto... xps) { outp[i] = f(xps[i]...); }); });
     });
 }
 
 template <class F, class... Arguments>
-void nary_impl(F f, argument result, Arguments... args)
+void nary_impl(hipStream_t stream, F f, argument result, Arguments... args)
 {
     bool standard = all_of({args.get_shape()...}, [](const shape& s) { return s.standard(); });
     bool packed   = all_of({args.get_shape()...}, [](const shape& s) { return s.packed(); });
     bool same_shapes =
         all_of({args.get_shape()...}, [&](const shape& s) { return s == result.get_shape(); });
     if(standard or (packed and same_shapes))
-        nary_standard_impl(f, result, args...);
+        nary_standard_impl(stream, f, result, args...);
     else
-        nary_nonstandard_impl(f, result, args...);
+        nary_nonstandard_impl(stream, f, result, args...);
 }
 
 template <class... Arguments>
-auto nary_nonstandard(argument result, Arguments... args)
+auto nary_nonstandard(hipStream_t stream, argument result, Arguments... args)
 {
-    return [=](auto f) { nary_nonstandard_impl(f, result, args...); };
+    return [=](auto f) { nary_nonstandard_impl(stream, f, result, args...); };
 }
 
 template <class... Arguments>
-auto nary_standard(argument result, Arguments... args)
+auto nary_standard(hipStream_t stream, argument result, Arguments... args)
 {
-    return [=](auto f) { nary_standard_impl(f, result, args...); };
+    return [=](auto f) { nary_standard_impl(stream, f, result, args...); };
 }
 
 template <class... Arguments>
-auto nary(argument result, Arguments... args)
+auto nary(hipStream_t stream, argument result, Arguments... args)
 {
-    return [=](auto f) { nary_impl(f, result, args...); };
+    return [=](auto f) { nary_impl(stream, f, result, args...); };
 }
 
-inline auto nary(const argument& result, const argument& arg1, const argument& arg2)
+inline auto
+nary(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2)
 {
     return [=](auto f) {
         // TODO: Check result and arg1 shape is the same
@@ -339,18 +347,21 @@ inline auto nary(const argument& result, const argument& arg1, const argument& a
            const bool divisible_by_4 = (b_len % 4 == 0) and (b_stride % 4 == 0) and
                                         (arg1.get_shape().elements() % 4 == 0);
             if(divisible_by_4)
-                binary_broadcast_vec_impl(f, result, arg1, arg2);
+                binary_broadcast_vec_impl(stream, f, result, arg1, arg2);
             else
-                binary_broadcast_impl(f, result, arg1, arg2);
+                binary_broadcast_impl(stream, f, result, arg1, arg2);
                return;
            }
        }
-        nary_impl(f, result, arg1, arg2);
+        nary_impl(stream, f, result, arg1, arg2);
    };
 }
 
 inline auto
-nary(const argument& result, const argument& arg1, const argument& arg2, const argument& arg3)
+nary(hipStream_t stream,
+     const argument& result,
+     const argument& arg1,
+     const argument& arg2,
+     const argument& arg3)
 {
     return [=](auto f) {
         // TODO: Check result and arg1 shape is the same
@@ -369,13 +380,13 @@ nary(const argument& result, const argument& arg1, const argument& arg2, const a
            const bool divisible_by_4 = (b_len % 4 == 0) and (b_stride % 4 == 0) and
                                         (arg1.get_shape().elements() % 4 == 0);
             if(divisible_by_4)
-                trinary_broadcast_vec_impl(f, result, arg1, arg2, arg3);
+                trinary_broadcast_vec_impl(stream, f, result, arg1, arg2, arg3);
             else
-                trinary_broadcast_impl(f, result, arg1, arg2, arg3);
+                trinary_broadcast_impl(stream, f, result, arg1, arg2, arg3);
                return;
            }
        }
-        nary_impl(f, result, arg1, arg2, arg3);
+        nary_impl(stream, f, result, arg1, arg2, arg3);
    };
 }
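The nary helpers keep their two-step call shape, nary(...)(lambda), with the stream simply added
as the first argument, so new pointwise kernels are written exactly as before. A hedged sketch of
a hypothetical element-wise multiply built on the updated helper (device::add and device::add_relu
earlier in this diff are the real in-tree uses of the same pattern):

#include <migraph/gpu/device/nary.hpp>

namespace migraph {
namespace gpu {
namespace device {

// Hypothetical pointwise op, not part of this commit: result = arg1 * arg2,
// dispatched on the caller's stream via the stream-aware nary helper.
inline void
mul(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2)
{
    nary(stream, result, arg1, arg2)([](auto x, auto y) { return x * y; });
}

} // namespace device
} // namespace gpu
} // namespace migraph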
src/targets/gpu/fuse_ops.cpp

@@ -82,13 +82,14 @@ struct fusion
         // int algo_count = 1;
         // miopenConvFwdAlgorithm_t algo;
         // miopenFusionPlanConvolutionGetAlgo(fp.get(), 1, &algo_count, &algo);
-        // miopenFusionPlanGetWorkSpaceSize(ctx.handle.get(), fp.get(), &ws_size, algo);
+        // miopenFusionPlanGetWorkSpaceSize(ctx.get_stream().get_miopen(), fp.get(), &ws_size,
+        // algo);
         return shape{shape::int8_type, {ws_size}};
     }
 
     void compile(context& ctx)
     {
-        auto status = miopenCompileFusionPlan(ctx.handle.get(), fp.get());
+        auto status = miopenCompileFusionPlan(ctx.get_stream().get_miopen(), fp.get());
         if(status != miopenStatusSuccess)
             MIGRAPH_THROW("Compiling fusion plan failed");
     }
@@ -100,7 +101,7 @@ struct fusion
     {
         auto x_td   = make_tensor(x.get_shape());
         auto y_td   = make_tensor(y.get_shape());
-        auto status = miopenExecuteFusionPlan(ctx.handle.get(),
+        auto status = miopenExecuteFusionPlan(ctx.get_stream().get_miopen(),
                                               fp.get(),
                                               x_td.get(),
                                               x.implicit(),
@@ -152,9 +153,9 @@ struct hip_triadd
         check_shapes{inputs, *this}.has(4);
         return inputs.front();
     }
-    argument compute(context&, const shape&, const std::vector<argument>& args) const
+    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
     {
-        device::add(args.at(3), args.at(0), args.at(1), args.at(2));
+        device::add(ctx.get_stream().get(), args.at(3), args.at(0), args.at(1), args.at(2));
         return args.at(3);
     }
 };
@@ -167,9 +168,9 @@ struct hip_triadd_relu
         check_shapes{inputs, *this}.has(4);
         return inputs.front();
     }
-    argument compute(context&, const shape&, const std::vector<argument>& args) const
+    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
     {
-        device::add_relu(args.at(3), args.at(0), args.at(1), args.at(2));
+        device::add_relu(ctx.get_stream().get(), args.at(3), args.at(0), args.at(1), args.at(2));
         return args.at(3);
     }
 };
@@ -182,9 +183,9 @@ struct hip_add_relu
         check_shapes{inputs, *this}.has(3);
         return inputs.front();
     }
-    argument compute(context&, const shape&, const std::vector<argument>& args) const
+    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
     {
-        device::add_relu(args.at(2), args.at(0), args.at(1));
+        device::add_relu(ctx.get_stream().get(), args.at(2), args.at(0), args.at(1));
         return args.at(2);
     }
 };
src/targets/gpu/gemm.cpp

@@ -26,7 +26,7 @@ argument miopen_gemm::compute(context& ctx,
     rocblas_int m = output_shape.lens()[0];
     rocblas_int n = output_shape.lens()[1];
     rocblas_int k = args[0].get_shape().lens()[1];
-    rocblas_sgemm(ctx.rbhandle.get(),
+    rocblas_sgemm(ctx.get_stream().get_rocblas(),
                   transb ? rocblas_operation_transpose : rocblas_operation_none,
                   transa ? rocblas_operation_transpose : rocblas_operation_none,
                   n,
src/targets/gpu/hip.cpp

@@ -38,14 +38,6 @@ hip_ptr allocate_gpu(std::size_t sz, bool host = false)
     return hip_ptr{result};
 }
 
-template <class T>
-hip_ptr write_to_gpu(const T& x)
-{
-    using type = typename T::value_type;
-    auto size  = x.size() * sizeof(type);
-    return write_to_gpu(x.data(), size);
-}
-
 template <class T>
 std::vector<T> read_from_gpu(const void* x, std::size_t sz)
 {
@@ -65,6 +57,14 @@ hip_ptr write_to_gpu(const void* x, std::size_t sz, bool host = false)
     return result;
 }
 
+template <class T>
+hip_ptr write_to_gpu(const T& x)
+{
+    using type = typename T::value_type;
+    auto size  = x.size() * sizeof(type);
+    return write_to_gpu(x.data(), size);
+}
+
 argument allocate_gpu(const shape& s, bool host)
 {
     auto p = share(allocate_gpu(s.bytes() + 1, host));
@@ -88,6 +88,13 @@ argument from_gpu(argument arg)
     return result;
 }
 
+void set_device(std::size_t id)
+{
+    auto status = hipSetDevice(id);
+    if(status != hipSuccess)
+        MIGRAPH_THROW("Error setting device");
+}
+
 void gpu_sync() { hipDeviceSynchronize(); }
 
 void copy_to_gpu(argument src, argument dst)
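The new set_device wrapper (declared in hip.hpp below) is what the stream object in context.hpp
calls from setup() so that streams and handles are created on the intended device. A trivial
usage sketch; prepare_default_device is a hypothetical helper, and both functions are assumed to
live in migraph::gpu as the headers suggest:

#include <migraph/gpu/hip.hpp>

// Bind subsequent HIP work to device 0, then wait for the device to go idle.
// set_device throws via MIGRAPH_THROW if hipSetDevice fails.
void prepare_default_device()
{
    migraph::gpu::set_device(0);
    migraph::gpu::gpu_sync();
}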
src/targets/gpu/include/migraph/gpu/context.hpp

@@ -4,17 +4,114 @@
 #include <migraph/gpu/miopen.hpp>
 #include <migraph/gpu/rocblas.hpp>
 #include <migraph/gpu/hip.hpp>
+#include <migraph/env.hpp>
 
 namespace migraph {
 namespace gpu {
 
+MIGRAPH_DECLARE_ENV_VAR(MIGRAPH_DISABLE_NULL_STREAM)
+
+struct hip_device
+{
+    hip_device() { add_stream(); }
+
+    hip_device(std::size_t id) : device_id(id) { add_stream(); }
+
+    struct stream
+    {
+        using hip_stream_ptr = MIGRAPH_MANAGE_PTR(hipStream_t, hipStreamDestroy);
+
+        stream() {}
+
+        stream(std::size_t device_number) : id(device_number) {}
+
+        void setup() { set_device(id); }
+
+        static hip_stream_ptr create_stream()
+        {
+            hipStream_t result = nullptr;
+            auto status        = hipStreamCreate(&result);
+            if(status != hipSuccess)
+                MIGRAPH_THROW("Failed to allocate stream");
+            return hip_stream_ptr{result};
+        }
+
+        hipStream_t get()
+        {
+            if(enabled(MIGRAPH_DISABLE_NULL_STREAM{}))
+            {
+                setup();
+                if(s == nullptr)
+                    s = create_stream();
+                assert(s.get() != nullptr);
+                return s.get();
+            }
+            return nullptr;
+        }
+
+        auto create_miopen_handle()
+        {
+            if(enabled(MIGRAPH_DISABLE_NULL_STREAM{}))
+                return make_obj<miopen_handle>(&miopenCreateWithStream, get());
+            else
+                return make_obj<miopen_handle>(&miopenCreate);
+        }
+
+        auto get_miopen()
+        {
+            setup();
+            if(mihandle == nullptr)
+                mihandle = create_miopen_handle();
+            assert(mihandle.get() != nullptr);
+            return mihandle.get();
+        }
+
+        auto get_rocblas()
+        {
+            setup();
+            if(rbhandle == nullptr)
+                rbhandle = create_rocblas_handle_ptr(get());
+            assert(rbhandle.get() != nullptr);
+            return rbhandle.get();
+        }
+
+        private:
+        std::size_t id                      = 0;
+        shared<hip_stream_ptr> s            = nullptr;
+        shared<miopen_handle> mihandle      = nullptr;
+        shared<rocblas_handle_ptr> rbhandle = nullptr;
+    };
+
+    void add_stream() { streams.emplace_back(device_id); }
+
+    stream& get_stream() { return streams.at(current_stream); }
+
+    void set_stream(std::size_t n) { current_stream = n; }
+
+    private:
+    std::size_t device_id      = 0;
+    std::size_t current_stream = 0;
+    std::vector<stream> streams;
+};
+
 struct context
 {
-    shared<miopen_handle> handle;
-    shared<rocblas_handle_ptr> rbhandle;
-    argument scratch;
+    context(std::size_t n = 0) : current_device(std::make_shared<hip_device>(n)) {}
+
+    hip_device& get_current_device()
+    {
+        assert(current_device != nullptr);
+        return *current_device;
+    }
+
+    hip_device::stream& get_stream() { return get_current_device().get_stream(); }
+
     std::vector<argument> literals{};
     void finish() const { gpu_sync(); }
+
+    private:
+    // TODO: Make this a vector to support multiple devices
+    std::shared_ptr<hip_device> current_device;
 };
 
 } // namespace gpu
 } // namespace migraph
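The reworked context owns a hip_device, and each hip_device owns one or more stream objects that
lazily create the HIP stream and the MIOpen/rocBLAS handles bound to it. Note the env-var switch:
stream::get() returns the null stream (nullptr) unless MIGRAPH_DISABLE_NULL_STREAM is enabled, in
which case a dedicated hipStream_t is created on first use. A hedged usage sketch exercising only
the accessors shown in the hunk above (show_context_usage is a hypothetical function):

#include <migraph/gpu/context.hpp>

void show_context_usage()
{
    migraph::gpu::context ctx(0); // device 0; the constructor adds one stream

    // Null stream by default; a real stream when MIGRAPH_DISABLE_NULL_STREAM is enabled.
    hipStream_t s = ctx.get_stream().get();

    // Library handles are created lazily on that same stream object.
    auto mh = ctx.get_stream().get_miopen();
    auto rh = ctx.get_stream().get_rocblas();

    (void)s;
    (void)mh;
    (void)rh;
    ctx.finish(); // gpu_sync() -> hipDeviceSynchronize()
}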
src/targets/gpu/include/migraph/gpu/device/add.hpp

@@ -3,14 +3,19 @@
 #define MIGRAPH_GUARD_RTGLIB_DEVICE_ADD_HPP
 
 #include <migraph/argument.hpp>
+#include <hip/hip_runtime_api.h>
 
 namespace migraph {
 namespace gpu {
 namespace device {
 
-void add(const argument& result, const argument& arg1, const argument& arg2);
+void add(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2);
 
-void add(const argument& result, const argument& arg1, const argument& arg2, const argument& arg3);
+void add(hipStream_t stream,
+         const argument& result,
+         const argument& arg1,
+         const argument& arg2,
+         const argument& arg3);
 
 } // namespace device
 } // namespace gpu
src/targets/gpu/include/migraph/gpu/device/add_relu.hpp

@@ -3,14 +3,19 @@
 #define MIGRAPH_GUARD_RTGLIB_DEVICE_ADD_RELU_HPP
 
 #include <migraph/argument.hpp>
+#include <hip/hip_runtime_api.h>
 
 namespace migraph {
 namespace gpu {
 namespace device {
 
-void add_relu(const argument& result, const argument& arg1, const argument& arg2);
+void add_relu(
+    hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2);
 
-void add_relu(const argument& result,
+void add_relu(hipStream_t stream,
+              const argument& result,
               const argument& arg1,
               const argument& arg2,
               const argument& arg3);
src/targets/gpu/include/migraph/gpu/device/concat.hpp
View file @
394a7495
#ifndef MIGRAPH_GUARD_RTGLIB_DEVICE_CONCAT_HPP
#ifndef MIGRAPH_GUARD_RTGLIB_DEVICE_CONCAT_HPP
#define MIGRAPH_GUARD_RTGLIB_DEVICE_CONCAT_HPP
#define MIGRAPH_GUARD_RTGLIB_DEVICE_CONCAT_HPP
#include <migraph/argument.hpp>
#include <hip/hip_runtime_api.h>
namespace
migraph
{
namespace
migraph
{
namespace
gpu
{
namespace
gpu
{
namespace
device
{
namespace
device
{
argument
argument
concat
(
hipStream_t
stream
,
concat
(
const
shape
&
output_shape
,
std
::
vector
<
argument
>
args
,
std
::
vector
<
std
::
size_t
>
offsets
);
const
shape
&
output_shape
,
std
::
vector
<
argument
>
args
,
std
::
vector
<
std
::
size_t
>
offsets
);
}
// namespace device
}
// namespace device
}
// namespace gpu
}
// namespace gpu
...
...
src/targets/gpu/include/migraph/gpu/device/contiguous.hpp

@@ -2,12 +2,13 @@
 #define MIGRAPH_GUARD_MIGRAPHLIB_KERNELS_HPP
 
 #include <migraph/argument.hpp>
+#include <hip/hip_runtime_api.h>
 
 namespace migraph {
 namespace gpu {
 namespace device {
 
-void contiguous(argument result, argument arg);
+void contiguous(hipStream_t stream, argument result, argument arg);
 
 } // namespace device
 } // namespace gpu
src/targets/gpu/include/migraph/gpu/hip.hpp

@@ -13,6 +13,8 @@ migraph::argument to_gpu(migraph::argument arg, bool host = false);
 migraph::argument from_gpu(migraph::argument arg);
 
+void set_device(std::size_t id);
+
 void gpu_sync();
 
 void copy_to_gpu(argument src, argument dst);