gaoqiong / MIGraphX · Commits

Commit 6b4c86ab
Merge commit, authored Oct 04, 2022 by Paul
Parents: 8dfd08e1, f7d987ba

Changes: 44 · Showing 20 changed files with 809 additions and 130 deletions (+809 −130)
src/targets/gpu/include/migraphx/gpu/convolution.hpp (+8 −1)
src/targets/gpu/include/migraphx/gpu/miopen.hpp (+28 −0)
src/targets/gpu/include/migraphx/gpu/rocblas.hpp (+5 −1)
src/targets/gpu/jit/gathernd.cpp (+1 −1)
src/targets/gpu/jit/softmax.cpp (+5 −0)
src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp (+8 −8)
src/targets/gpu/kernels/include/migraphx/kernels/softmax.hpp (+9 −5)
src/targets/gpu/lowering.cpp (+34 −17)
src/targets/gpu/rocblas.cpp (+33 −0)
src/value.cpp (+1 −8)
test/api/test_custom_op.cpp (+43 −0)
test/api/test_custom_op_gpu.cpp (+258 −37)
test/api/test_gpu.cpp (+60 −2)
test/gpu/mlir.cpp (+0 −4)
test/gpu/pack_int8_args.cpp (+49 −46)
test/gpu/stream_sync.cpp (+146 −0)
test/py/CMakeLists.txt (+1 −0)
test/py/test_gpu_async.py (+74 −0)
test/verify/test_softmax_large3.cpp (+43 −0)
tools/accuracy/accuracy_checker.py (+3 −0)
src/targets/gpu/include/migraphx/gpu/convolution.hpp

@@ -39,6 +39,10 @@ struct miopen_convolution
     op::convolution op;
     shared<convolution_descriptor> cd = nullptr;
     miopenConvFwdAlgorithm_t algo{};
+#ifdef MIGRAPHX_HAS_FIND_2_API
+    value::binary solution_object{};
+    shared<miopen_solution> solution_ptr = nullptr;
+#endif
     uint64_t solution_id = 0;

     template <class Self, class F>
@@ -49,6 +53,9 @@ struct miopen_convolution
                  f(self.op.dilation, "dilation"),
                  f(self.op.group, "group"),
                  f(self.op.padding_mode, "padding_mode"),
+#ifdef MIGRAPHX_HAS_FIND_2_API
+                 f(self.solution_object, "solution_object"),
+#endif
                  f(self.solution_id, "solution_id"));
     }
@@ -57,7 +64,7 @@ struct miopen_convolution
     argument compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
     shape find(context& ctx, const shape& output_shape, std::vector<shape> inputs);
-    void finalize(context& ctx, const shape& output_shape, std::vector<shape> inputs);
+    void finalize(context& ctx, const shape& output_shape, const std::vector<shape>& inputs);
     std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
     {
         return shapes.size() - 1;
src/targets/gpu/include/migraphx/gpu/miopen.hpp

@@ -70,6 +70,34 @@ Result make_obj(F f, Ts... xs)
     return r;
 }

+#ifdef MIGRAPHX_HAS_FIND_2_API
+using miopen_find_options = MIGRAPHX_MANAGE_PTR(miopenFindOptions_t, miopenDestroyFindOptions);
+using miopen_problem      = MIGRAPHX_MANAGE_PTR(miopenProblem_t, miopenDestroyProblem);
+using miopen_solution     = MIGRAPHX_MANAGE_PTR(miopenSolution_t, miopenDestroySolution);
+
+inline miopen_solution find_solution(miopenHandle_t handle, miopenProblem_t problem)
+{
+    miopenSolution_t solution;
+    size_t found = 0;
+    auto status  = miopenFindSolutions(handle, problem, nullptr, &solution, &found, 1);
+    auto result  = miopen_solution{solution};
+    if(status != miopenStatusSuccess or found == 0)
+        MIGRAPHX_THROW("MIOpen miopenFindSolutions failed");
+    return result;
+}
+
+inline void set_tensor_descriptor(miopenTensorArgumentId_t name,
+                                  tensor_descriptor& desc,
+                                  miopen_problem& problem_ptr)
+{
+    auto status = miopenSetProblemTensorDescriptor(problem_ptr.get(), name, desc.get());
+    if(status != miopenStatusSuccess)
+    {
+        MIGRAPHX_THROW("setting problem tensor description failed");
+    }
+}
+#endif
+
 inline tensor_descriptor make_tensor(const migraphx::shape& os, bool pack = false)
 {
     auto s = os.normalize_standard();
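Editor's note: the helpers above are building blocks for MIOpen's Find 2.0 flow (create a problem, attach tensor descriptors, search for a solution). The sketch below shows roughly how they compose; it is not part of this commit, and it assumes MIOpen's Find 2.0 entry point miopenCreateConvolutionProblem and the miopenTensorConvolution{X,W,Y} argument ids. The actual wiring in this commit lives in miopen_convolution::find.

// Sketch only (assumed MIOpen Find 2.0 calls; the in-tree code differs).
inline miopen_solution find_conv_solution(miopenHandle_t handle,
                                          miopenConvolutionDescriptor_t conv_desc,
                                          tensor_descriptor& x,
                                          tensor_descriptor& w,
                                          tensor_descriptor& y)
{
    miopenProblem_t raw_problem;
    auto status =
        miopenCreateConvolutionProblem(&raw_problem, conv_desc, miopenProblemDirectionForward);
    if(status != miopenStatusSuccess)
        MIGRAPHX_THROW("creating convolution problem failed");
    miopen_problem problem{raw_problem}; // owned via MIGRAPHX_MANAGE_PTR
    // Attach the input, weight, and output descriptors, then search.
    set_tensor_descriptor(miopenTensorConvolutionX, x, problem);
    set_tensor_descriptor(miopenTensorConvolutionW, w, problem);
    set_tensor_descriptor(miopenTensorConvolutionY, y, problem);
    return find_solution(handle, problem.get());
}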
src/targets/gpu/include/migraphx/gpu/rocblas.hpp

@@ -23,7 +23,6 @@
  */
 #ifndef MIGRAPHX_GUARD_MIGRAPHLIB_ROCBLAS_HPP
 #define MIGRAPHX_GUARD_MIGRAPHLIB_ROCBLAS_HPP
-#include <migraphx/manage_ptr.hpp>
 #include <migraphx/config.hpp>
 #include <rocblas.h>
@@ -37,6 +36,11 @@ using rocblas_handle_ptr = MIGRAPHX_MANAGE_PTR(rocblas_handle, rocblas_destroy_handle)
 rocblas_handle_ptr create_rocblas_handle_ptr();
 rocblas_handle_ptr create_rocblas_handle_ptr(hipStream_t s);

+struct context;
+
+bool get_compute_fp32_flag();
+
+bool get_int8_x4_format(context& ctx);
+
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
src/targets/gpu/jit/gathernd.cpp

@@ -65,7 +65,7 @@ struct gathernd_compiler : compiler<gathernd_compiler>
     operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
         hip_compile_options options;
-        auto out_s = inputs.back();
+        const auto& out_s = inputs.back();
         options.set_launch_params(v, compute_global_for(ctx, out_s.elements()));
         options.inputs = inputs;
         options.output = out_s;
src/targets/gpu/jit/softmax.cpp

@@ -32,6 +32,8 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_USE_FAST_SOFTMAX)
+
 using namespace migraphx::gpu::gen; // NOLINT

 static const char* const softmax_kernel = R"__migraphx__(
@@ -81,6 +83,9 @@ struct softmax_compiler : compiler<softmax_compiler>
         options.inputs      = inputs;
         options.kernel_name = "softmax_kernel";

+        if(enabled(MIGRAPHX_USE_FAST_SOFTMAX{}))
+            options.params = "-DMIGRAPHX_USE_FAST_SOFTMAX";
+
         auto src = interpolate_string(softmax_kernel,
                                       {{"transformers", make_transformer_args(vec)},
                                        {"axis", to_string(axis)}});
src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp

@@ -197,11 +197,11 @@ struct block
     struct reducer
     {
         index idx;
-        Slicer slicer;
+        Slicer slice;

         template <class Op, class T, class Read>
         __device__ auto reduce(Op op, T init, Read read) const
         {
-            return sliced(slicer, [=](auto x, auto... xs) {
+            return sliced(slice, [=](auto x, auto... xs) {
                 return block_reduce(idx, op, init, x.get_shape().elements(), [&](auto j) {
                     return vec_reduce(read(x[j], xs[j]...), op);
                 });
@@ -218,7 +218,7 @@ struct block
         template <class F>
         __device__ auto inner(F f) const
         {
-            return sliced(slicer, [=](auto x, auto... xs) {
+            return sliced(slice, [=](auto x, auto... xs) {
                 idx.local_stride(x.get_shape().elements(), [&](auto j) { f(x[j], xs[j]...); });
             });
         }
@@ -226,7 +226,7 @@ struct block
         template <class Input>
         constexpr auto elements() const
         {
-            using reduce_type = decltype(slicer(Input{}));
+            using reduce_type = decltype(slice(Input{}));
             using value_type  = typename Input::type;
             constexpr auto relements = get_shape_c<reduce_type>{}.elements();
             if constexpr(vec_size<value_type>() > 1)
@@ -260,11 +260,11 @@ struct lane
     struct reducer
     {
         index idx;
-        Slicer slicer;
+        Slicer slice;

         template <class Op, class T, class Read>
         __device__ auto reduce(Op op, T init, Read read) const
         {
-            return sliced(slicer, [=](auto x, auto... xs) {
+            return sliced(slice, [=](auto x, auto... xs) {
                 using type = typename decltype(x)::type;
                 type r     = init;
                 for(index_int j = 0; j < x.get_shape().elements(); j++)
@@ -284,7 +284,7 @@ struct lane
         template <class F>
         __device__ auto inner(F f) const
         {
-            return sliced(slicer, [=](auto x, auto... xs) {
+            return sliced(slice, [=](auto x, auto... xs) {
                 for(index_int j = 0; j < x.get_shape().elements(); j++)
                 {
                     f(x[j], xs[j]...);
@@ -295,7 +295,7 @@ struct lane
         template <class Input>
         constexpr auto elements() const
         {
-            using reduce_type = decltype(slicer(Input{}));
+            using reduce_type = decltype(slice(Input{}));
             return get_shape_c<reduce_type>{}.elements();
         }
     };
src/targets/gpu/kernels/include/migraphx/kernels/softmax.hpp

@@ -33,11 +33,15 @@ template <index_int Axis, class Input, class Output>
 __device__ void softmax(Input input, Output output)
 {
     reduce::block::run<reduce::with_axis<Input, Axis>>([&](auto, auto r) {
-        auto batch_max = r.reduce(op::max{}, lowest{}, op::id{})(input);
-        auto batch_sum =
-            r.reduce(op::sum{}, 0, [&](auto x) { return migraphx::exp(x - batch_max); })(input);
-        r.inner([&](auto& y, auto x) { y = migraphx::exp(x - batch_max) / batch_sum; })(output,
-                                                                                        input);
+#ifdef MIGRAPHX_USE_FAST_SOFTMAX
+        const auto c = vec_at(r.slice(input)[0], 0);
+#else
+        const auto c = r.reduce(op::max{}, lowest{}, op::id{})(input);
+#endif
+        auto batch_sum = r.reduce(op::sum{}, 0, [&](auto x) {
+            return migraphx::convert<float>(migraphx::exp(x - c));
+        })(input);
+        r.inner([&](auto& y, auto x) { y = migraphx::exp(x - c) / batch_sum; })(output, input);
     });
 }
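Editor's note: the kernel rewrite above is valid because softmax is shift-invariant — for any constant c,

\[ \operatorname{softmax}(x)_i \;=\; \frac{e^{x_i}}{\sum_j e^{x_j}} \;=\; \frac{e^{x_i - c}}{\sum_j e^{x_j - c}}. \]

Choosing c = max_j x_j (the default branch) keeps every exponent non-positive, so exp cannot overflow. With MIGRAPHX_USE_FAST_SOFTMAX defined, c is taken from the first element of the reduced slice instead, which saves one block reduction per row but gives up that overflow guarantee for large inputs — presumably the regime the new test_softmax_large3 (inputs scaled by 100) later in this commit is meant to cover.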
src/targets/gpu/lowering.cpp

@@ -26,6 +26,8 @@
 #include <migraphx/manage_ptr.hpp>
 #include <migraphx/instruction.hpp>
 #include <migraphx/make_op.hpp>
+#include <migraphx/instruction_ref.hpp>
+#include <migraphx/stringutils.hpp>
 #include <migraphx/op/convolution.hpp>
 #include <migraphx/op/deconvolution.hpp>
@@ -81,26 +83,14 @@ struct miopen_apply
         (void)i;
     }

-    const std::unordered_set<std::string>& get_rocblas_fp32_archs()
-    {
-        static std::unordered_set<std::string> supported_archs{"gfx908", "gfx90a"};
-        return supported_archs;
-    }
-
     void init()
     {
         assert(mod != nullptr);
         assert(pass != nullptr);

-#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
-        auto& ctx              = get_context();
-        const auto device_name = trim(split_string(get_device_name(), ':').front());
-        if(contains(get_rocblas_fp32_archs(), device_name))
-            compute_fp32 = true;
-        rocblas_gemm_flags flag;
-        rocblas_query_int8_layout_flag(ctx.get_stream().get_rocblas(), &flag);
-        int8_x4_format = (flag == rocblas_gemm_flags_pack_int8x4);
-#endif
+        auto& ctx      = get_context();
+        int8_x4_format = get_int8_x4_format(ctx);
+        compute_fp32   = get_compute_fp32_flag();

         offload_copy = (mod->name() == "main") ? pass->offload_copy : false;
@@ -183,7 +173,8 @@ struct miopen_apply
         init();
         for(auto it = mod->begin(); it != mod->end(); it++)
         {
-            auto s = it->get_shape();
+            auto s     = it->get_shape();
+            auto attrs = it->get_operator().attributes();
             if(apply_map.count(it->name()) > 0)
             {
                 check_shape(s, apply_map.at(it->name())(it));
@@ -192,11 +183,37 @@ struct miopen_apply
             {
                 check_shape(s, insert_precompile_op(it));
             }
+            else if(attrs.contains("target"))
+            {
+                check_shape(s, insert_custom_op(it, attrs));
+            }
         }
         copy_params();
     }

+    instruction_ref insert_custom_op(instruction_ref ins, const value& attrs) const
+    {
+        const auto& custom_op = ins->get_operator();
+        if(attrs.at("target") == "cpu")
+        {
+            auto s = ins->get_shape();
+            std::vector<instruction_ref> cpu_inputs;
+            auto inputs = ins->inputs();
+            auto output = inputs.back();
+            std::transform(
+                inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) {
+                    return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in);
+                });
+            cpu_inputs.front() =
+                mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs);
+            auto cpu_out = mod->insert_instruction(ins, custom_op, cpu_inputs);
+            auto gpu_out =
+                mod->insert_instruction(ins, make_op("hip::copy_to_gpu"), cpu_out, output);
+            return mod->replace_instruction(ins, gpu_out);
+        }
+        return ins;
+    }
+
     instruction_ref insert_precompile_op(instruction_ref ins) const
     {
         auto output = insert_allocation(ins, ins->get_shape());
src/targets/gpu/rocblas.cpp

@@ -21,7 +21,13 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
+#include <unordered_set>
+#include <migraphx/ranges.hpp>
+#include <migraphx/stringutils.hpp>
+#include <migraphx/gpu/device_name.hpp>
 #include <migraphx/gpu/rocblas.hpp>
+#include <migraphx/gpu/context.hpp>
+
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -41,6 +47,33 @@ rocblas_handle_ptr create_rocblas_handle_ptr(hipStream_t s)
     return rb;
 }

+const std::unordered_set<std::string>& get_rocblas_fp32_archs()
+{
+    static std::unordered_set<std::string> supported_archs{"gfx908", "gfx90a"};
+    return supported_archs;
+}
+
+bool get_compute_fp32_flag()
+{
+    bool compute_fp32 = false;
+#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
+    const auto device_name = trim(split_string(get_device_name(), ':').front());
+    if(contains(get_rocblas_fp32_archs(), device_name))
+        compute_fp32 = true;
+#endif
+    return compute_fp32;
+}
+
+bool get_int8_x4_format(context& ctx)
+{
+    bool int8_x4_format = true;
+#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
+    rocblas_gemm_flags flag;
+    rocblas_query_int8_layout_flag(ctx.get_stream().get_rocblas(), &flag);
+    int8_x4_format = (flag == rocblas_gemm_flags_pack_int8x4);
+#endif
+    return int8_x4_format;
+}
+
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
src/value.cpp

@@ -511,14 +511,7 @@ void print_value(std::ostream& os, const std::vector<value>& x)
     os << "}";
 }

-void print_value(std::ostream& os, const value::binary& x)
-{
-    // Convert binary to integers
-    std::vector<int> v(x.begin(), x.end());
-    os << "{";
-    os << to_string_range(v);
-    os << "}";
-}
+void print_value(std::ostream& os, const value::binary& x) { os << x; }

 std::ostream& operator<<(std::ostream& os, const value& d)
 {
test/api/test_custom_op.cpp

@@ -43,6 +43,8 @@ struct sigmoid_custom_op final : migraphx::experimental_custom_op_base
         return inputs[1];
     }

+    virtual bool runs_on_offload_target() const override { return true; }
+
     virtual migraphx::shape compute_shape(migraphx::shapes inputs) const override
     {
         if(inputs.size() != 2)
@@ -111,4 +113,45 @@ TEST_CASE(run_sigmoid_with_incorrect_shape)
         "Error in compute_shape of: sigmoid_custom_op: op must have two inputs"));
 }

+struct identity_custom_op final : migraphx::experimental_custom_op_base
+{
+    virtual std::string name() const override { return "identity_custom_op"; }
+    virtual migraphx::argument
+    compute(migraphx::context, migraphx::shape, migraphx::arguments inputs) const override
+    {
+        return inputs[0];
+    }
+
+    virtual bool runs_on_offload_target() const override { return true; }
+
+    virtual migraphx::shape compute_shape(migraphx::shapes inputs) const override
+    {
+        if(inputs.size() != 1)
+        {
+            throw std::runtime_error("Identity op must have only one input");
+        }
+        return inputs.back();
+    }
+
+    virtual std::vector<size_t> output_alias(migraphx::shapes) const override { return {0, 1}; }
+};
+
+TEST_CASE(run_custom_op_with_invalid_output_alias)
+{
+    identity_custom_op i_op;
+    migraphx::register_experimental_custom_op(i_op);
+    auto op = migraphx::operation("identity_custom_op");
+    EXPECT(op.name() == "identity_custom_op");
+    migraphx::program p;
+    migraphx::shape s{migraphx_shape_float_type, {12}};
+    migraphx::module m = p.get_main_module();
+    auto x             = m.add_parameter("x", s);
+    auto i_ins         = m.add_instruction(migraphx::operation("identity_custom_op"), {x});
+    migraphx_test_private_disable_exception_catch(true);
+    EXPECT(test::throws<std::exception>(
+        [&] { p.compile(migraphx::target("ref")); },
+        "Currently, CustomOps in MIGraphX only supports one output_alias"));
+}
+
 int main(int argc, const char* argv[]) { test::run(argc, argv); }
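Editor's note: for contrast with the failing identity_custom_op test above, a custom op whose output aliases exactly one input is accepted at compile time. A minimal sketch, with the hypothetical name alias_first_input (the in-tree stride_two op in test_custom_op_gpu.cpp below does essentially the same thing):

// Sketch only: a custom op with a valid single-entry output_alias. Returning
// {0} tells MIGraphX the output aliases input buffer 0, so no separate output
// allocation is needed; {0, 1} (as in identity_custom_op above) is rejected.
struct alias_first_input final : migraphx::experimental_custom_op_base
{
    virtual std::string name() const override { return "alias_first_input"; }
    virtual migraphx::argument
    compute(migraphx::context, migraphx::shape, migraphx::arguments inputs) const override
    {
        return inputs[0];
    }
    virtual migraphx::shape compute_shape(migraphx::shapes inputs) const override
    {
        return inputs.front();
    }
    virtual bool runs_on_offload_target() const override { return true; }
    virtual std::vector<size_t> output_alias(migraphx::shapes) const override { return {0}; }
};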
test/api/test_custom_op_gpu.cpp

@@ -24,40 +24,89 @@
 #include <hip/hip_runtime_api.h>
 #include <migraphx/migraphx.h>
 #include <migraphx/migraphx.hpp>
+#include <numeric>
+#include <stdexcept>
 #include "test.hpp"

 #define MIGRAPHX_HIP_ASSERT(x) (EXPECT(x == hipSuccess))

-struct simple_custom_op final : migraphx::experimental_custom_op_base
+struct half_copy_host final : migraphx::experimental_custom_op_base
 {
-    virtual std::string name() const override { return "simple_custom_op"; }
+    virtual std::string name() const override { return "half_copy_host"; }
+    virtual bool runs_on_offload_target() const override { return false; }
     virtual migraphx::argument
     compute(migraphx::context ctx, migraphx::shape, migraphx::arguments inputs) const override
     {
-        // sets the first half of the input bytes to 0; the rest of the bytes are copied.
-        int* h_output    = nullptr;
-        auto* d_output   = reinterpret_cast<int*>(inputs[0].data());
-        auto input_bytes = inputs[0].get_shape().bytes();
-        auto* output_ptr = inputs[1].data();
-        auto copy_bytes  = input_bytes / 2;
-        MIGRAPHX_HIP_ASSERT(hipSetDevice(0));
-        MIGRAPHX_HIP_ASSERT(hipHostMalloc(&h_output, input_bytes));
-        MIGRAPHX_HIP_ASSERT(hipMemcpyAsync(
-            h_output, d_output, input_bytes, hipMemcpyDeviceToHost, ctx.get_queue<hipStream_t>()));
-        MIGRAPHX_HIP_ASSERT(hipDeviceSynchronize());
-        MIGRAPHX_HIP_ASSERT(hipMemset(h_output, 0, copy_bytes));
-        MIGRAPHX_HIP_ASSERT(hipMemcpy(output_ptr, h_output, input_bytes, hipMemcpyHostToDevice));
-        MIGRAPHX_HIP_ASSERT(hipHostFree(h_output));
+        // This custom op sets the first half of the input bytes to 0 and copies the rest.
+        // It does its computation on the host, so `runs_on_offload_target()` returns false;
+        // based on that flag, MIGraphX injects the necessary buffer copies between GPU and
+        // host for the input buffers as well as the output buffers.
+        auto* input_buffer_ptr  = inputs[0].data();
+        auto* output_buffer_ptr = inputs[1].data();
+        auto input_bytes        = inputs[0].get_shape().bytes();
+        auto copy_bytes         = input_bytes / 2;
+        MIGRAPHX_HIP_ASSERT(hipSetDevice(0));
+        MIGRAPHX_HIP_ASSERT(hipMemcpyAsync(output_buffer_ptr,
+                                           input_buffer_ptr,
+                                           input_bytes,
+                                           hipMemcpyHostToHost,
+                                           ctx.get_queue<hipStream_t>()));
+        MIGRAPHX_HIP_ASSERT(hipDeviceSynchronize());
+        MIGRAPHX_HIP_ASSERT(hipMemset(output_buffer_ptr, 0, copy_bytes));
+        MIGRAPHX_HIP_ASSERT(hipDeviceSynchronize());
         return inputs[1];
     }
     virtual migraphx::shape compute_shape(migraphx::shapes inputs) const override
     {
-        if(not inputs[0].standard())
+        if(not inputs[0].standard() or not inputs[1].standard())
         {
-            throw std::runtime_error("first arg must be standard shaped");
+            throw std::runtime_error("Input args must be standard shaped");
         }
         if(inputs.size() != 2)
         {
             throw std::runtime_error("number of inputs must be 2");
         }
         return inputs.back();
     }
 };

+struct half_copy_device final : migraphx::experimental_custom_op_base
+{
+    virtual std::string name() const override { return "half_copy_device"; }
+    virtual bool runs_on_offload_target() const override { return true; }
+    virtual migraphx::argument
+    compute(migraphx::context ctx, migraphx::shape, migraphx::arguments inputs) const override
+    {
+        // Same half-copy behavior, but the computation happens on the GPU, so
+        // `runs_on_offload_target()` returns true.
+        auto* input_buffer_ptr  = inputs[0].data();
+        auto* output_buffer_ptr = inputs[1].data();
+        auto input_bytes        = inputs[0].get_shape().bytes();
+        auto copy_bytes         = input_bytes / 2;
+        MIGRAPHX_HIP_ASSERT(hipSetDevice(0));
+        MIGRAPHX_HIP_ASSERT(hipMemcpyAsync(output_buffer_ptr,
+                                           input_buffer_ptr,
+                                           input_bytes,
+                                           hipMemcpyDeviceToDevice,
+                                           ctx.get_queue<hipStream_t>()));
+        MIGRAPHX_HIP_ASSERT(hipDeviceSynchronize());
+        MIGRAPHX_HIP_ASSERT(hipMemset(output_buffer_ptr, 0, copy_bytes));
+        MIGRAPHX_HIP_ASSERT(hipDeviceSynchronize());
+        return inputs[1];
+    }
+    virtual migraphx::shape compute_shape(migraphx::shapes inputs) const override
+    {
+        if(not inputs[0].standard() or not inputs[1].standard())
+        {
+            throw std::runtime_error("Input args must be standard shaped");
+        }
+        if(inputs.size() != 2)
+        {
+            throw std::runtime_error("number of inputs must be 2");
+        }
+        return inputs.back();
+    }
+};
@@ -67,36 +116,208 @@ struct simple_custom_op final : migraphx::experimental_custom_op_base

-TEST_CASE(run_simple_custom_op)
+// overwrites the input buffer
+struct half_copy_device_same_buffer final : migraphx::experimental_custom_op_base
+{
+    virtual std::string name() const override { return "half_copy_device_same_buffer"; }
+    virtual bool runs_on_offload_target() const override { return true; }
+    virtual migraphx::argument
+    compute(migraphx::context, migraphx::shape, migraphx::arguments inputs) const override
+    {
+        // Same half-copy behavior computed on the device, but done in place on the
+        // input buffer instead of a separate output allocation.
+        auto* buffer_ptr = inputs[0].data();
+        auto input_bytes = inputs[0].get_shape().bytes();
+        auto copy_bytes  = input_bytes / 2;
+        MIGRAPHX_HIP_ASSERT(hipSetDevice(0));
+        MIGRAPHX_HIP_ASSERT(hipMemset(buffer_ptr, 0, copy_bytes));
+        MIGRAPHX_HIP_ASSERT(hipDeviceSynchronize());
+        return inputs[0];
+    }
+    virtual migraphx::shape compute_shape(migraphx::shapes inputs) const override
+    {
+        if(not inputs[0].standard())
+        {
+            throw std::runtime_error("Input arg must be standard shaped");
+        }
+        return inputs.front();
+    }
+};
+
+TEST_CASE(register_half_copy_op)
+{
+    half_copy_host hch;
+    migraphx::register_experimental_custom_op(hch);
+    auto op = migraphx::operation("half_copy_host");
+    EXPECT(op.name() == "half_copy_host");
+
+    half_copy_device hcd;
+    migraphx::register_experimental_custom_op(hcd);
+    op = migraphx::operation("half_copy_device");
+    EXPECT(op.name() == "half_copy_device");
+
+    half_copy_device_same_buffer hcdsb;
+    migraphx::register_experimental_custom_op(hcdsb);
+    op = migraphx::operation("half_copy_device_same_buffer");
+    EXPECT(op.name() == "half_copy_device_same_buffer");
+}
+
+TEST_CASE(half_copy_custom_op_test)
+{
+    auto run_test_prog = [](const std::string& op_name, bool buffer_alloc) {
+        migraphx::program p;
+        migraphx::module m = p.get_main_module();
+        migraphx::shape s{migraphx_shape_float_type, {4, 3}};
+        auto x                        = m.add_parameter("x", s);
+        migraphx::instructions inputs = {x};
+        if(buffer_alloc)
+        {
+            auto alloc = m.add_allocation(s);
+            inputs     = {x, alloc};
+        }
+        auto half_copy_ins = m.add_instruction(migraphx::operation(op_name.c_str()), inputs);
+        m.add_return({half_copy_ins});
+        migraphx::compile_options options;
+        options.set_offload_copy();
+        p.compile(migraphx::target("gpu"), options);
+        migraphx::program_parameters pp;
+        std::vector<float> x_data(12);
+        std::iota(x_data.begin(), x_data.end(), 0);
+        pp.add("x", migraphx::argument(s, x_data.data()));
+        auto results    = p.eval(pp);
+        auto result     = results[0];
+        auto result_vec = result.as_vector<float>();
+        std::vector<float> expected_result(12, 0);
+        std::iota(expected_result.begin() + 6, expected_result.end(), 6);
+        EXPECT(bool{result == migraphx::argument(s, expected_result.data())});
+    };
+    // register all the ops
+    half_copy_host hch;
+    migraphx::register_experimental_custom_op(hch);
+    half_copy_device hcd;
+    migraphx::register_experimental_custom_op(hcd);
+    half_copy_device_same_buffer hcdsb;
+    migraphx::register_experimental_custom_op(hcdsb);
+    std::vector<std::pair<std::string, bool>> tests_config = {
+        {"half_copy_host", true},
+        {"half_copy_device", true},
+        {"half_copy_device_same_buffer", false}};
+    for(const auto& i : tests_config)
+    {
+        run_test_prog(i.first, i.second);
+    }
+}

 struct stride_two final : migraphx::experimental_custom_op_base
 {
     virtual std::string name() const override { return "stride_two"; }
     virtual migraphx::argument
     compute(migraphx::context, migraphx::shape out_shape, migraphx::arguments inputs) const override
     {
         return {out_shape, inputs[0].data()};
     }
     virtual migraphx::shape compute_shape(migraphx::shapes inputs) const override
     {
         if(inputs.size() != 1)
         {
             throw std::runtime_error("stride_two op must have only one input argument");
         };
         if(not inputs[0].standard())
         {
             throw std::runtime_error("stride_two op only works on the standard input shapes");
         }
         migraphx::shape input_s      = inputs[0];
         std::vector<size_t> dims     = input_s.lengths();
         std::vector<size_t> new_dims;
         std::vector<size_t> strides  = input_s.strides();
         std::vector<size_t> new_strides;
         std::for_each(dims.begin(), dims.end(), [&](auto i) { new_dims.push_back(i / 2); });
         std::for_each(
             strides.begin(), strides.end(), [&](auto i) { new_strides.push_back(i * 2); });
         migraphx::shape output_shape{input_s.type(), new_dims, new_strides};
         return output_shape;
     }
     virtual bool runs_on_offload_target() const override { return true; }
     virtual std::vector<size_t> output_alias(migraphx::shapes) const override { return {0}; };
 };

 TEST_CASE(stride_two_custom_op_test)
 {
-    simple_custom_op simple_op;
-    migraphx::register_experimental_custom_op(simple_op);
     stride_two st;
     migraphx::register_experimental_custom_op(st);
     migraphx::program p;
     migraphx::module m = p.get_main_module();
     migraphx::shape s{migraphx_shape_float_type, {4, 4, 4}};
     auto x              = m.add_parameter("x", s);
     auto stride_two_ins = m.add_instruction(migraphx::operation("stride_two"), {x});
     m.add_return({stride_two_ins});
     migraphx::compile_options options;
     options.set_offload_copy();
     p.compile(migraphx::target("gpu"), options);
     migraphx::program_parameters pp;
     std::vector<float> x_data(64);
     std::iota(x_data.begin(), x_data.end(), 0);
     pp.add("x", migraphx::argument(s, x_data.data()));
     auto results    = p.eval(pp);
     auto result     = results[0];
     auto result_vec = result.as_vector<float>();
     std::vector<float> expected_result = {0, 2, 8, 10, 32, 34, 40, 42};
     EXPECT(result_vec == expected_result);
 }

 TEST_CASE(custom_op_with_pre_and_post_subgraph_test)
 {
-    simple_custom_op simple_op;
-    migraphx::register_experimental_custom_op(simple_op);
+    half_copy_host hco;
+    migraphx::register_experimental_custom_op(hco);
     stride_two st;
     migraphx::register_experimental_custom_op(st);
     migraphx::program p;
-    migraphx::shape s{migraphx_shape_int32_type, {4, 3}};
-    migraphx::shape trans_shape{migraphx_shape_int32_type, {3, 4}};
+    migraphx::shape s{migraphx_shape_float_type, {4, 6}};
     migraphx::module m = p.get_main_module();
     auto x = m.add_parameter("x", s);
-    auto neg           = m.add_instruction(migraphx::operation("neg"), x);
-    auto alloc         = m.add_allocation(trans_shape);
-    auto neg_trans =
-        m.add_instruction(migraphx::operation("transpose", "{permutation: [1, 0]}"), {neg});
-    auto neg_cont      = m.add_instruction(migraphx::operation("contiguous"), {neg_trans});
-    auto custom_kernel =
-        m.add_instruction(migraphx::operation("simple_custom_op"), {neg_cont, alloc});
-    auto relu = m.add_instruction(migraphx::operation("relu"), custom_kernel);
-    m.add_return({relu});
+    // pre-subgraph
+    auto neg_ins = m.add_instruction(migraphx::operation("neg"), x);
+    auto trans_ins =
+        m.add_instruction(migraphx::operation("transpose", "{permutation: [1, 0]}"), {neg_ins});
+    auto cont_ins = m.add_instruction(migraphx::operation("contiguous"), {trans_ins});
+    // custom_op
+    migraphx::shape trans_shape{migraphx_shape_float_type, {6, 4}};
+    auto alloc = m.add_allocation(trans_shape);
+    auto half_copy_ins =
+        m.add_instruction(migraphx::operation("half_copy_host"), {cont_ins, alloc});
+    // post-subgraph
+    auto abs_ins = m.add_instruction(migraphx::operation("abs"), {half_copy_ins});
+    // another custom_op
+    auto stride_two_ins = m.add_instruction(migraphx::operation("stride_two"), {abs_ins});
+    // post-subgraph
+    auto relu_ins = m.add_instruction(migraphx::operation("relu"), {stride_two_ins});
+    m.add_return({relu_ins});
     migraphx::compile_options options;
     options.set_offload_copy();
     p.compile(migraphx::target("gpu"), options);
     migraphx::program_parameters pp;
-    std::vector<int> x_data(12, -3);
+    std::vector<float> x_data(s.elements());
+    std::iota(x_data.begin(), x_data.end(), 0);
     pp.add("x", migraphx::argument(s, x_data.data()));
-    auto results    = p.eval(pp);
-    auto result     = results[0];
-    auto result_vec = result.as_vector<int>();
-    std::vector<int> expected_result(12, 0);
-    std::fill(expected_result.begin() + 6, expected_result.end(), 3);
-    EXPECT(bool{result == migraphx::argument(trans_shape, expected_result.data())});
+    auto results    = p.eval(pp);
+    auto result     = results[0];
+    auto result_vec = result.as_vector<float>();
+    std::vector<float> expected_result = {0, 0, 0, 0, 4, 16};
+    EXPECT(bool{result ==
+                migraphx::argument(migraphx::shape{migraphx_shape_float_type, {3, 2}},
+                                   expected_result.data())});
 }

 int main(int argc, const char* argv[]) { test::run(argc, argv); }
test/api/test_gpu.cpp

@@ -25,6 +25,8 @@
 #include <hip/hip_runtime_api.h>
 #include <migraphx/migraphx.h>
 #include <migraphx/migraphx.hpp>
+#include <migraphx/manage_ptr.hpp>
+
 #include "test.hpp"

 TEST_CASE(load_and_run)
@@ -44,11 +46,67 @@ TEST_CASE(load_and_run)
     {
         pp.add(name, migraphx::argument::generate(param_shapes[name]));
     }
     auto outputs = p.eval(pp);
     CHECK(shapes_before.size() == outputs.size());
     CHECK(bool{shapes_before.front() == outputs.front().get_shape()});
 }

+using hip_ptr    = MIGRAPHX_MANAGE_PTR(void, hipFree);
+using stream_ptr = MIGRAPHX_MANAGE_PTR(hipStream_t, hipStreamDestroy);
+
+stream_ptr get_stream()
+{
+    hipStream_t stream;
+    auto err = hipStreamCreateWithFlags(&stream, 0);
+    EXPECT(err == hipSuccess);
+    return stream_ptr{stream};
+}
+
+hip_ptr get_hip_buffer(size_t size)
+{
+    void* ptr;
+    auto err = hipMalloc(&ptr, size);
+    EXPECT(err == hipSuccess);
+    return hip_ptr{ptr};
+}
+
+TEST_CASE(load_and_run_async)
+{
+    auto p             = migraphx::parse_onnx("conv_relu_maxpool_test.onnx");
+    auto shapes_before = p.get_output_shapes();
+    migraphx::compile_options options;
+    options.set_offload_copy(false);
+    p.compile(migraphx::target("gpu"), options);
+    auto shapes_after = p.get_output_shapes();
+    CHECK(shapes_before.size() == 1);
+    CHECK(shapes_before.size() == shapes_after.size());
+    CHECK(bool{shapes_before.front() == shapes_after.front()});
+    migraphx::program_parameters pp;
+    auto param_shapes = p.get_parameter_shapes();
+    stream_ptr stream = get_stream();
+    std::vector<hip_ptr> buffs;
+    std::vector<migraphx::argument> args;
+    for(auto&& name : param_shapes.names())
+    {
+        args.push_back(migraphx::argument::generate(param_shapes[name]));
+        buffs.push_back(get_hip_buffer(args.rbegin()->get_shape().bytes()));
+        auto err = hipMemcpy(buffs.rbegin()->get(),
+                             args.rbegin()->data(),
+                             args.rbegin()->get_shape().bytes(),
+                             hipMemcpyHostToDevice);
+        EXPECT(err == hipSuccess);
+        pp.add(name, migraphx::argument(args.rbegin()->get_shape(), buffs.rbegin()->get()));
+    }
+    auto outputs = p.run_async(pp, stream.get());
+    CHECK(shapes_before.size() == outputs.size());
+    CHECK(bool{shapes_before.front() == outputs.front().get_shape()});
+}
+
 TEST_CASE(load_and_run_ctx)
 {
     auto p = migraphx::parse_onnx("conv_relu_maxpool_test.onnx");
@@ -82,10 +140,10 @@ TEST_CASE(if_pl_test)
     migraphx::program_parameters pp;
     auto param_shapes = p.get_parameter_shapes();
     auto xs           = param_shapes["x"];
-    std::vector<float> xd(xs.bytes() / sizeof(float), 1.0);
+    std::vector<float> xd(xs.elements(), 1.0);
     pp.add("x", migraphx::argument(xs, xd.data()));
     auto ys = param_shapes["y"];
-    std::vector<float> yd(ys.bytes() / sizeof(float), 2.0);
+    std::vector<float> yd(ys.elements(), 2.0);
     pp.add("y", migraphx::argument(ys, yd.data()));
     char ccond = cond;
     pp.add("cond", migraphx::argument(param_shapes["cond"], &ccond));
test/gpu/mlir.cpp

@@ -37,10 +37,6 @@
 #include <migraphx/functional.hpp>
 #include <test.hpp>

-using migraphx::trim;
-
-// m test_gpu_mlir && ./bin/test_gpu_mlir
-
 struct mlir_gpu_target : migraphx::gpu::target
 {
     std::string name() const { return "mlir"; }
test/gpu/pack_int8_args.cpp

@@ -30,6 +30,7 @@
 #include <migraphx/adjust_allocation.hpp>
 #include <migraphx/gpu/pack_int8_args.hpp>
 #include <migraphx/gpu/rocblas.hpp>
+#include <migraphx/gpu/device_name.hpp>
 #include <migraphx/auto_contiguous.hpp>
 #include <migraphx/dead_code_elimination.hpp>
 #include <migraphx/replace_allocate.hpp>
@@ -43,9 +44,8 @@
 // Treat some operators as compilable to enable lowering
 MIGRAPHX_GPU_TEST_PRECOMPILE("add", "mul", "convert")

-void run_passes(migraphx::module& m)
+void run_passes(migraphx::module& m, migraphx::gpu::context& ctx)
 {
-    auto ctx = migraphx::gpu::context{};
     migraphx::run_passes(m,
                          {migraphx::auto_contiguous{},
                           migraphx::gpu::lowering{&ctx, false},
@@ -56,18 +56,6 @@ void run_passes(migraphx::module& m)
                           migraphx::dead_code_elimination{}});
 }

-bool get_int8_x4_format()
-{
-    bool int8_x4_format = true;
-#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
-    auto ctx = migraphx::gpu::context{};
-    rocblas_gemm_flags flag;
-    rocblas_query_int8_layout_flag(ctx.get_stream().get_rocblas(), &flag);
-    int8_x4_format = (flag == rocblas_gemm_flags_pack_int8x4);
-#endif
-    return int8_x4_format;
-}
-
 TEST_CASE(quant_dot)
 {
     auto create_module = [] {
@@ -106,11 +94,13 @@ TEST_CASE(quant_dot)
             migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(m2_shape)}}));
         packa = m.add_instruction(migraphx::make_op("gpu::int8_gemm_pack_a"), l2, alloc);
     }
-    auto gemm = m.add_instruction(
-        migraphx::make_op("gpu::quant_gemm", {{"int8_x4_format", int8_x4}}), l1, packa, gemm_alloc);
+    auto gemm = m.add_instruction(
+        migraphx::make_op("gpu::quant_gemm",
+                          {{"int8_x4_format", int8_x4},
+                           {"compute_fp32", migraphx::gpu::get_compute_fp32_flag()}}),
+        l1, packa, gemm_alloc);
     auto beta_broadcast = m.add_instruction(
         migraphx::make_op("multibroadcast", {{"out_lens", m3_shape.lens()}}), beta);
@@ -127,11 +117,12 @@ TEST_CASE(quant_dot)
         return m;
     };
-    auto m1 = create_module();
-    run_passes(m1);
-    bool flag = get_int8_x4_format();
-    auto m2   = create_optimized_int8_x4(flag);
+    auto m1  = create_module();
+    auto ctx = migraphx::gpu::context{};
+    run_passes(m1, ctx);
+    bool int8_x4 = migraphx::gpu::get_int8_x4_format(ctx);
+    auto m2      = create_optimized_int8_x4(int8_x4);
     EXPECT(m1 == m2);
 }
@@ -216,21 +207,24 @@ TEST_CASE(quant_dot_trans)
         packb = m.add_instruction(migraphx::make_op("gpu::int8_gemm_pack_a"), contb, allocpb);
     }
-    auto gemm = m.add_instruction(
-        migraphx::make_op("gpu::quant_gemm", {{"int8_x4_format", int8_x4}}),
-        tl1_alpha_int8, packb, output);
+    auto gemm = m.add_instruction(
+        migraphx::make_op("gpu::quant_gemm",
+                          {{"int8_x4_format", int8_x4},
+                           {"compute_fp32", migraphx::gpu::get_compute_fp32_flag()}}),
+        tl1_alpha_int8, packb, output);
     m.add_return({gemm});
     return m;
     };
-    auto m1   = create_module();
-    bool flag = get_int8_x4_format();
-    auto m2   = create_optimized_int8_x4(flag);
-    run_passes(m1);
+    auto m1  = create_module();
+    auto ctx = migraphx::gpu::context{};
+    run_passes(m1, ctx);
+    bool int8_x4 = migraphx::gpu::get_int8_x4_format(ctx);
+    auto m2      = create_optimized_int8_x4(int8_x4);
     EXPECT(m1 == m2);
 }
@@ -297,11 +291,13 @@ TEST_CASE(quant_dot_pad)
         packa = m.add_instruction(migraphx::make_op("gpu::int8_gemm_pack_a"), pl2, alloc);
     }
-    auto gemm = m.add_instruction(
-        migraphx::make_op("gpu::quant_gemm", {{"int8_x4_format", int8_x4}}), pl1, packa, gemm_alloc);
+    auto gemm = m.add_instruction(
+        migraphx::make_op("gpu::quant_gemm",
+                          {{"int8_x4_format", int8_x4},
+                           {"compute_fp32", migraphx::gpu::get_compute_fp32_flag()}}),
+        pl1, packa, gemm_alloc);
     auto beta_broadcast = m.add_instruction(
         migraphx::make_op("multibroadcast", {{"out_lens", s3.lens()}}), beta);
@@ -317,11 +313,12 @@ TEST_CASE(quant_dot_pad)
         return m;
     };
-    auto m1   = create_module();
-    bool flag = get_int8_x4_format();
-    auto m2   = create_optimized_int8_x4(flag);
-    run_passes(m1);
+    auto m1  = create_module();
+    auto ctx = migraphx::gpu::context{};
+    run_passes(m1, ctx);
+    bool int8_x4 = migraphx::gpu::get_int8_x4_format(ctx);
+    auto m2      = create_optimized_int8_x4(int8_x4);
     EXPECT(m1 == m2);
 }
@@ -444,17 +441,23 @@ TEST_CASE(quant_dot_trans_pad)
     }
-    auto gemm = m.add_instruction(
-        migraphx::make_op("gpu::quant_gemm", {{"int8_x4_format", int8_x4}}), pa, packb, output);
+    auto gemm = m.add_instruction(
+        migraphx::make_op("gpu::quant_gemm",
+                          {{"int8_x4_format", int8_x4},
+                           {"compute_fp32", migraphx::gpu::get_compute_fp32_flag()}}),
+        pa, packb, output);
     m.add_return({gemm});
     return m;
     };
-    auto m1   = create_module();
-    bool flag = get_int8_x4_format();
-    auto m2   = create_optimized_int8_x4(flag);
-    run_passes(m1);
+    auto m1  = create_module();
+    auto ctx = migraphx::gpu::context{};
+    run_passes(m1, ctx);
+    bool int8_x4 = migraphx::gpu::get_int8_x4_format(ctx);
+    auto m2      = create_optimized_int8_x4(int8_x4);
     EXPECT(m1 == m2);
 }
test/gpu/stream_sync.cpp (new file, 0 → 100644)
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <iostream>
#include <vector>
#include <migraphx/gpu/context.hpp>
#include <migraphx/context.hpp>
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/gpu/kernel.hpp>
#include <migraphx/gpu/device_name.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/program.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/module.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/gpu/target.hpp>
#include "test.hpp"
using hip_stream_ptr = MIGRAPHX_MANAGE_PTR(hipStream_t, hipStreamDestroy);

constexpr uint32_t stream_sync_test_val = 1337;

// NOLINTNEXTLINE
const std::string compare_numbers = R"__migraphx__(
#include <hip/hip_runtime.h>

extern "C" {
__global__ void compare(float* data)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (data[i] != 1337)
    {
        abort();
    }
}
}

int main() {}
)__migraphx__";

migraphx::src_file make_src_file(const std::string& name, const std::string& content)
{
    return {name, std::make_pair(content.data(), content.data() + content.size())};
}

hip_stream_ptr get_stream()
{
    hipStream_t stream;
    auto status = hipStreamCreate(&stream);
    if(status != hipSuccess)
    {
        MIGRAPHX_THROW("Failed to get stream");
    }
    return hip_stream_ptr{stream};
}

TEST_CASE(test_stream_sync_compare_kernel)
{
    auto binaries = migraphx::gpu::compile_hip_src(
        {make_src_file("check_stuff.cpp", compare_numbers)}, "", migraphx::gpu::get_device_name());
    EXPECT(binaries.size() == 1);

    migraphx::gpu::kernel k1{binaries.front(), "compare"};
    auto input =
        migraphx::fill_argument({migraphx::shape::float_type, {128}}, stream_sync_test_val);
    auto ginput = migraphx::gpu::to_gpu(input);

    hip_stream_ptr pstream = get_stream();
    k1.launch(pstream.get(), input.get_shape().elements(), 1024)(ginput.cast<float>());
    auto output = migraphx::gpu::from_gpu(ginput);
    EXPECT(output == input);
}

TEST_CASE(test_stream_sync)
{
    auto binaries = migraphx::gpu::compile_hip_src(
        {make_src_file("check_stuff.cpp", compare_numbers)}, "", migraphx::gpu::get_device_name());
    EXPECT(binaries.size() == 1);
    migraphx::gpu::kernel k1{binaries.front(), "compare"};

    const unsigned int m = 128;
    const unsigned int k = 8192;

    // Setup empty GPU memory buffers
    migraphx::shape input_shape{migraphx::shape::float_type, {m, k}};
    migraphx::shape output_shape{migraphx::shape::float_type, {m, m}};
    auto input   = migraphx::fill_argument(input_shape, 0);
    auto ginput  = migraphx::gpu::to_gpu(input);
    auto output  = migraphx::fill_argument(output_shape, 0);
    auto goutput = migraphx::gpu::to_gpu(output);

    hip_stream_ptr pstream = get_stream();

    migraphx::program p;
    auto* mm = p.get_main_module();
    auto x   = mm->add_parameter("x", migraphx::shape{migraphx::shape::float_type, {m, k}});
    auto y   = mm->add_literal(
        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {k, m}}));
    std::vector<float> data(m * m, stream_sync_test_val);
    auto test_val = mm->add_literal(output_shape, data);
    auto mult_out = mm->add_instruction(migraphx::make_op("dot"), x, y);
    mm->add_instruction(migraphx::make_op("add"), mult_out, test_val);
    p.compile(migraphx::gpu::target{});

    // Run the network and then verify with the kernel
    auto args = p.eval({{"x", ginput}, {"output", goutput}}, {pstream.get(), true});
    k1.launch(pstream.get(), m * m, 1024)(goutput.cast<float>());
    output = migraphx::gpu::from_gpu(goutput);
    EXPECT(output != input);
}

int main(int argc, const char* argv[]) { test::run(argc, argv); }
test/py/CMakeLists.txt

@@ -56,4 +56,5 @@ add_py_test(gpu_offload test_gpu_offload.py WORKING_DIRECTORY ${TEST_ONNX_DIR})
 add_py_test(gpu test_gpu.py WORKING_DIRECTORY ${TEST_ONNX_DIR})
 add_py_test(array test_array.py WORKING_DIRECTORY ${TEST_ONNX_DIR})
 add_py_test(backend onnx_backend_test.py WORKING_DIRECTORY ${TEST_ONNX_DIR})
+add_py_test(gpu_async test_gpu_async.py WORKING_DIRECTORY ${TEST_ONNX_DIR})
 endif()
test/py/test_gpu_async.py (new file, 0 → 100644)
#####################################################################################
# The MIT License (MIT)
#
# Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#####################################################################################
import migraphx
import ctypes


def test_conv_relu():
    hip = ctypes.cdll.LoadLibrary("libamdhip64.so")

    p = migraphx.parse_onnx("conv_relu_maxpool_test.onnx")
    print(p)
    print("Compiling ...")
    # Need offload_copy = False to avoid syncs back to the host device
    p.compile(migraphx.get_target("gpu"), offload_copy=False)
    print(p)
    params = {}

    # Default value in the API for hipSuccess, which is always 0
    hipSuccess = ctypes.c_long(0)

    # Alloc a stream
    stream = ctypes.c_void_p()
    err = ctypes.c_long(
        hip.hipStreamCreateWithFlags(ctypes.byref(stream), ctypes.c_uint(0)))
    if err.value != hipSuccess.value:
        print("FAILED hipStreamCreate")
        return err

    # Use to_gpu to push generated arguments to the GPU before the run
    for key, value in p.get_parameter_shapes().items():
        params[key] = migraphx.to_gpu(migraphx.generate_argument(value))

    result = migraphx.from_gpu(
        p.run_async(params, stream.value, "ihipStream_t")[-1])

    # Wait for all commands in the stream to complete
    err = ctypes.c_long(hip.hipStreamSynchronize(stream))
    if err.value != hipSuccess.value:
        print("FAILED: hipStreamSynchronize")
        return err

    # Cleanup stream
    err = ctypes.c_long(hip.hipStreamDestroy(stream))
    if err.value != hipSuccess.value:
        print("FAILED: hipStreamDestroy")
        return err

    print(result)


test_conv_relu()
test/verify/test_softmax_large3.cpp (new file, 0 → 100644)
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include "verify_program.hpp"
#include <migraphx/program.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/common.hpp>
struct test_softmax_large3 : verify_program<test_softmax_large3>
{
    migraphx::program create_program() const
    {
        migraphx::program p;
        auto* mm = p.get_main_module();
        auto x =
            mm->add_parameter("x", migraphx::shape{migraphx::shape::float_type, {2, 4}});
        auto large = mm->add_literal({migraphx::shape{migraphx::shape::float_type}, {100}});
        auto add   = migraphx::add_common_op(*mm, migraphx::make_op("mul"), {x, large});
        mm->add_instruction(migraphx::make_op("softmax", {{"axis", -1}}), add);
        return p;
    }
};
tools/accuracy/accuracy_checker.py

@@ -116,6 +116,9 @@ def main():
     model = migraphx.parse_onnx(model_name, default_dim_value=batch)
     if args.verbose:
         print(model)
+
+    model.compile(migraphx.get_target('gpu'), offload_copy=False)
+
     params = {}