Commit 5af9aac0
Authored Feb 21, 2023 by charlie

Merge branch 'dyn_batch_pass' of github.com:ROCmSoftwarePlatform/AMDMIGraphX into dyn_test_runner

Parents: 7b2516e0, 05e81ed3
Changes: 147

Showing 20 changed files with 605 additions and 171 deletions (+605 -171)
src/targets/gpu/kernels/include/migraphx/kernels/gathernd.hpp       +13  -19
src/targets/gpu/kernels/include/migraphx/kernels/hip.hpp            +8   -5
src/targets/gpu/kernels/include/migraphx/kernels/index.hpp          +71  -12
src/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp      +9   -10
src/targets/gpu/kernels/include/migraphx/kernels/math.hpp           +2   -3
src/targets/gpu/kernels/include/migraphx/kernels/ops.hpp            +22  -3
src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp         +292 -51
src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp          +0   -8
src/targets/gpu/kernels/include/migraphx/kernels/softmax.hpp        +9   -7
src/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp    +24  -0
src/targets/gpu/kernels/include/migraphx/kernels/types.hpp          +39  -2
src/targets/gpu/lowering.cpp                                        +29  -2
src/targets/gpu/prefuse_ops.cpp                                     +13  -6
src/targets/gpu/target.cpp                                          +7   -10
test/CMakeLists.txt                                                 +38  -26
test/api/test_gpu.cpp                                               +1   -0
test/api/test_save_load.cpp                                         +0   -1
test/gpu/jit.cpp                                                    +5   -3
test/memory_coloring_test.cpp                                       +22  -3
test/onnx/.onnxrt-commit                                            +1   -0
src/targets/gpu/kernels/include/migraphx/kernels/gathernd.hpp

@@ -26,7 +26,7 @@
 #include <migraphx/kernels/index.hpp>
 #include <migraphx/kernels/algorithm.hpp>
+#include <migraphx/kernels/ops.hpp>
 
 namespace migraphx {
 
 template <class T>
@@ -53,22 +53,16 @@ __device__ void gathernd(const T& data_t, const U& indices_t, const V& output_t,
     auto indices_shape_lens = indices_shape.lens;
     auto data_shape_lens    = data_shape.lens;
     auto num_slice_dims     = indices_shape_lens.back();
-    std::size_t num_slices  = accumulate(indices_shape_lens.begin(),
-                                         indices_shape_lens.end() - 1,
-                                         1,
-                                         std::multiplies<std::size_t>());
-    std::size_t slice_size  = accumulate(data_shape_lens.begin() + num_slice_dims + batch_dims,
-                                         data_shape_lens.end(),
-                                         1,
-                                         std::multiplies<std::size_t>());
-    const std::size_t num_batches = accumulate(data_shape_lens.begin(),
-                                               data_shape_lens.begin() + batch_dims,
-                                               1,
-                                               std::multiplies<std::size_t>());
-    const std::size_t data_batch_stride = accumulate(data_shape_lens.begin() + batch_dims,
-                                                     data_shape_lens.end(),
-                                                     1,
-                                                     std::multiplies<std::size_t>());
+    std::size_t num_slices  = accumulate(
+        indices_shape_lens.begin(), indices_shape_lens.end() - 1, 1, op::product{});
+    std::size_t slice_size = accumulate(
+        data_shape_lens.begin() + num_slice_dims + batch_dims, data_shape_lens.end(), 1, op::product{});
+    const std::size_t num_batches = accumulate(
+        data_shape_lens.begin(), data_shape_lens.begin() + batch_dims, 1, op::product{});
+    const std::size_t data_batch_stride = accumulate(
+        data_shape_lens.begin() + batch_dims, data_shape_lens.end(), 1, op::product{});
     const auto num_slices_per_batch = num_slices / num_batches;
 
     ind.global_stride(output_shape.elements(), [&](auto i) {
@@ -83,7 +77,7 @@ __device__ void gathernd(const T& data_t, const U& indices_t, const V& output_t,
             int64_t index                   = slice_indices[idx];
             const std::size_t input_dim_idx = batch_dims + idx;
             const auto input_dim            = data_shape_lens[input_dim_idx];
-            assert(index >= -static_cast<int64_t>(input_dim) and
-                   index < static_cast<int64_t>(input_dim));
+            MIGRAPHX_ASSERT(index >= -static_cast<int64_t>(input_dim) and
+                            index < static_cast<int64_t>(input_dim));
             if(index < 0)
                 index += input_dim;
@@ -91,7 +85,7 @@ __device__ void gathernd(const T& data_t, const U& indices_t, const V& output_t,
                 accumulate(data_shape_lens.begin() + batch_dims + idx + 1,
                            data_shape_lens.begin() + batch_dims + num_slice_dims,
                            slice_size,
-                           std::multiplies<std::size_t>());
+                           op::product{});
             relative_slice_offset += index * size_from_slice_dims;
         }
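The hunk above swaps std::multiplies<std::size_t>() for the in-tree op::product functor; a plain function object gives the same accumulate result without the <functional> include. A standalone, illustrative sketch of the idea (hypothetical example, not code from this commit):

// Illustrative only: a tiny product functor behaves like std::multiplies
// in accumulate, with no <functional> dependency.
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

struct product
{
    template <class T, class U>
    constexpr auto operator()(T x, U y) const
    {
        return x * y;
    }
};

int main()
{
    std::vector<std::size_t> lens = {2, 3, 4};
    // Same result as accumulate(..., 1, std::multiplies<std::size_t>())
    std::size_t n = std::accumulate(lens.begin(), lens.end(), std::size_t{1}, product{});
    std::cout << n << "\n"; // 24
}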
src/targets/gpu/kernels/include/migraphx/kernels/hip.hpp

@@ -24,11 +24,14 @@
 #ifndef MIGRAPHX_GUARD_KERNELS_HIP_HPP
 #define MIGRAPHX_GUARD_KERNELS_HIP_HPP
 
-// Workaround macro redefinition issue with clang tidy
-#if defined(__HIP_PLATFORM_HCC__) && defined(MIGRAPHX_USE_CLANG_TIDY)
-#undef __HIP_PLATFORM_HCC__ // NOLINT
-#endif
+#ifndef MIGRAPHX_USE_HIPRTC
 #include <hip/hip_runtime.h>
-#include <hip/hip_fp16.h>
-#include <hip/math_functions.h>
+#include <hip/hip_math_constants.h>
+#elif defined(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS)
+#include <hip/hip_common.h>
+#include <hip/hip_math_constants.h>
+#endif
 
 #endif // MIGRAPHX_GUARD_KERNELS_HIP_HPP
src/targets/gpu/kernels/include/migraphx/kernels/index.hpp

@@ -29,6 +29,7 @@
 #include <migraphx/kernels/integral_constant.hpp>
 #include <migraphx/kernels/type_traits.hpp>
 #include <migraphx/kernels/debug.hpp>
+#include <migraphx/kernels/functional.hpp>
 
 namespace migraphx {
 
@@ -135,42 +136,100 @@ struct index
         return (n - _c<1>) / stride + _c<1>;
     }
 
+    template <class N>
+    constexpr auto max_global_stride_iterations(N n) const
+    {
+        return max_stride_iterations(n, nglobal());
+    }
+
+    template <class N>
+    constexpr auto max_local_stride_iterations(N n) const
+    {
+        return max_stride_iterations(n, nlocal());
+    }
+
+    template <class F, class I, class D>
+    static constexpr auto invoke_loop(F f, I i, D d) -> decltype(f(i, d))
+    {
+        return f(i, d);
+    }
+    template <class F, class I, class D>
+    static constexpr auto invoke_loop(F f, I i, D) -> decltype(f(i))
+    {
+        return f(i);
+    }
+
+    template <class F, class N, class Stride>
+    static constexpr void for_stride_loop_unroll(index_int start, N n, Stride stride, F f)
+    {
+        sequence(max_stride_iterations(n, stride), [&](auto... ks) {
+            fold([&](auto d, auto k) {
+                auto i = start + stride * k;
+                if(i < n)
+                    invoke_loop(f, i, d);
+                return d + _c<1>;
+            })(_c<0>, ks...);
+        });
+    }
+
+    template <class F, class N, class Stride>
+    static constexpr void for_stride_loop(index_int start, N n, Stride stride, F f)
+    {
+        index_int k = 0;
+        for(index_int i = start; i < n; i += stride)
+        {
+            invoke_loop(f, i, k);
+            k++;
+        }
+    }
+
-    template <class F, class N, class Stride>
-    static constexpr void for_stride(index_int start, N n, Stride stride, F f)
-    {
-        MIGRAPHX_ASSERT(start < stride);
-        if constexpr(not is_integral<N>{} and not is_integral<Stride>{} and
-                     max_stride_iterations(n, stride) == 1)
-        {
-            if constexpr(stride > n)
-            {
-                if(start < n)
-                    f(start);
-            }
-            else
-            {
-                f(start);
-            }
-        }
-        else
-        {
-            for(index_int i = start; i < n; i += stride)
-            {
-                f(i);
-            }
-        }
-    }
+    template <bool Unroll, class F, class N, class Stride>
+    static constexpr void for_stride(index_int start, N n, Stride stride, F f)
+    {
+        MIGRAPHX_ASSERT(start < stride);
+        if constexpr(not is_integral<N>{} and not is_integral<Stride>{})
+        {
+            if constexpr(max_stride_iterations(n, stride) == 1)
+            {
+                if constexpr(stride > n)
+                {
+                    if(start < n)
+                        invoke_loop(f, start, _c<0>);
+                }
+                else
+                {
+                    invoke_loop(f, start, _c<0>);
+                }
+            }
+            else if constexpr(Unroll)
+            {
+                MIGRAPHX_STATIC_ASSERT_FOR(max_stride_iterations(n, stride) < 256)
+                {
+                    for_stride_loop_unroll(start, n, stride, f);
+                }
+            }
+            else
+            {
+                for_stride_loop(start, n, stride, f);
+            }
+        }
+        else
+        {
+            for_stride_loop(start, n, stride, f);
+        }
+    }
 
     template <class F, class N>
     __device__ void global_stride(N n, F f) const
     {
-        for_stride(global, n, nglobal(), f);
+        for_stride<false>(global, n, nglobal(), f);
     }
 
     template <class F, class N>
     __device__ void local_stride(N n, F f) const
     {
-        for_stride(local, n, nlocal(), f);
+        for_stride<true>(local, n, nlocal(), f);
     }
 };
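The two invoke_loop overloads added above let the same stride loops drive callbacks that take either (i) or (i, d): the trailing return type removes the overload the callback cannot accept. A minimal standalone illustration of that dispatch (hypothetical example, not the MIGraphX sources):

// Illustrative only: expression-SFINAE picks f(i, d) when the callback wants
// the per-thread iteration count, and falls back to f(i) otherwise.
#include <iostream>

template <class F, class I, class D>
constexpr auto invoke_loop(F f, I i, D d) -> decltype(f(i, d))
{
    return f(i, d); // callback takes the iteration counter d as well
}

template <class F, class I, class D>
constexpr auto invoke_loop(F f, I i, D) -> decltype(f(i))
{
    return f(i); // callback only takes the element index i
}

int main()
{
    invoke_loop([](int i) { std::cout << "i=" << i << "\n"; }, 3, 0);
    invoke_loop([](int i, int d) { std::cout << "i=" << i << " d=" << d << "\n"; }, 3, 0);
}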
src/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp

@@ -46,28 +46,27 @@ template <index_int Axis,
 __device__ void generic_binary_layernorm(
     F compute, BinOp op, float eps, Output output, Input1 input1, Input2 input2, Inputs... inputs)
 {
+    using block = reduce::auto_block<reduce::reduce_elements_with_axis<Input1, Axis>()>;
     using reduce_output = reduce::with_axis<Input1, Axis>;
-    reduce::block::run<reduce_output>([&](auto, auto r) {
+    block::template run<reduce_output>([&](auto, auto r) {
+        auto input = r.inner([&](auto x1, auto x2) { return op(x1, x2); })(input1, input2);
         using value_type         = typename Input1::type;
         constexpr auto relements = r.template elements<Input1>();
-        auto means =
-            r.reduce(op::sum{}, make_array<vec_type<value_type>>(0, 0), [&](auto x1, auto x2) {
-                auto x = op(x1, x2);
-                return make_array(x, x * x) * vec_type<value_type>{1.0 / relements};
-            })(input1, input2);
+        auto means =
+            r.reduce(op::sum{}, make_array<vec_type<value_type>>(0, 0), [&](auto x) {
+                return make_array(x, x * x) * vec_type<value_type>{1.0 / relements};
+            })(input);
         auto mean_x        = means[0];
         auto mean_x2       = means[1];
         auto variance      = mean_x2 - (mean_x * mean_x);
         value_type eps_val = eps; // implicit conversion for eps
-        r.inner([&](auto& y, auto x1, auto x2, auto... xs) {
-            auto x = op(x1, x2);
+        r.inner([&](auto& y, auto x, auto... xs) {
             auto m = x - mean_x;
             // m * rsqrt(mean(m ^ 2) + epsilon)
             y = compute(m * rsqrt(variance + eps_val), xs...);
-        })(output, input1, input2, inputs...);
+        })(output, input, inputs...);
     });
 }
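As a reminder of what the fused kernel above computes per reduction group: one pass produces the mean and mean of squares, then each element is normalized with (x - mean) * rsqrt(variance + eps). A scalar reference sketch over a 1-D input (illustrative only, not the GPU kernel):

#include <cmath>
#include <iostream>
#include <vector>

std::vector<float> layernorm_ref(const std::vector<float>& x, float eps = 1e-12f)
{
    // Single pass: mean and mean of squares
    float mean_x = 0, mean_x2 = 0;
    for(float v : x)
    {
        mean_x += v / x.size();
        mean_x2 += (v * v) / x.size();
    }
    float variance = mean_x2 - mean_x * mean_x;
    // Normalize: (x - mean) * rsqrt(variance + eps)
    std::vector<float> y;
    for(float v : x)
        y.push_back((v - mean_x) / std::sqrt(variance + eps));
    return y;
}

int main()
{
    for(float v : layernorm_ref({1, 2, 3, 4}))
        std::cout << v << " ";
    std::cout << "\n";
}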
src/targets/gpu/kernels/include/migraphx/kernels/math.hpp

@@ -28,8 +28,7 @@
 #include <migraphx/kernels/vec.hpp>
 #include <migraphx/kernels/functional.hpp>
 #include <migraphx/kernels/type_traits.hpp>
-#include <hip/hip_fp16.h>
-#include <hip/math_functions.h>
+#include <migraphx/kernels/hip.hpp>
 
 namespace migraphx {
 
@@ -222,7 +221,7 @@ constexpr auto min(const T& a, const U& b)
 template <class T, MIGRAPHX_REQUIRES(is_same<vec_type<T>, half>{})>
 constexpr T sin(T x)
 {
-    constexpr const T shift = M_PI_2;
+    constexpr const T shift = HIP_PIO2_F;
     return migraphx::cos(shift - x);
 }
src/targets/gpu/kernels/include/migraphx/kernels/ops.hpp

@@ -56,13 +56,32 @@ struct id
     }
 };
 
+template <class T>
+struct convert_to
+{
+    template <class U>
+    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(U x) const
+    {
+        return convert<T>(x);
+    }
+};
+
+template <index_int N>
 struct mean
 {
-    index_int item_num = 1;
     template <class T>
-    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x) const
+    MIGRAPHX_DEVICE_CONSTEXPR T operator()(T x) const
     {
-        return x / static_cast<T>(item_num);
+        using type = vec_type<T>;
+        if constexpr(is_floating_point<type>{})
+        {
+            constexpr type d = 1.0 / N;
+            return x * d;
+        }
+        else
+        {
+            return x / static_cast<type>(N);
+        }
     }
 };
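The reworked mean functor above carries the element count as a compile-time constant, so floating-point inputs can be scaled by a precomputed reciprocal instead of divided, while integer types keep the exact division. A standalone sketch of that trick, using std type traits in place of the kernel's own (hypothetical example, not the kernel header):

#include <iostream>
#include <type_traits>

template <unsigned N>
struct mean
{
    template <class T>
    constexpr T operator()(T x) const
    {
        if constexpr(std::is_floating_point<T>{})
        {
            constexpr T d = T(1) / N; // reciprocal computed at compile time
            return x * d;             // multiply is cheaper than divide on the GPU
        }
        else
        {
            return x / static_cast<T>(N); // exact integer division
        }
    }
};

int main()
{
    std::cout << mean<4>{}(10.0f) << " " << mean<4>{}(10) << "\n"; // 2.5 2
}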
src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp

@@ -103,10 +103,10 @@ __device__ auto block_reduce(index idx, Op op, T init, Index n, F f)
 #else
     constexpr index_int lanes_per_thread = 64;
 #endif
-    using type = decltype(f(0));
+    using type = decltype(index::invoke_loop(f, 0, _c<0>));
     __shared__ type buffer[idx.max_nlocal() / lanes_per_thread];
     type x = init;
-    idx.local_stride(n, [&](auto i) { x = op(x, f(i)); });
+    idx.local_stride(n, [&](auto i, auto d) { x = op(x, index::invoke_loop(f, i, d)); });
     dpp_reduce(x, op);
     const auto ldsidx = idx.local / lanes_per_thread;
@@ -128,10 +128,10 @@ template <class Op, class T, class Index, class F>
 __device__ auto block_reduce(index idx, Op op, T init, Index n, F f)
 {
     MIGRAPHX_ASSERT(idx.max_nlocal() == idx.nlocal());
-    using type = decltype(f(0));
+    using type = decltype(index::invoke_loop(f, 0, _c<0>));
     __shared__ type buffer[idx.max_nlocal()];
     type x = init;
-    idx.local_stride(n, [&](auto i) { x = op(x, f(i)); });
+    idx.local_stride(n, [&](auto i, auto d) { x = op(x, index::invoke_loop(f, i, d)); });
     buffer[idx.local] = x;
     __syncthreads();
@@ -167,6 +167,25 @@ constexpr auto reduce_slice(Input input, T i)
 namespace reduce {
 
+struct inner_storage_tag
+{
+};
+
+template <class T>
+using is_inner_storage = is_base_of<inner_storage_tag, remove_cv_t<remove_reference_t<T>>>;
+
+template <class R, class F>
+struct storage_access : F
+{
+    using type = R;
+};
+
+template <class R, class F>
+constexpr storage_access<R, F> make_storage_access(F f)
+{
+    return {{f}};
+}
+
 template <class Slicer, class F>
 constexpr auto sliced(Slicer slicer, F f)
 {
@@ -191,42 +210,100 @@ constexpr auto compute_reduce_axis()
 template <class Input, index_int Axis>
 using with_axis = decltype(compute_reduce_axis<Input, Axis>());
 
-struct block
-{
-    template <class Slicer>
-    struct reducer
-    {
-        index idx;
-        Slicer slice;
-
-        template <class Op, class T, class Read>
-        __device__ auto reduce(Op op, T init, Read read) const
-        {
-            return sliced(slice, [=](auto x, auto... xs) {
-                return block_reduce(idx, op, init, x.get_shape().elements(), [&](auto j) {
-                    return vec_reduce(read(x[j], xs[j]...), op);
-                });
-            });
-        }
-
-        template <class F>
-        __device__ void outer(F f) const
-        {
-            if(idx.local == 0)
-                f();
-        }
-
-        template <class F>
-        __device__ auto inner(F f) const
-        {
-            return sliced(slice, [=](auto x, auto... xs) {
-                idx.local_stride(x.get_shape().elements(), [&](auto j) { f(x[j], xs[j]...); });
-            });
-        }
-
-        template <class Input>
-        constexpr auto elements() const
-        {
-            using reduce_type        = decltype(slice(Input{}));
-            using value_type         = typename Input::type;
-            constexpr auto relements = get_shape_c<reduce_type>{}.elements();
-            if constexpr(vec_size<value_type>() > 1)
+template <class Derived>
+struct reducer_base
+{
+    template <class T>
+    __device__ auto make_inner_slice(T x) const
+    {
+        if constexpr(is_inner_storage<T>{})
+        {
+            return x;
+        }
+        else
+        {
+            auto&& derived = static_cast<const Derived&>(*this);
+            auto t         = derived.slice(x);
+            return make_storage_access<typename decltype(t)::type>(
+                [=](auto i, auto...) -> auto& { return t[i]; });
+        }
+    }
+
+    template <class T, class... Ts>
+    constexpr auto get_size(T&& x, [[maybe_unused]] Ts&&... xs) const
+    {
+        MIGRAPHX_ASSERT(get_size(x) == get_size(xs...));
+        return get_size(x);
+    }
+
+    template <class T>
+    constexpr auto get_size(T&& x) const
+    {
+        if constexpr(is_inner_storage<T>{})
+        {
+            return x.rsize();
+        }
+        else
+        {
+            auto&& derived = static_cast<const Derived&>(*this);
+            auto t         = derived.slice(x);
+            return t.size();
+        }
+    }
+
+    template <class F>
+    __device__ auto inner_sliced(F f) const
+    {
+        return [=](auto&&... xs) { return f(get_size(xs...), make_inner_slice(xs)...); };
+    }
+
+    template <class T>
+    static __device__ typename T::type& decl_inner_storage(const T&);
+
+    template <class F>
+    __device__ auto inner(F f) const
+    {
+        return this->inner_sliced([=](auto n, auto&&... xs) {
+            using result_type = decltype(f(decl_inner_storage(xs)...));
+            auto&& derived    = static_cast<const Derived&>(*this);
+            if constexpr(is_void<result_type>{})
+            {
+                derived.inner_void_impl(f, n, xs...);
+            }
+            else
+            {
+                return derived.template inner_impl<result_type>(f, n, xs...);
+            }
+        });
+    }
+
+    template <class Op, class T, class Read>
+    __device__ auto reduce(Op op, T init, Read read) const
+    {
+        return this->inner_sliced([=](auto n, auto&&... xs) {
+            auto&& derived = static_cast<const Derived&>(*this);
+            return derived.reduce_impl(op, init, read, n, xs...);
+        });
+    }
+
+    template <class Op, class T>
+    __device__ auto reduce(Op op, T init) const
+    {
+        return this->reduce(op, init, op::id{});
+    }
+
+    template <class F>
+    __device__ void outer(F f) const
+    {
+        f();
+    }
+
+    template <class Input>
+    constexpr auto elements() const
+    {
+        auto&& derived           = static_cast<const Derived&>(*this);
+        using reduce_type        = decltype(derived.slice(Input{}));
+        using value_type         = typename Input::type;
+        constexpr auto relements = get_shape_c<reduce_type>{}.elements();
+        if constexpr(vec_size<value_type>() > 1)
@@ -234,12 +311,69 @@ struct block
         else
             return relements;
     }
+};
+
+struct block
+{
+    template <class Slicer>
+    struct reducer : reducer_base<reducer<Slicer>>
+    {
+        index idx;
+        Slicer slice;
+
+        template <class T, index_int N, class Size>
+        struct inner_storage : inner_storage_tag
+        {
+            using type = T;
+            array<T, N> arr;
+            constexpr Size rsize() const { return {}; }
+            template <class U, class V>
+            constexpr auto& operator()(U, V d) const
+            {
+                return arr[d];
+            }
+            template <class U, class V>
+            constexpr auto& operator()(U, V d)
+            {
+                return arr[d];
+            }
+        };
+
+        template <class Op, class T, class Read, class N, class... Ts>
+        __device__ auto reduce_impl(Op op, T init, Read read, N n, Ts&&... xs) const
+        {
+            return block_reduce(idx, op, init, n, [&](auto j, auto d) {
+                return vec_reduce(read(xs(j, d)...), op);
+            });
+        }
+
+        template <class F>
+        __device__ void outer(F f) const
+        {
+            if(idx.local == 0)
+                f();
+        }
+
+        template <class F, class N, class... Ts>
+        __device__ void inner_void_impl(F f, N n, Ts&&... xs) const
+        {
+            idx.local_stride(n, [&](auto j, auto d) { f(xs(j, d)...); });
+        }
+
+        template <class R, class F, class N, class... Ts>
+        __device__ auto inner_impl(F f, N n, Ts&&... xs) const
+        {
+            using max_iterations = decltype(idx.max_local_stride_iterations(n));
+            inner_storage<R, max_iterations{}, N> storage;
+            idx.local_stride(n, [&](auto j, auto d) { storage(j, d) = f(xs(j, d)...); });
+            return storage;
+        }
     };
 
     template <class Slicer>
     static __device__ auto make(index idx, Slicer slicer)
     {
-        return reducer<Slicer>{idx, slicer};
+        return reducer<Slicer>{{}, idx, slicer};
     }
 
     template <class Output, class F>
@@ -254,56 +388,143 @@ struct block
     }
 };
 
-struct lane
-{
-    template <class Slicer>
-    struct reducer
-    {
-        index idx;
-        Slicer slice;
-
-        template <class Op, class T, class Read>
-        __device__ auto reduce(Op op, T init, Read read) const
-        {
-            return sliced(slice, [=](auto x, auto... xs) {
-                using type = typename decltype(x)::type;
-                type r     = init;
-                for(index_int j = 0; j < x.get_shape().elements(); j++)
-                {
-                    r = op(r, read(x[j], xs[j]...));
-                }
-                return r;
-            });
-        }
-
-        template <class F>
-        __device__ void outer(F f) const
-        {
-            f();
-        }
-
-        template <class F>
-        __device__ auto inner(F f) const
-        {
-            return sliced(slice, [=](auto x, auto... xs) {
-                for(index_int j = 0; j < x.get_shape().elements(); j++)
-                {
-                    f(x[j], xs[j]...);
-                }
-            });
-        }
-
-        template <class Input>
-        constexpr auto elements() const
-        {
-            using reduce_type = decltype(slice(Input{}));
-            return get_shape_c<reduce_type>{}.elements();
-        }
-    };
-
-    template <class Slicer>
-    static __device__ auto make(index idx, Slicer slicer)
-    {
-        return reducer<Slicer>{idx, slicer};
-    }
+struct block_large
+{
+    template <class Slicer>
+    struct reducer : reducer_base<reducer<Slicer>>
+    {
+        index idx;
+        Slicer slice;
+
+        template <class Size, class F>
+        struct inner_storage : inner_storage_tag
+        {
+            using type = remove_reference_t<decltype(declval<F>()(0, _c<0>))>;
+            F f;
+            constexpr Size rsize() const { return {}; }
+            template <class U, class V>
+            constexpr auto operator()(U j, V d) const
+            {
+                return f(j, d);
+            }
+        };
+
+        template <class Size, class F>
+        constexpr inner_storage<Size, F> make_inner_storage(Size, F f)
+        {
+            return {f};
+        }
+
+        template <class Op, class T, class Read, class N, class... Ts>
+        __device__ auto reduce_impl(Op op, T init, Read read, N n, Ts&&... xs) const
+        {
+            return block_reduce(idx, op, init, index_int{n}, [&](auto j, auto d) {
+                return vec_reduce(read(xs(j, d)...), op);
+            });
+        }
+
+        template <class F>
+        __device__ void outer(F f) const
+        {
+            if(idx.local == 0)
+                f();
+        }
+
+        template <class F, class N, class... Ts>
+        __device__ void inner_void_impl(F f, N n, Ts&&... xs) const
+        {
+            idx.local_stride(index_int{n}, [&](auto j, auto d) { f(xs(j, d)...); });
+        }
+
+        template <class R, class F, class N, class... Ts>
+        __device__ auto inner_impl(F f, N n, Ts&&... xs) const
+        {
+            return make_inner_storage(n, [=](auto j, auto d) { return f(xs(j, d)...); });
+        }
+    };
+
+    template <class Slicer>
+    static __device__ auto make(index idx, Slicer slicer)
+    {
+        return reducer<Slicer>{{}, idx, slicer};
+    }
+
+    template <class Output, class F>
+    static __device__ void run(F f)
+    {
+        auto idx                 = make_index();
+        constexpr auto nelements = get_shape_c<Output>{}.elements();
+        idx.global_stride(nelements * idx.nlocal(), [&](auto i) {
+            const auto out_idx = get_shape_c<Output>{}.multi(i / idx.nlocal());
+            f(out_idx, make(idx, [&](auto input) { return reduce_slice<Output>(input, out_idx); }));
+        });
+    }
+};
+
+struct lane
+{
+    template <class Slicer>
+    struct reducer : reducer_base<reducer<Slicer>>
+    {
+        index idx;
+        Slicer slice;
+
+        template <class Size, class F>
+        struct inner_storage : inner_storage_tag
+        {
+            using type = remove_reference_t<decltype(declval<F>()(0, _c<0>))>;
+            F f;
+            constexpr Size rsize() const { return {}; }
+            template <class U, class V>
+            constexpr auto operator()(U j, V d) const
+            {
+                return f(j, d);
+            }
+        };
+
+        template <class Size, class F>
+        constexpr inner_storage<Size, F> make_inner_storage(Size, F f)
+        {
+            return {f};
+        }
+
+        template <class Op, class T, class Read, class N, class U, class... Us>
+        __device__ auto reduce_impl(Op op, T init, Read read, N n, U&& x, Us&&... xs) const
+        {
+            using type = remove_reference_t<decltype(x(0, _c<0>))>;
+            type r     = init;
+            for(index_int j = 0; j < n; j++)
+            {
+                r = op(r, read(x(j, _c<0>), xs(j, _c<0>)...));
+            }
+            return r;
+        }
+
+        template <class F>
+        __device__ void outer(F f) const
+        {
+            f();
+        }
+
+        template <class F, class N, class... Ts>
+        __device__ void inner_void_impl(F f, N n, Ts&&... xs) const
+        {
+            for(index_int j = 0; j < n; j++)
+            {
+                f(xs(j, _c<0>)...);
+            }
+        }
+
+        template <class R, class F, class N, class... Ts>
+        __device__ auto inner_impl(F f, N n, Ts&&... xs) const
+        {
+            return make_inner_storage(n, [=](auto j, auto d) { return f(xs(j, d)...); });
+        }
+    };
+
+    template <class Slicer>
+    static __device__ auto make(index idx, Slicer slicer)
+    {
+        return reducer<Slicer>{{}, idx, slicer};
+    }
 
     template <class Output, class F>
@@ -318,6 +539,26 @@ struct lane
     }
 };
 
+// TODO: Remove these in the future when they can be selected in the compiler class
+template <index_int RElements>
+constexpr auto pick_block()
+{
+    using nlocal = decltype(index{}.max_nlocal());
+    if constexpr(RElements < nlocal{} * 256)
+        return block{};
+    else
+        return block_large{};
+}
+
+template <index_int RElements>
+using auto_block = decltype(pick_block<RElements>());
+
+template <class Input, index_int Axis>
+constexpr auto reduce_elements_with_axis()
+{
+    constexpr auto s = get_shape_c<Input>{};
+    return s.lens[Axis];
+}
+
 } // namespace reduce
 
 template <class Algo,
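The new reducer_base above uses CRTP: the base supplies the user-facing inner()/reduce() API and forwards to the *_impl hooks on the derived class, so block, block_large and lane only implement the strategy-specific pieces. A much-simplified standalone sketch of that pattern (not the MIGraphX implementation):

// Illustrative CRTP skeleton: the base dispatches to the derived class
// without virtual functions, which keeps everything inlinable on the GPU.
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

template <class Derived>
struct reducer_base
{
    template <class Op, class T>
    auto reduce(Op op, T init, const std::vector<T>& data) const
    {
        // Forward to the derived class's strategy-specific implementation
        return static_cast<const Derived&>(*this).reduce_impl(op, init, data);
    }
};

struct serial_reducer : reducer_base<serial_reducer>
{
    template <class Op, class T>
    auto reduce_impl(Op op, T init, const std::vector<T>& data) const
    {
        return std::accumulate(data.begin(), data.end(), init, op);
    }
};

int main()
{
    serial_reducer r;
    std::vector<int> data = {1, 2, 3, 4};
    std::cout << r.reduce(std::plus<>{}, 0, data) << "\n"; // 10
}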
src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp

@@ -76,14 +76,6 @@ struct shape
     constexpr index_int index(index_array x) const { return x.dot(strides); }
 
-    constexpr index_int index(std::initializer_list<index_int> x) const
-    {
-        index_int idx = 0;
-        for(index_int i = 0; i < x.size(); i++)
-            idx += *(x.begin() + i) * strides[i];
-        return idx;
-    }
-
     constexpr index_int index(index_int i) const
     {
         if(this->standard())
src/targets/gpu/kernels/include/migraphx/kernels/softmax.hpp

@@ -30,18 +30,20 @@
 namespace migraphx {
 
 template <index_int Axis, class Input, class Output>
-__device__ void softmax(Input input, Output output)
+__device__ void softmax(Input input1, Output output)
 {
-    reduce::block::run<reduce::with_axis<Input, Axis>>([&](auto, auto r) {
+    using block = reduce::auto_block<reduce::reduce_elements_with_axis<Input, Axis>()>;
+    block::template run<reduce::with_axis<Input, Axis>>([&](auto, auto r) {
+        auto input = r.inner(op::id{})(input1);
 #ifdef MIGRAPHX_USE_FAST_SOFTMAX
-        const auto c = vec_at(r.slice(input)[0], 0);
+        const auto c = vec_at(r.slice(input1)[0], 0);
 #else
         const auto c = r.reduce(op::max{}, lowest{}, op::id{})(input);
 #endif
-        auto batch_sum = r.reduce(op::sum{}, 0, [&](auto x) {
-            return migraphx::convert<float>(migraphx::exp(x - c));
-        })(input);
-        r.inner([&](auto& y, auto x) { y = migraphx::exp(x - c) / batch_sum; })(output, input);
+        auto exp_in = r.inner([&](auto x) { return migraphx::exp(x - c); })(input);
+        auto batch_sum =
+            r.reduce(op::sum{}, 0, [](auto x) { return migraphx::convert<float>(x); })(exp_in);
+        r.inner([&](auto& y, auto x) { y = x / batch_sum; })(output, exp_in);
     });
 }
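For reference, the rewritten softmax above still computes the numerically stable form — subtract the row max, exponentiate, normalize by the sum — but now caches the exponentials (exp_in) instead of recomputing exp() in the final division. A scalar reference sketch of the same computation (illustrative only, not the kernel):

#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

std::vector<float> softmax_ref(const std::vector<float>& x)
{
    // Subtract the max for numerical stability
    float c = *std::max_element(x.begin(), x.end());
    std::vector<float> exp_in;
    float batch_sum = 0;
    for(float v : x)
    {
        exp_in.push_back(std::exp(v - c)); // exponentiate once and cache
        batch_sum += exp_in.back();
    }
    for(float& e : exp_in)
        e /= batch_sum; // reuse the cached exponentials
    return exp_in;
}

int main()
{
    for(float v : softmax_ref({1, 2, 3}))
        std::cout << v << " ";
    std::cout << "\n";
}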
src/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp

@@ -141,6 +141,25 @@ MIGRAPHX_BUILTIN_TYPE_TRAITN(is_constructible);
 MIGRAPHX_BUILTIN_TYPE_TRAITN(is_nothrow_constructible);
 MIGRAPHX_BUILTIN_TYPE_TRAITN(is_trivially_constructible);
 
+template <class T>
+struct remove_cv
+{
+    using type = T;
+};
+
+template <class T>
+struct remove_cv<const T> : remove_cv<T>
+{
+};
+
+template <class T>
+struct remove_cv<volatile T> : remove_cv<T>
+{
+};
+
+template <class T>
+using remove_cv_t = typename remove_cv<T>::type;
+
 template <class T>
 struct remove_reference
 {
@@ -168,6 +187,11 @@ struct add_pointer : type_identity<typename remove_reference<T>::type*>
 template <class T>
 using add_pointer_t = typename add_pointer<T>::type;
 
+template <class T>
+struct is_void : is_same<void, remove_cv_t<T>>
+{
+};
+
 template <class... Ts>
 struct common_type;
src/targets/gpu/kernels/include/migraphx/kernels/types.hpp

@@ -28,8 +28,45 @@
 namespace migraphx {
 
-using index_int = std::uint32_t;
-using diff_int  = std::int32_t;
+#if defined(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS) and defined(MIGRAPHX_USE_HIPRTC)
+using int8_t   = signed char;
+using uint8_t  = unsigned char;
+using int16_t  = signed short;
+using uint16_t = unsigned short;
+using int32_t  = signed int;
+using uint32_t = unsigned int;
+using int64_t  = signed long long;
+using uint64_t = unsigned long long;
+#elif defined(MIGRAPHX_USE_HIPRTC)
+using int8_t   = __hip_int8_t;
+using uint8_t  = __hip_uint8_t;
+using int16_t  = __hip_int16_t;
+using uint16_t = __hip_uint16_t;
+using int32_t  = __hip_int32_t;
+using uint32_t = __hip_uint32_t;
+using int64_t  = __hip_int64_t;
+using uint64_t = __hip_uint64_t;
+#else
+using int8_t   = std::int8_t;
+using uint8_t  = std::uint8_t;
+using int16_t  = std::int16_t;
+using uint16_t = std::uint16_t;
+using int32_t  = std::int32_t;
+using uint32_t = std::uint32_t;
+using int64_t  = std::int64_t;
+using uint64_t = std::uint64_t;
+#endif // MIGRAPHX_USE_HIPRTC
+
+using index_int = uint32_t;
+using diff_int  = int32_t;
+
+static_assert(sizeof(int8_t) == 1, "int8_t must be 1 bytes");
+static_assert(sizeof(uint8_t) == 1, "uint8_t must be 1 bytes");
+static_assert(sizeof(int16_t) == 2, "int16_t must be 2 bytes");
+static_assert(sizeof(uint16_t) == 2, "uint16_t must be 2 bytes");
+static_assert(sizeof(int32_t) == 4, "int32_t must be 4 bytes");
+static_assert(sizeof(uint32_t) == 4, "uint32_t must be 4 bytes");
+static_assert(sizeof(int64_t) == 8, "int64_t must be 8 bytes");
+static_assert(sizeof(uint64_t) == 8, "uint64_t must be 8 bytes");
 
 #define MIGRAPHX_DEVICE_CONSTEXPR constexpr __device__ __host__ // NOLINT
src/targets/gpu/lowering.cpp

@@ -83,7 +83,6 @@ struct miopen_apply
         auto& ctx      = get_context();
         int8_x4_format = get_int8_x4_format(ctx);
         compute_fp32   = get_compute_fp32_flag();
         offload_copy   = (mod->name() == "main") ? pass->offload_copy : false;
 
         add_generic_op("contiguous");
@@ -112,6 +111,7 @@ struct miopen_apply
         add_loop_op();
         add_neg_op();
         add_nms_op();
+        add_select_module_op();
     }
 
     void copy_params() const
@@ -359,6 +359,33 @@ struct miopen_apply
             return mod->replace_instruction(ins, gpu_out);
         });
     }
+
+    /**
+     * Turns on use_local_alloc in the select_module submodules.
+     * Changes the submodule returns to a hip::sync_stream.
+     */
+    void add_select_module_op()
+    {
+        apply_map.emplace("select_module", [=](instruction_ref ins) {
+            std::vector<instruction_ref> inputs = ins->inputs();
+            auto mod_args                       = ins->module_inputs();
+            for(auto* smod : mod_args)
+            {
+                smod->use_local_alloc = true;
+                auto last_ins         = std::prev(smod->end());
+                if(last_ins->name() == "@return")
+                {
+                    for(auto out_ins : last_ins->inputs())
+                    {
+                        auto sync_out = smod->insert_instruction(
+                            last_ins, make_op("hip::sync_stream"), out_ins);
+                        smod->replace_return({sync_out});
+                    }
+                }
+            }
+            return ins;
+        });
+    }
 };
 
 void lowering::apply(module& m) const { miopen_apply{&m, this}.apply(); }
src/targets/gpu/prefuse_ops.cpp

@@ -26,6 +26,8 @@
 #include <migraphx/check_shapes.hpp>
 #include <migraphx/make_op.hpp>
 #include <migraphx/register_op.hpp>
+#include <migraphx/pass_manager.hpp>
+#include <migraphx/dead_code_elimination.hpp>
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -90,7 +92,9 @@ struct find_layernorm
     {
         auto ins   = r.result;
         auto x_ins = r.instructions["x"];
-        auto eps   = r.instructions["eps"]->eval().at<float>();
+        float eps  = 0;
+        if(contains(r.instructions, "eps"))
+            eps = r.instructions["eps"]->eval().at<float>();
 
         m.replace_instruction(ins, layernorm{eps}, x_ins);
     }
@@ -100,23 +104,26 @@ struct find_add_layernorm
 {
     auto matcher() const
     {
-        return match::layernorm()(match::var("x")(match::name("add").bind("add")));
+        return match::name("gpu::prelayernorm")(
+            match::args(match::name("add")(match::used_once()).bind("add")));
     }
 
     void apply(module& m, const match::matcher_result& r) const
     {
         auto ins     = r.result;
         auto add_ins = r.instructions["add"];
-        auto eps     = r.instructions["eps"]->eval().at<float>();
+        auto op      = any_cast<layernorm>(ins->get_operator());
 
-        m.replace_instruction(ins, add_layernorm{eps}, add_ins->inputs());
+        m.replace_instruction(ins, add_layernorm{op.epsilon}, add_ins->inputs());
     }
 };
 
 } // namespace
 
-void prefuse_ops::apply(module& m) const
+void prefuse_ops::apply(module_pass_manager& mpm) const
 {
-    match::find_matches(m, find_add_layernorm{}, find_layernorm{});
+    match::find_matches(mpm.get_module(), find_layernorm{});
+    mpm.run_pass(dead_code_elimination{});
+    match::find_matches(mpm.get_module(), find_add_layernorm{});
 }
 
 } // namespace gpu
src/targets/gpu/target.cpp

@@ -38,6 +38,7 @@
 #include <migraphx/layout_nhwc.hpp>
 #include <migraphx/memory_coloring.hpp>
 #include <migraphx/normalize_ops.hpp>
+#include <migraphx/optimize_module.hpp>
 #include <migraphx/preallocate_param.hpp>
 #include <migraphx/propagate_constant.hpp>
 #include <migraphx/register_target.hpp>
@@ -50,6 +51,7 @@
 #include <migraphx/simplify_algebra.hpp>
 #include <migraphx/simplify_qdq.hpp>
 #include <migraphx/simplify_reshapes.hpp>
+#include <migraphx/split_single_dyn_dim.hpp>
 #include <migraphx/gpu/allocation_model.hpp>
 #include <migraphx/gpu/compile_miopen.hpp>
 #include <migraphx/gpu/compile_ops.hpp>
@@ -90,6 +92,7 @@ pass enable_pass(bool enabled, pass p)
 std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_options& options) const
 {
     auto& ctx = any_cast<context>(gctx);
+    ctx.set_exhaustive_tune_flag(options.exhaustive_tune);
     std::set<shape::type_t> unsupported_types(shape::types().begin(), shape::types().end());
     unsupported_types.erase(shape::type_t::float_type);
     unsupported_types.erase(shape::type_t::half_type);
@@ -100,6 +103,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
     // clang-format off
     return
     {
+        split_single_dyn_dim{},
+        dead_code_elimination{},
         normalize_ops{},
         dead_code_elimination{},
         simplify_qdq{},
@@ -118,21 +123,13 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
         rewrite_pooling{},
         dead_code_elimination{},
         rewrite_gelu{},
-        dead_code_elimination{},
-        eliminate_common_subexpression{},
-        dead_code_elimination{},
-        simplify_algebra{},
-        simplify_reshapes{},
+        optimize_module{},
         enable_pass(enabled(MIGRAPHX_ENABLE_NHWC{}), layout_nhwc{}),
         dead_code_elimination{},
-        simplify_reshapes{},
-        simplify_algebra{},
         prefuse_ops{},
         dead_code_elimination{},
         auto_contiguous{},
-        simplify_reshapes{},
-        propagate_constant{},
-        dead_code_elimination{},
+        optimize_module{},
         enable_pass(not enabled(MIGRAPHX_DISABLE_POINTWISE_FUSION{}), fuse_pointwise{}),
         dead_code_elimination{},
         fuse_mlir{&ctx},
test/CMakeLists.txt

-#####################################################################################
+####################################################################################
 # The MIT License (MIT)
 #
 # Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
@@ -20,7 +20,7 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
-#####################################################################################
+####################################################################################
 cmake_policy(SET CMP0057 NEW)
@@ -49,9 +49,11 @@ function(add_test_command NAME EXE)
     set_tests_properties(${NAME} PROPERTIES DISABLED On)
   elseif(WIN32)
     set(WINPATH)
     foreach(PATH ${CMAKE_FIND_ROOT_PATH})
       list(APPEND WINPATH ${PATH}/bin)
     endforeach()
     file(GENERATE OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/test_${NAME}.cmd"
       CONTENT "set PATH=${WINPATH};%PATH%
 %1 ${ARGN}")
@@ -67,9 +69,11 @@ function(add_test_command NAME EXE)
     # --args $<TARGET_FILE:${EXE}> ${ARGN})
     set(TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/gdb/test_${NAME})
     file(MAKE_DIRECTORY ${TEST_DIR})
     if(NOT EXISTS ${TEST_DIR})
       message(FATAL_ERROR "Failed to create test directory: ${TEST_DIR}")
     endif()
     file(GENERATE OUTPUT "${TEST_DIR}/run.cmake"
       CONTENT "
 # Remove previous core dump
@@ -90,22 +94,27 @@ function(add_test_command NAME EXE)
       add_test(NAME ${NAME} COMMAND ${EXE} ${ARGN})
     endif()
   endif()
   set_tests_properties(${NAME} PROPERTIES FAIL_REGULAR_EXPRESSION "FAILED")
 endfunction()
 
 function(add_test_executable TEST_NAME)
   add_executable(${TEST_NAME} EXCLUDE_FROM_ALL ${ARGN})
   target_link_libraries(${TEST_NAME} ${CMAKE_THREAD_LIBS_INIT})
   # Cmake does not add flags correctly for gcc
   if(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
     set_target_properties(${TEST_NAME} PROPERTIES COMPILE_FLAGS -pthread LINK_FLAGS -pthread)
   endif()
   separate_arguments(MIOPEN_TEST_FLAGS_ARGS UNIX_COMMAND ${MIOPEN_TEST_FLAGS})
   if(MIOPEN_TEST_ALL)
     set(TEST_COMMAND ${TEST_NAME} ${MIOPEN_TEST_FLOAT_ARG} --all ${MIOPEN_TEST_FLAGS_ARGS})
   else()
     set(TEST_COMMAND ${TEST_NAME} ${MIOPEN_TEST_FLOAT_ARG} ${MIOPEN_TEST_FLAGS_ARGS})
   endif()
   add_test_command(${TEST_NAME} ${TEST_COMMAND})
   add_dependencies(tests ${TEST_NAME})
   add_dependencies(check ${TEST_NAME})
@@ -133,7 +142,7 @@ if(MIGRAPHX_ENABLE_GPU)
       COST 10
       RESOURCE_LOCK gpu
     )
-    target_link_libraries(test_gpu_${BASE_NAME} migraphx_gpu)
+    target_link_libraries(test_gpu_${BASE_NAME} migraphx_gpu migraphx_kernels)
   endforeach()
 endif()
@@ -155,7 +164,8 @@
 # Onnx test
 set(TEST_ONNX_DIR ${CMAKE_CURRENT_SOURCE_DIR}/onnx)
 file(GLOB ONNX_TESTS ${TEST_ONNX_DIR}/*.cpp)
 foreach(ONNX_TEST ${ONNX_TESTS})
   get_filename_component(BASE_NAME ${ONNX_TEST} NAME_WE)
   set(TEST_NAME test_${BASE_NAME})
@@ -180,12 +190,12 @@ add_dependencies(check test_tf)
 add_subdirectory(api)
 add_subdirectory(verify)
 if(MIGRAPHX_ENABLE_PYTHON)
   add_subdirectory(py)
 endif()
 
 function(test_header NAME HEADER)
   file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/header-main-include-${NAME}.cpp
     "#include <${HEADER}>\nint main() {}\n")
@@ -206,6 +216,7 @@ function(test_headers PREFIX)
     string(MAKE_C_IDENTIFIER ${HEADER_REL} TEST_NAME)
     get_filename_component(BASE_NAME ${HEADER} NAME_WE)
     test_header(header_${TEST_NAME} ${PREFIX}/${BASE_NAME}.hpp)
     if(MIGRAPHX_ENABLE_GPU)
       target_link_libraries(header_${TEST_NAME} migraphx_gpu)
     endif()
@@ -214,6 +225,7 @@
 test_headers(migraphx ${CMAKE_SOURCE_DIR}/src/include/migraphx/*.hpp)
 test_headers(migraphx/ref ${CMAKE_SOURCE_DIR}/src/targets/ref/include/migraphx/ref/*.hpp)
 if(MIGRAPHX_ENABLE_GPU)
   test_headers(migraphx/gpu ${CMAKE_SOURCE_DIR}/src/targets/gpu/include/migraphx/gpu/*.hpp)
 endif()
test/api/test_gpu.cpp

@@ -35,6 +35,7 @@ TEST_CASE(load_and_run)
     auto shapes_before = p.get_output_shapes();
     migraphx::compile_options options;
     options.set_offload_copy();
+    options.set_exhaustive_tune_flag();
     p.compile(migraphx::target("gpu"), options);
     auto shapes_after = p.get_output_shapes();
     CHECK(shapes_before.size() == 1);
test/api/test_save_load.cpp

@@ -30,7 +30,6 @@ TEST_CASE(load_save_default)
     std::string filename = "migraphx_api_load_save.mxr";
     auto p1 = migraphx::parse_onnx("conv_relu_maxpool_test.onnx");
     auto s1 = p1.get_output_shapes();
     migraphx::save(p1, filename.c_str());
     auto p2 = migraphx::load(filename.c_str());
     auto s2 = p2.get_output_shapes();
test/gpu/jit.cpp

@@ -35,13 +35,14 @@
 #include <migraphx/gpu/compile_hip.hpp>
 #include <migraphx/gpu/compile_hip_code_object.hpp>
 #include <migraphx/gpu/compiler.hpp>
+#include <migraphx_kernels.hpp>
 
 // NOLINTNEXTLINE
 const std::string write_2s = R"__migraphx__(
 #include <hip/hip_runtime.h>
 
 extern "C" {
-__global__ void write(int8_t* data)
+__global__ void write(char* data)
 {
     int num = threadIdx.x + blockDim.x * blockIdx.x;
     data[num] = 2;
@@ -58,7 +59,7 @@ const std::string add_2s_binary = R"__migraphx__(
 #include <hip/hip_runtime.h>
 
 extern "C" {
-__global__ void add_2(std::int8_t* x, std::int8_t* y)
+__global__ void add_2(char* x, char* y)
 {
     int num = threadIdx.x + blockDim.x * blockIdx.x;
     y[num] = x[num] + 2;
@@ -137,7 +138,8 @@ int main() {}
 const std::string math_template = R"__migraphx__(
 #include <migraphx/kernels/pointwise.hpp>
 #include <migraphx/kernels/math.hpp>
+#include <migraphx/kernels/types.hpp>
+using namespace migraphx;
 extern "C" {
 __global__ void kernel(${type}* p)
 {
test/memory_coloring_test.cpp

@@ -691,7 +691,7 @@ TEST_CASE(test38)
     auto p83 = m.add_instruction(pass_op{}, p78, p77);
     m.add_instruction(pass_op{}, output, p83, p63);
     run_pass(m);
-    CHECK(m.get_parameter_shape("scratch").bytes() == 7225344); // Optimal solution is 6422528
+    CHECK(m.get_parameter_shape("scratch").bytes() == 6422528);
     CHECK(no_allocate(m));
 }
@@ -729,7 +729,7 @@ TEST_CASE(test39)
         run_pass(*smod);
     }
-    CHECK(mm->get_parameter_shape("scratch").bytes() == 4);
+    CHECK(mm->get_parameter_shape("scratch").bytes() == 1);
     CHECK(then_mod->get_parameter_shape("scratch").bytes() == 24);
     CHECK(else_mod->get_parameter_shape("scratch").bytes() == 24);
     CHECK(no_allocate(*mm));
@@ -3374,7 +3374,7 @@ TEST_CASE(rnn_dom)
     m.add_instruction(pass_op{}, moutput, mx250, mx249, mx248);
     run_pass(m);
-    CHECK(m.get_parameter_shape("scratch").bytes() == 1600);
+    CHECK(m.get_parameter_shape("scratch").bytes() == 1824); // Optimal is 1600
     CHECK(no_allocate(m));
     CHECK(is_disjoint({mx0, mx8}));
     CHECK(is_disjoint({mx0, mx8}));
@@ -3790,4 +3790,23 @@ TEST_CASE(literal_test)
     CHECK(lit == result);
 }
 
+TEST_CASE(test_tuple)
+{
+    migraphx::module m;
+    auto s1 = migraphx::shape{migraphx::shape::float_type, {8}};
+    auto s2 = migraphx::shape{migraphx::shape::half_type, {10}};
+    auto s  = migraphx::shape{{s1, s2}};
+    auto a1 = add_alloc(m, s);
+    auto m1 = m.add_instruction(pass_op{}, a1);
+    auto a2 = add_alloc(m, {migraphx::shape::float_type, {4}});
+    m.add_instruction(pass_op{}, a2, m1);
+    run_pass(m);
+    CHECK(m.get_parameter_shape("scratch").bytes() == 68);
+    CHECK(no_allocate(m));
+    CHECK(is_disjoint({a1, a2}));
+}
+
 int main(int argc, const char* argv[]) { test::run(argc, argv); }
test/onnx/.onnxrt-commit (new file, mode 100644)

+c9a53c925510a101f5ca94d5ecda0924e40a8463