MIGraphX commit 3855c6af
Authored Jun 25, 2019 by Shucai Xiao

Merge branch 'opt_log_softmax_new_device_code' into argmax_min

Parents: b222af2f, 93eae2df
Changes: 27 files in the commit. Showing 20 changed files with 1020 additions and 654 deletions (+1020, -654).
Changed files:

src/include/migraphx/array.hpp                                      +75   -0
src/include/migraphx/functional.hpp                                 +43   -2
src/include/migraphx/ranges.hpp                                     +22   -0
src/include/migraphx/raw_data.hpp                                   +19   -0
src/targets/cpu/lowering.cpp                                        +78   -44
src/targets/gpu/device/concat.cpp                                   +10   -12
src/targets/gpu/device/gather.cpp                                   +18   -23
src/targets/gpu/device/include/migraphx/gpu/device/array.hpp        +60   -0
src/targets/gpu/device/include/migraphx/gpu/device/nary.hpp         +95   -271
src/targets/gpu/device/include/migraphx/gpu/device/shape.hpp        +89   -0
src/targets/gpu/device/include/migraphx/gpu/device/tensor.hpp       +13   -54
src/targets/gpu/device/include/migraphx/gpu/device/tensor_view.hpp  +59   -0
src/targets/gpu/device/include/migraphx/gpu/device/types.hpp        +44   -1
src/targets/gpu/device/include/migraphx/gpu/device/vector.hpp       +76   -0
src/targets/gpu/device/include/migraphx/gpu/device/visit.hpp        +170  -0
src/targets/gpu/device/logsoftmax.cpp                               +66   -110
src/targets/gpu/device/pad.cpp                                      +15   -22
src/targets/gpu/device/softmax.cpp                                  +63   -102
src/targets/gpu/fuse_ops.cpp                                        +3    -9
src/targets/gpu/gather.cpp                                          +2    -4
src/include/migraphx/array.hpp (new file, 0 → 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_ARRAY_HPP
#define MIGRAPHX_GUARD_RTGLIB_ARRAY_HPP

#include <migraphx/config.hpp>
#include <migraphx/functional.hpp>
#include <migraphx/requires.hpp>
#include <type_traits>
#include <array>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {

namespace detail {

template <class R, class...>
struct array_type
{
    using type = R;
};

template <class... Ts>
struct array_type<void, Ts...> : std::common_type<Ts...>
{
};

template <class R, class... Ts>
using array_type_t = typename array_type<R, Ts...>::type;

template <class T, std::size_t N, std::size_t... I>
constexpr std::array<std::remove_cv_t<T>, N> to_array_impl(T (&a)[N], seq<I...>)
{
    return {{a[I]...}};
}

} // namespace detail

template <class Result = void, class... Ts, MIGRAPHX_REQUIRES((sizeof...(Ts) > 0))>
constexpr std::array<detail::array_type_t<Result, Ts...>, sizeof...(Ts)> make_array(Ts&&... xs)
{
    return {static_cast<detail::array_type_t<Result, Ts...>>(std::forward<Ts>(xs))...};
}

constexpr std::array<int, 0> make_array() { return {}; }

template <class T, std::size_t N>
constexpr auto to_array(T (&a)[N])
{
    return detail::to_array_impl(a, detail::gens<N>{});
}

namespace detail {
template <std::size_t Offset = 0, class Array, std::size_t... I>
constexpr auto rearray_impl(Array a, seq<I...>)
{
    return make_array(a[I + Offset]...);
}
} // namespace detail

// pop_front removes the first element (offset 1); pop_back removes the last
// (offset 0, keeping the first N - 1 elements).
template <class T, std::size_t N>
constexpr auto pop_front(std::array<T, N> a)
{
    return detail::rearray_impl<1>(a, detail::gens<N - 1>{});
}

template <class T, std::size_t N>
constexpr auto pop_back(std::array<T, N> a)
{
    return detail::rearray_impl(a, detail::gens<N - 1>{});
}

} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
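
For orientation, a minimal host-side usage sketch of these helpers (hypothetical, not part of the commit; it assumes only the functions defined above):

    // Hypothetical usage sketch of migraphx::make_array/to_array/pop_front/pop_back.
    #include <migraphx/array.hpp>
    #include <cassert>

    int main()
    {
        int raw[3] = {1, 2, 3};
        auto a = migraphx::to_array(raw);      // std::array<int, 3>{{1, 2, 3}}
        auto b = migraphx::make_array(1, 2.5); // std::array<double, 2> via std::common_type
        auto c = migraphx::pop_front(a);       // {2, 3}
        auto d = migraphx::pop_back(a);        // {1, 2}
        assert(c[0] == 2 and d[1] == 2);
        (void)b;
        return 0;
    }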
src/include/migraphx/functional.hpp

@@ -15,6 +15,12 @@ struct swallow
     }
 };
 
+template <class T>
+auto tuple_size(const T&)
+{
+    return typename std::tuple_size<T>::type{};
+}
+
 namespace detail {
 
 template <class R, class F>
@@ -83,6 +89,12 @@ constexpr auto sequence_c(F&& f)
     return detail::sequence_c_impl(f, detail::gens<N>{});
 }
 
+template <class IntegerConstant, class F>
+constexpr auto sequence(IntegerConstant ic, F&& f)
+{
+    return sequence_c<ic>(f);
+}
+
 template <class F, class... Ts>
 constexpr void each_args(F f, Ts&&... xs)
 {
@@ -95,9 +107,9 @@ constexpr void each_args(F)
 }
 
 template <class F, class T>
-auto unpack(F f, T& x)
+auto unpack(F f, T&& x)
 {
-    return sequence_c<std::tuple_size<T>{}>([&](auto... is) { f(std::get<is>(x)...); });
+    return sequence(tuple_size(x), [&](auto... is) { f(std::get<is>(static_cast<T&&>(x))...); });
 }
 
 /// Implements a fix-point combinator
@@ -149,6 +161,35 @@ auto index_of(T& x)
     return [&](auto&& y) { return x[y]; };
 }
 
+template <class T, class... Ts>
+decltype(auto) front_args(T&& x, Ts&&...)
+{
+    return static_cast<T&&>(x);
+}
+
+template <class... Ts>
+decltype(auto) back_args(Ts&&... xs)
+{
+    return std::get<sizeof...(Ts) - 1>(std::tuple<Ts&&...>(static_cast<Ts&&>(xs)...));
+}
+
+template <class T, class... Ts>
+auto pop_front_args(T&&, Ts&&... xs)
+{
+    return [&](auto f) { f(static_cast<Ts&&>(xs)...); };
+}
+
+template <class... Ts>
+auto pop_back_args(Ts&&... xs)
+{
+    return [&](auto f) {
+        using tuple_type = std::tuple<Ts&&...>;
+        auto t           = tuple_type(static_cast<Ts&&>(xs)...);
+        return sequence_c<sizeof...(Ts) - 1>(
+            [&](auto... is) { return f(std::get<is>(static_cast<tuple_type&&>(t))...); });
+    };
+}
+
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
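
The reworked unpack forwards the tuple, so it now accepts rvalue tuples as well as lvalues; a small sketch (hypothetical, not from the commit):

    // Hypothetical sketch: unpack applies f to the elements of a tuple.
    #include <migraphx/functional.hpp>
    #include <iostream>
    #include <tuple>

    int main()
    {
        auto t = std::make_tuple(1, 2.5, 'x');
        migraphx::unpack(
            [](int a, double b, char c) { std::cout << a << ' ' << b << ' ' << c << '\n'; }, t);
        return 0;
    }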
src/include/migraphx/ranges.hpp

@@ -33,6 +33,10 @@ auto generic_find_impl(rank<0>, C&& c, const T& x)
     return std::find(c.begin(), c.end(), x);
 }
 
+struct empty
+{
+};
+
 } // namespace detail
 
 template <class C, class T>
@@ -71,6 +75,12 @@ bool all_of(const std::initializer_list<T>& c, const Predicate& p)
     return std::all_of(c.begin(), c.end(), p);
 }
 
+template <class Predicate>
+bool all_of(detail::empty, const Predicate&)
+{
+    return true;
+}
+
 template <class C, class Predicate>
 bool any_of(const C& c, const Predicate& p)
 {
@@ -83,6 +93,12 @@ bool any_of(const std::initializer_list<T>& c, const Predicate& p)
     return std::any_of(c.begin(), c.end(), p);
 }
 
+template <class Predicate>
+bool any_of(detail::empty, const Predicate&)
+{
+    return false;
+}
+
 template <class C, class Predicate>
 bool none_of(const C& c, const Predicate& p)
 {
@@ -95,6 +111,12 @@ bool none_of(const std::initializer_list<T>& c, const Predicate& p)
     return std::none_of(c.begin(), c.end(), p);
 }
 
+template <class Predicate>
+bool none_of(detail::empty, const Predicate&)
+{
+    return true;
+}
+
 template <class Range, class Iterator>
 void copy(Range&& r, Iterator it)
 {
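
The detail::empty overloads return the same vacuous-truth answers as the std algorithms on an empty range, so generic callers can pass a placeholder where no container exists; a sketch (hypothetical):

    // Hypothetical sketch of the vacuous-truth behavior.
    #include <migraphx/ranges.hpp>
    #include <cassert>

    int main()
    {
        auto pred = [](int) { return false; };
        assert(migraphx::all_of(migraphx::detail::empty{}, pred));  // vacuously true
        assert(!migraphx::any_of(migraphx::detail::empty{}, pred)); // vacuously false
        assert(migraphx::none_of(migraphx::detail::empty{}, pred)); // vacuously true
        return 0;
    }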
src/include/migraphx/raw_data.hpp

@@ -212,6 +212,25 @@ auto visit_all(T&& x, Ts&&... xs)
     };
 }
 
+template <class T>
+auto visit_all(const std::vector<T>& x)
+{
+    auto&& s = x.front().get_shape();
+    if(!std::all_of(
+           x.begin(), x.end(), [&](const T& y) { return y.get_shape().type() == s.type(); }))
+        MIGRAPHX_THROW("Types must be the same");
+    return [&](auto v) {
+        s.visit_type([&](auto as) {
+            using type = typename decltype(as)::type;
+            std::vector<tensor_view<type>> result;
+            std::transform(x.begin(), x.end(), std::back_inserter(result), [&](const auto& y) {
+                return make_view(y.get_shape(), as.from(y.data()));
+            });
+            v(result);
+        });
+    };
+}
+
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
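
A sketch of calling the new vector overload (hypothetical, not from the commit):

    // Hypothetical sketch: visiting a whole vector of arguments at once.
    #include <migraphx/argument.hpp>
    #include <migraphx/raw_data.hpp>
    #include <vector>

    void example(const std::vector<migraphx::argument>& xs)
    {
        migraphx::visit_all(xs)([&](auto views) {
            // views is a std::vector<tensor_view<T>> sharing one element type T;
            // mixed element types throw "Types must be the same".
            (void)views;
        });
    }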
src/targets/cpu/lowering.cpp

@@ -530,17 +530,26 @@ struct cpu_softmax
     std::string name() const { return "cpu::softmax"; }
     shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
 
-    template <typename T>
-    std::size_t compute_batch_index(T idx, shape& batch_shape, int axis) const
-    {
-        idx[axis] = 0;
-        return batch_shape.index(idx);
-    }
+    std::vector<size_t> compute_batch_indices(size_t idx, const shape& s) const
+    {
+        std::vector<std::size_t> indices(s.lens().size());
+        std::transform(s.strides().begin(),
+                       s.strides().end(),
+                       s.lens().begin(),
+                       indices.begin(),
+                       [&](std::size_t stride, std::size_t len) {
+                           assert(len > 0 and stride > 0);
+                           return (idx / stride) % len;
+                       });
+        return indices;
+    }
 
     argument compute(context&, const shape& output_shape, std::vector<argument> args) const
     {
         argument result{output_shape};
         auto batch_lens = output_shape.lens();
+        size_t n_dims   = batch_lens[op.axis];
         batch_lens[op.axis] = 1;
         shape batch_shape{shape::int32_type, batch_lens};
@@ -548,26 +557,34 @@ struct cpu_softmax
             using value_type = typename decltype(input)::value_type;
             std::vector<value_type> batch_max(batch_shape.elements(),
                                               std::numeric_limits<value_type>::lowest());
-            shape_for_each(output_shape, [&](auto idx) {
-                auto index       = this->compute_batch_index(idx, batch_shape, op.axis);
-                batch_max[index] = std::max(batch_max[index], input(idx.begin(), idx.end()));
-            });
-            std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
-            shape_for_each(output_shape, [&](auto idx) {
-                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
-                output(idx.begin(), idx.end()) =
-                    std::exp(input(idx.begin(), idx.end()) - batch_max[index]);
-            });
-            shape_for_each(output_shape, [&](auto idx) {
-                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
-                batch_sum[index] += output(idx.begin(), idx.end());
-            });
-            shape_for_each(output_shape, [&](auto idx) {
-                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
-                output(idx.begin(), idx.end()) /= batch_sum[index];
-            });
+            par_for(batch_shape.elements(), [&](auto i) {
+                auto idx = this->compute_batch_indices(i, batch_shape);
+                for(size_t j = 0; j < n_dims; ++j)
+                {
+                    idx[op.axis] = j;
+                    batch_max[i] = std::max(batch_max[i], input(idx.begin(), idx.end()));
+                }
+                std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
+                for(size_t j = 0; j < n_dims; ++j)
+                {
+                    idx[op.axis]  = j;
+                    size_t index  = output_shape.index(idx);
+                    output[index] = std::exp(input[index] - batch_max[i]);
+                }
+                for(size_t j = 0; j < n_dims; ++j)
+                {
+                    idx[op.axis] = j;
+                    batch_sum[i] += output(idx.begin(), idx.end());
+                }
+                for(size_t j = 0; j < n_dims; ++j)
+                {
+                    idx[op.axis] = j;
+                    output(idx.begin(), idx.end()) /= batch_sum[i];
+                }
+            });
         });
@@ -588,48 +605,65 @@ struct cpu_logsoftmax
     std::string name() const { return "cpu::logsoftmax"; }
     shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
 
-    template <typename T>
-    std::size_t compute_batch_index(T idx, const shape& batch_shape, int axis) const
-    {
-        idx[axis] = 0;
-        return batch_shape.index(idx);
-    }
+    std::vector<size_t> compute_batch_indices(size_t idx, const shape& s) const
+    {
+        std::vector<std::size_t> indices(s.lens().size());
+        std::transform(s.strides().begin(),
+                       s.strides().end(),
+                       s.lens().begin(),
+                       indices.begin(),
+                       [&](std::size_t stride, std::size_t len) {
+                           assert(len > 0 and stride > 0);
+                           return (idx / stride) % len;
+                       });
+        return indices;
+    }
 
     argument compute(context&, const shape& output_shape, std::vector<argument> args) const
     {
         argument result{output_shape};
         auto batch_lens = output_shape.lens();
+        size_t n_dims   = batch_lens[op.axis];
         batch_lens[op.axis] = 1;
         shape batch_shape{shape::int32_type, batch_lens};
 
+        // use a parallel implementation to achieve better performance:
+        // one thread for one batch
         visit_all(result, args[0])([&](auto output, auto input) {
             using value_type = typename decltype(input)::value_type;
             std::vector<value_type> batch_max(batch_shape.elements(),
                                               std::numeric_limits<value_type>::lowest());
-            shape_for_each(output_shape, [&](auto idx) {
-                auto index       = this->compute_batch_index(idx, batch_shape, op.axis);
-                batch_max[index] = std::max(batch_max[index], input(idx.begin(), idx.end()));
-            });
-            std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
-            shape_for_each(output_shape, [&](auto idx) {
-                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
-                output(idx.begin(), idx.end()) = input(idx.begin(), idx.end()) - batch_max[index];
-            });
-            shape_for_each(output_shape, [&](auto idx) {
-                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
-                batch_sum[index] += std::exp(output(idx.begin(), idx.end()));
-            });
+            par_for(batch_shape.elements(), [&](auto i) {
+                auto idx = this->compute_batch_indices(i, batch_shape);
+                for(size_t j = 0; j < n_dims; ++j)
+                {
+                    idx[op.axis] = j;
+                    batch_max[i] = std::max(batch_max[i], input(idx.begin(), idx.end()));
+                }
+                std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
+                for(size_t j = 0; j < n_dims; ++j)
+                {
+                    idx[op.axis]  = j;
+                    size_t index  = output_shape.index(idx);
+                    output[index] = input[index] - batch_max[i];
+                }
+                for(size_t j = 0; j < n_dims; ++j)
+                {
+                    idx[op.axis] = j;
+                    batch_sum[i] += std::exp(output(idx.begin(), idx.end()));
+                }
 
             for(std::size_t i = 0; i < batch_sum.size(); ++i)
             {
                 batch_sum[i] = std::log(batch_sum[i]);
             }
 
-            shape_for_each(output_shape, [&](auto idx) {
-                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
-                output(idx.begin(), idx.end()) -= batch_sum[index];
-            });
+                for(size_t j = 0; j < n_dims; ++j)
+                {
+                    idx[op.axis] = j;
+                    output(idx.begin(), idx.end()) -= batch_sum[i];
+                }
+            });
         });
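
For intuition, compute_batch_indices recovers a multi-dimensional index from a flat element offset with (idx / stride) % len per dimension; a standalone sketch of the arithmetic (hypothetical, outside MIGraphX):

    // Hypothetical standalone sketch of the (idx / stride) % len recovery.
    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main()
    {
        // A packed 2x1x4 batch shape has strides {4, 4, 1}.
        std::vector<std::size_t> lens    = {2, 1, 4};
        std::vector<std::size_t> strides = {4, 4, 1};
        std::size_t idx = 6; // flat offset
        for(std::size_t i = 0; i < lens.size(); i++)
            std::cout << (idx / strides[i]) % lens[i] << ' '; // prints: 1 0 2
        std::cout << '\n'; // and 1*4 + 0*4 + 2*1 == 6 maps it back
        return 0;
    }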
src/targets/gpu/device/concat.cpp

@@ -10,22 +10,20 @@ namespace gpu {
 namespace device {
 
 argument concat(hipStream_t stream,
-                const migraphx::shape& output_shape,
+                const migraphx::shape&,
                 std::vector<migraphx::argument> args,
                 std::vector<std::size_t> offsets)
 {
-    for(std::size_t l = 0; l < args.size() - 1; l++)
+    auto ninputs = args.size() - 1;
+    for(std::size_t j = 0; j < ninputs; j++)
     {
-        auto argl             = args[l];
-        std::size_t nelements = argl.get_shape().elements();
-        visit_all(args.back(), argl)([&](auto output, auto input) {
-            visit_tensor_size(output_shape.lens().size(), [&](auto ndim) {
-                auto* outptr      = output.data() + offsets[l];
-                const auto* inptr = input.data();
-                hip_tensor_descriptor<ndim> desc_input(input.get_shape());
-                hip_tensor_descriptor<ndim> desc_output(output.get_shape());
-                gs_launch(stream, nelements)(
-                    [=](auto i) { outptr[desc_output.linear(desc_input.multi(i))] = inptr[i]; });
-            });
+        auto&& arg            = args[j];
+        std::size_t nelements = arg.get_shape().elements();
+        auto offset           = offsets[j];
+        hip_visit_all(args.back(), arg)([&](auto output, auto input) {
+            gs_launch(stream, nelements)([=](auto i) {
+                auto idx = output.get_shape().index(input.get_shape().multi(i));
+                output.data()[idx + offset] = input.data()[i];
+            });
         });
     }
src/targets/gpu/device/gather.cpp

@@ -11,35 +11,30 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
 
-argument gather(hipStream_t stream,
-                const migraphx::shape& output_shape,
-                std::vector<migraphx::argument> args,
-                int axis)
+argument gather(hipStream_t stream, argument result, argument arg1, argument arg2, int axis)
 {
-    auto axis_index = (axis < 0) ? (axis + args[0].get_shape().lens().size()) : axis;
-    visit_all(args.back(), args[0])([&](auto output, auto input) {
-        std::size_t nelements = output_shape.elements();
-        args[1].visit([&](auto indices) {
-            const auto* indices_ptr = device_cast(indices.data());
-            auto* out_ptr           = device_cast(output.data());
-            const auto* in_ptr      = device_cast(input.data());
-            auto& input_shape       = args[0].get_shape();
-            auto lens               = input_shape.lens();
-            lens[axis_index]        = args[1].get_shape().elements();
-            migraphx::shape out_comp_shape{output_shape.type(), lens};
-            visit_tensor_size(out_comp_shape.lens().size(), [&](auto n_out_dim) {
-                hip_tensor_descriptor<n_out_dim> desc_input(input_shape);
-                hip_tensor_descriptor<n_out_dim> desc_output(out_comp_shape);
-                gs_launch(stream, nelements)([=](auto ii) {
-                    auto in_idx        = desc_output.multi(ii);
-                    in_idx[axis_index] = indices_ptr[in_idx[axis_index]];
-                    out_ptr[ii]        = in_ptr[desc_input.linear(in_idx)];
-                });
-            });
-        });
-    });
-    return args.back();
+    auto axis_index   = (axis < 0) ? (axis + arg1.get_shape().lens().size()) : axis;
+    auto& input_shape = arg1.get_shape();
+    auto lens         = input_shape.lens();
+    lens[axis_index]  = arg2.get_shape().elements();
+    shape out_comp_shape{result.get_shape().type(), lens};
+    std::size_t nelements = result.get_shape().elements();
+    visit_all(result, arg1)([&](auto output, auto input_v) {
+        hip_visit_views(input_v, out_comp_shape)([&](auto input, auto out_comp) {
+            arg2.visit([&](auto indices) {
+                const auto* indices_ptr = device_cast(indices.data());
+                auto* output_ptr        = device_cast(output.data());
+                gs_launch(stream, nelements)([=](auto i) {
+                    auto idx        = out_comp.multi(i);
+                    idx[axis_index] = indices_ptr[idx[axis_index]];
+                    output_ptr[i]   = input[idx];
+                });
+            });
+        });
+    });
+    return result;
 }
 
 } // namespace device
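
The kernel implements the usual gather semantics along one axis: each output position replaces its coordinate on the axis with a lookup through the indices tensor. The 1-D case, as a scalar reference (hypothetical):

    // Hypothetical scalar reference for gather on a 1-D input.
    #include <cstddef>
    #include <vector>

    std::vector<float> gather_ref(const std::vector<float>& data,
                                  const std::vector<std::size_t>& indices)
    {
        std::vector<float> out;
        out.reserve(indices.size());
        for(std::size_t i : indices)
            out.push_back(data[i]); // out.size() == indices.size()
        return out;
    }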
src/targets/gpu/device/include/migraphx/gpu/device/array.hpp (new file, 0 → 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ARRAY_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ARRAY_HPP

#include <migraphx/gpu/device/types.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

template <class T, std::size_t N>
struct hip_array
{
    T d[N];

    MIGRAPHX_DEVICE_CONSTEXPR T& operator[](std::size_t i) { return d[i]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& operator[](std::size_t i) const { return d[i]; }

    MIGRAPHX_DEVICE_CONSTEXPR T* data() { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR const T* data() const { return d; }

    MIGRAPHX_DEVICE_CONSTEXPR std::integral_constant<std::size_t, N> size() const { return {}; }

    MIGRAPHX_DEVICE_CONSTEXPR T* begin() { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR const T* begin() const { return d; }

    MIGRAPHX_DEVICE_CONSTEXPR T* end() { return d + size(); }
    MIGRAPHX_DEVICE_CONSTEXPR const T* end() const { return d + size(); }

    MIGRAPHX_DEVICE_CONSTEXPR T dot(const hip_array& x) const
    {
        T result = 0;
        for(std::size_t i = 0; i < N; i++)
            result += x[i] * d[i];
        return result;
    }

    MIGRAPHX_DEVICE_CONSTEXPR T product() const
    {
        T result = 1;
        for(std::size_t i = 0; i < N; i++)
            result *= d[i];
        return result;
    }

    friend MIGRAPHX_DEVICE_CONSTEXPR hip_array operator*(const hip_array& x, const hip_array& y)
    {
        hip_array result;
        for(std::size_t i = 0; i < N; i++)
            result[i] = x[i] * y[i];
        return result;
    }
};

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
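
dot() is the multi-index-to-linear-offset primitive that hip_shape::index (below) builds on; a host-side sketch (hypothetical; the header needs a HIP-capable compiler since types.hpp pulls in hip_runtime):

    // Hypothetical host-side sketch of hip_array::dot and product.
    #include <migraphx/gpu/device/array.hpp>
    #include <cassert>

    int main()
    {
        migraphx::gpu::device::hip_array<std::size_t, 3> idx     = {{1, 0, 2}};
        migraphx::gpu::device::hip_array<std::size_t, 3> strides = {{4, 4, 1}};
        assert(idx.dot(strides) == 6);   // 1*4 + 0*4 + 2*1
        assert(strides.product() == 16); // 4*4*1
        return 0;
    }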
src/targets/gpu/device/include/migraphx/gpu/device/nary.hpp

(Diff collapsed in this view.)
src/targets/gpu/device/include/migraphx/gpu/device/shape.hpp (new file, 0 → 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_SHAPE_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_SHAPE_HPP

#include <migraphx/gpu/device/array.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

template <std::size_t N>
struct hip_shape
{
    using hip_index = hip_array<std::size_t, N>;

    hip_array<std::size_t, N> lens    = {};
    hip_array<std::size_t, N> strides = {};
    bool standard                     = false;

    __device__ __host__ hip_shape() = default;

    hip_shape(const shape& s) : standard(s.standard())
    {
        assert(s.lens().size() == N);
        assert(s.strides().size() == N);
        std::copy(s.lens().begin(), s.lens().end(), lens.begin());
        std::copy(s.strides().begin(), s.strides().end(), strides.begin());
    }

    MIGRAPHX_DEVICE_CONSTEXPR std::size_t elements() const { return lens.product(); }

    MIGRAPHX_DEVICE_CONSTEXPR std::size_t index(hip_index x) const { return x.dot(strides); }

    MIGRAPHX_DEVICE_CONSTEXPR std::size_t index(std::initializer_list<std::size_t> x) const
    {
        std::size_t idx = 0;
        for(std::size_t i = 0; i < x.size(); i++)
            idx += *(x.begin() + i) * strides[i];
        return idx;
    }

    MIGRAPHX_DEVICE_CONSTEXPR std::size_t index(std::size_t i) const
    {
        if(this->standard)
            return i;
        else
        {
            const std::size_t rank = this->lens.size();
            std::size_t s          = 1;
            std::size_t result     = 0;
            for(std::size_t j = 0; j < this->lens.size(); j++)
            {
                const std::size_t k      = rank - j - 1;
                const std::size_t stride = this->strides[k];
                const std::size_t len    = this->lens[k];
                const std::size_t slen   = s * len;
                const std::size_t idx    = (i % slen) / s;
                result += stride * idx;
                s = slen;
            }
            return result;
        }
    }

    MIGRAPHX_DEVICE_CONSTEXPR hip_index multi(std::size_t idx) const
    {
        hip_index result;
        std::size_t tidx = idx;
        for(std::size_t is = 0; is < result.size(); is++)
        {
            result[is] = tidx / strides[is];
            tidx       = tidx % strides[is];
        }
        return result;
    }
};

template <std::size_t N>
hip_shape<N> make_hip_shape(const shape& x)
{
    return x;
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
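
For a standard (packed, row-major) shape the strides descend and multi() and index() are inverses; a sketch (hypothetical):

    // Hypothetical sketch: multi/index round-trip on a standard shape.
    #include <migraphx/gpu/device/shape.hpp>
    #include <migraphx/shape.hpp>
    #include <cassert>

    int main()
    {
        migraphx::shape s{migraphx::shape::float_type, {2, 3, 4}}; // strides {12, 4, 1}
        auto hs = migraphx::gpu::device::make_hip_shape<3>(s);
        auto m  = hs.multi(17);    // {1, 1, 1}
        assert(hs.index(m) == 17); // 1*12 + 1*4 + 1*1
        return 0;
    }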
src/targets/gpu/device/include/migraphx/gpu/device/tensor.hpp

 #ifndef MIGRAPHX_GUARD_RTGLIB_DEAVICE_TENSOR_HPP
 #define MIGRAPHX_GUARD_RTGLIB_DEAVICE_TENSOR_HPP
 
 #include <hip/hip_runtime.h>
 #include <migraphx/functional.hpp>
 #include <migraphx/config.hpp>
+#include <migraphx/gpu/device/visit.hpp>
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
 
-template <class F>
-void visit_tensor_size(std::size_t n, F f)
-{
-    switch(n)
-    {
-    case 1: { f(std::integral_constant<std::size_t, 1>{}); break; }
-    case 2: { f(std::integral_constant<std::size_t, 2>{}); break; }
-    case 3: { f(std::integral_constant<std::size_t, 3>{}); break; }
-    case 4: { f(std::integral_constant<std::size_t, 4>{}); break; }
-    case 5: { f(std::integral_constant<std::size_t, 5>{}); break; }
-    default: throw std::runtime_error("Unknown tensor size");
-    }
-}
-
-template <size_t NDim>
-struct hip_index
-{
-    size_t d[NDim];
-    __device__ __host__ size_t& operator[](size_t i) { return d[i]; }
-    __device__ __host__ size_t operator[](size_t i) const { return d[i]; }
-};
+template <std::size_t NDim>
+using hip_tensor_index = hip_array<std::size_t, NDim>;
 
-template <size_t NDim>
+template <std::size_t NDim>
 struct hip_tensor_descriptor
 {
     __device__ __host__ hip_tensor_descriptor() = default;
@@ -63,26 +22,26 @@ struct hip_tensor_descriptor
         std::copy(s.strides().begin(), s.strides().end(), strides);
     }
 
-    __device__ __host__ hip_index<NDim> multi(size_t idx) const
+    __device__ __host__ hip_tensor_index<NDim> multi(std::size_t idx) const
     {
-        hip_index<NDim> result{};
-        size_t tidx = idx;
-        for(size_t is = 0; is < NDim; is++)
+        hip_tensor_index<NDim> result{};
+        std::size_t tidx = idx;
+        for(std::size_t is = 0; is < NDim; is++)
         {
             result[is] = tidx / strides[is];
             tidx       = tidx % strides[is];
         }
         return result;
     }
 
-    __device__ __host__ size_t linear(hip_index<NDim> s) const
+    __device__ __host__ std::size_t linear(hip_tensor_index<NDim> s) const
     {
-        size_t idx = 0;
-        for(size_t i = 0; i < NDim; i++)
+        std::size_t idx = 0;
+        for(std::size_t i = 0; i < NDim; i++)
             idx += s[i] * strides[i];
         return idx;
     }
 
-    size_t lens[NDim]    = {};
-    size_t strides[NDim] = {};
+    std::size_t lens[NDim]    = {};
+    std::size_t strides[NDim] = {};
 };
 
 } // namespace device
src/targets/gpu/device/include/migraphx/gpu/device/tensor_view.hpp (new file, 0 → 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_TENSOR_VIEW_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_TENSOR_VIEW_HPP

#include <migraphx/gpu/device/shape.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

template <class T, std::size_t N>
struct hip_tensor_view
{
    using value_type = T;
    using hip_index  = typename hip_shape<N>::hip_index;

    __device__ __host__ hip_tensor_view() = default;
    __host__ hip_tensor_view(tensor_view<T> x) : d(x.data()), s(x.get_shape()) {}
    __host__ hip_tensor_view(T* x, const shape& ss) : d(x), s(ss) {}

    MIGRAPHX_DEVICE_CONSTEXPR const hip_shape<N>& get_shape() const { return s; }

    MIGRAPHX_DEVICE_CONSTEXPR std::size_t size() const { return s.elements(); }

    MIGRAPHX_DEVICE_CONSTEXPR value_type* data() const { return d; }

    template <class U>
    MIGRAPHX_DEVICE_CONSTEXPR value_type& operator[](U i) const
    {
        return d[s.index(i)];
    }

    MIGRAPHX_DEVICE_CONSTEXPR value_type* begin() const { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR value_type* end() const { return d + size(); }

    private:
    value_type* d = nullptr;
    hip_shape<N> s{};
};

template <std::size_t N, class T>
hip_tensor_view<T, N> make_hip_view(const shape& s, T* x)
{
    return {x, s};
}

template <std::size_t N, class T>
hip_tensor_view<T, N> make_hip_view(tensor_view<T> x)
{
    return {x};
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
src/targets/gpu/device/include/migraphx/gpu/device/types.hpp

@@ -8,14 +8,45 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_TYPES_HPP
 #define MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_TYPES_HPP
 
 #include <hip/hip_runtime.h>
 #include <migraphx/half.hpp>
 #include <migraphx/config.hpp>
+#include <migraphx/tensor_view.hpp>
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
 
+#define MIGRAPHX_DEVICE_CONSTEXPR constexpr __device__ __host__ // NOLINT
+
+template <class T, std::size_t N>
+using vec = T __attribute__((ext_vector_type(N)));
+
+template <std::size_t N, class T>
+__device__ __host__ T* as_pointer(vec<T, N>* x)
+{
+    return reinterpret_cast<T*>(x);
+}
+
+template <std::size_t N, class T>
+__device__ __host__ vec<T, N>* as_vec(T* x)
+{
+    return reinterpret_cast<vec<T, N>*>(x);
+}
+
+template <std::size_t N, class T>
+tensor_view<vec<T, N>> as_vec(tensor_view<T> x)
+{
+    return {x.get_shape(), as_vec<N>(x.data())};
+}
+
+template <std::size_t N, class... Ts>
+auto pack_vec(Ts... xs)
+{
+    return [=](auto f, std::size_t n) { return f(as_vec<N>(xs)[n]...); };
+}
+
 using gpu_half = __fp16;
 
 namespace detail {
@@ -25,6 +56,12 @@ struct device_type
     using type = T;
 };
 
+template <class T, std::size_t N>
+struct device_type<vec<T, N>>
+{
+    using type = vec<typename device_type<T>::type, N>;
+};
+
 template <>
 struct device_type<half>
 {
@@ -38,7 +75,7 @@ struct host_type
 };
 
 template <>
-struct device_type<gpu_half>
+struct host_type<gpu_half>
 {
     using type = half;
 };
@@ -75,6 +112,12 @@ device_type<T>* device_cast(T* x)
     return reinterpret_cast<device_type<T>*>(x);
 }
 
+template <class T>
+tensor_view<device_type<T>> device_cast(tensor_view<T> x)
+{
+    return {x.get_shape(), reinterpret_cast<device_type<T>*>(x.data())};
+}
+
 template <class T>
 __device__ __host__ T to_hip_type(T x)
 {
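
vec<T, N> maps onto clang's ext_vector_type, and as_vec simply reinterprets a scalar pointer as a vector pointer, so the underlying buffer length must be a multiple of N; a sketch (hypothetical):

    // Hypothetical sketch of as_vec on a scalar buffer.
    #include <migraphx/gpu/device/types.hpp>

    void example()
    {
        float buf[8] = {};
        auto* v = migraphx::gpu::device::as_vec<4>(buf); // vec<float, 4>*, 2 elements
        v[0] = v[0] + 1.0f;                              // updates 4 lanes at once
        float* p = migraphx::gpu::device::as_pointer(v); // back to float*
        (void)p;
    }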
src/targets/gpu/device/include/migraphx/gpu/device/vector.hpp (new file, 0 → 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_VECTOR_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_VECTOR_HPP

#include <migraphx/gpu/device/types.hpp>
#include <vector>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

template <class T, std::size_t N>
struct hip_vector
{
    MIGRAPHX_DEVICE_CONSTEXPR hip_vector() = default;
    MIGRAPHX_DEVICE_CONSTEXPR hip_vector(std::size_t s) : len(s) {}

    template <class Iterator>
    __device__ __host__ hip_vector(Iterator start, Iterator last)
    {
        auto it = std::copy(start, last, d);
        len     = std::distance(d, it);
    }

    __device__ __host__ hip_vector(std::initializer_list<T> x)
    {
        std::copy(x.begin(), x.end(), d);
        len = x.size();
    }

    MIGRAPHX_DEVICE_CONSTEXPR T& operator[](std::size_t i) { return d[i]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& operator[](std::size_t i) const { return d[i]; }

    MIGRAPHX_DEVICE_CONSTEXPR T& front() { return d[0]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& front() const { return d[0]; }

    MIGRAPHX_DEVICE_CONSTEXPR T& back() { return d[size() - 1]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& back() const { return d[size() - 1]; }

    MIGRAPHX_DEVICE_CONSTEXPR T* data() { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR const T* data() const { return d; }

    MIGRAPHX_DEVICE_CONSTEXPR std::size_t size() const { return len; }

    MIGRAPHX_DEVICE_CONSTEXPR T* begin() { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR const T* begin() const { return d; }

    MIGRAPHX_DEVICE_CONSTEXPR T* end() { return d + size(); }
    MIGRAPHX_DEVICE_CONSTEXPR const T* end() const { return d + size(); }

    template <class U>
    MIGRAPHX_DEVICE_CONSTEXPR void push_back(U&& x)
    {
        d[len] = static_cast<U&&>(x);
        len++;
    }

    private:
    T d[N]          = {};
    std::size_t len = 0;
};

template <std::size_t N, class T>
hip_vector<T, N> to_hip_vector(const std::vector<T>& x)
{
    hip_vector<T, N> result(x.size());
    std::copy(x.begin(), x.end(), result.begin());
    return result;
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
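
hip_vector is a fixed-capacity inline buffer (no heap allocation, so it can be copied by value into kernels); note that push_back does not bounds-check against N. A sketch (hypothetical):

    // Hypothetical sketch of hip_vector as a fixed-capacity inline vector.
    #include <migraphx/gpu/device/vector.hpp>
    #include <cassert>

    int main()
    {
        migraphx::gpu::device::hip_vector<int, 4> v = {1, 2};
        v.push_back(3); // caller must keep size() <= 4
        assert(v.size() == 3 and v.back() == 3);
        return 0;
    }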
src/targets/gpu/device/include/migraphx/gpu/device/visit.hpp (new file, 0 → 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_VISIT_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_VISIT_HPP

#include <migraphx/gpu/device/tensor_view.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

template <class F>
void visit_tensor_size(std::size_t n, F f)
{
    switch(n)
    {
    case 1: { f(std::integral_constant<std::size_t, 1>{}); break; }
    case 2: { f(std::integral_constant<std::size_t, 2>{}); break; }
    case 3: { f(std::integral_constant<std::size_t, 3>{}); break; }
    case 4: { f(std::integral_constant<std::size_t, 4>{}); break; }
    case 5: { f(std::integral_constant<std::size_t, 5>{}); break; }
    default: throw std::runtime_error("Unknown tensor size");
    }
}

inline shape get_shape(const shape& x) { return x; }

template <class T>
auto get_shape(const T& x) -> decltype(x.get_shape())
{
    return x.get_shape();
}

template <class V, class F, class... Ts>
void hip_visit_all_impl(const shape& s, F f, V&& v, Ts&&... xs)
{
    std::initializer_list<migraphx::shape::type_t> types = {get_shape(xs).type()...};
    if(!std::all_of(types.begin(), types.end(), [&](migraphx::shape::type_t t) {
           return t == s.type();
       }))
        MIGRAPHX_THROW("Types must be the same");
    std::initializer_list<std::size_t> ranks = {get_shape(xs).lens().size()...};
    if(!std::all_of(
           ranks.begin(), ranks.end(), [&](std::size_t r) { return r == s.lens().size(); }))
        MIGRAPHX_THROW("Ranks must be the same");
    visit_tensor_size(s.lens().size(), [&](auto ndim) {
        s.visit_type([&](auto as) { v(f(xs, ndim, as)...); });
    });
}

template <class V, class F, class... Ts>
void hip_visit_views_impl(const shape& s, F f, V&& v, Ts&&... xs)
{
    std::initializer_list<std::size_t> ranks = {get_shape(xs).lens().size()...};
    if(!std::all_of(
           ranks.begin(), ranks.end(), [&](std::size_t r) { return r == s.lens().size(); }))
        MIGRAPHX_THROW("Ranks must be the same");
    visit_tensor_size(s.lens().size(), [&](auto ndim) { v(f(xs, ndim)...); });
}

template <class F>
struct hip_convert
{
    F f;

    template <class RawData, class N, class As>
    auto operator()(RawData x, N ndim, As as) const
        -> decltype(make_hip_view<ndim>(x.get_shape(), f(as.from(x.data()))))
    {
        return make_hip_view<ndim>(x.get_shape(), f(as.from(x.data())));
    }

    template <class N, class As>
    auto operator()(const shape& s, N ndim, As) const
    {
        return make_hip_shape<ndim>(s);
    }
};

template <class F>
hip_convert<F> make_hip_convert(F f)
{
    return {f};
}

template <class F>
struct hip_convert_view
{
    F f;

    template <class T, class N>
    auto operator()(tensor_view<T> x, N ndim) const
    {
        return make_hip_view<ndim>(f(x));
    }

    template <class N>
    auto operator()(const shape& s, N ndim) const
    {
        return make_hip_shape<ndim>(s);
    }
};

template <class F>
hip_convert_view<F> make_hip_convert_view(F f)
{
    return {f};
}

template <class T, class... Ts>
auto hip_visit_all(T&& x, Ts&&... xs)
{
    return [&](auto f) {
        hip_visit_all_impl(
            get_shape(x), make_hip_convert([](auto* p) { return device_cast(p); }), f, x, xs...);
    };
}

template <std::size_t N, class T, class... Ts>
auto hip_vec_visit_all(T&& x, Ts&&... xs)
{
    return [&](auto f) {
        hip_visit_all_impl(get_shape(x),
                           make_hip_convert([](auto* p) { return as_vec<N>(device_cast(p)); }),
                           f,
                           x,
                           xs...);
    };
}

template <class T, class... Ts>
auto hip_pointer_visit_all(T&& x, Ts&&... xs)
{
    return [&](auto f) {
        visit_all(x, xs...)([&](auto... vs) { f(device_cast(vs.data())...); });
    };
}

template <class T, class... Ts>
auto hip_visit_views(T&& x, Ts&&... xs)
{
    return [&](auto f) {
        hip_visit_views_impl(get_shape(x),
                             make_hip_convert_view([](auto v) { return device_cast(v); }),
                             f,
                             x,
                             xs...);
    };
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
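
hip_visit_all is the host-side entry point used by the rewritten kernels below: it checks that every argument shares the first argument's element type and rank, then hands the callback fixed-rank, device-ready views. A sketch (hypothetical):

    // Hypothetical sketch of calling hip_visit_all.
    #include <migraphx/argument.hpp>
    #include <migraphx/gpu/device/visit.hpp>

    void example(const migraphx::argument& result, const migraphx::argument& input)
    {
        migraphx::gpu::device::hip_visit_all(result, input)([&](auto out, auto in) {
            // out/in are hip_tensor_view<T, N>: copyable by value, so they can
            // be captured into a __device__ lambda (e.g. via gs_launch).
            (void)out;
            (void)in;
        });
    }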
src/targets/gpu/device/logsoftmax.cpp

 #include <migraphx/shape.hpp>
 #include <migraphx/argument.hpp>
 #include <migraphx/gpu/device/logsoftmax.hpp>
+#include <migraphx/gpu/device/reduce_opers.hpp>
 #include <migraphx/gpu/device/tensor.hpp>
 #include <migraphx/gpu/device/launch.hpp>
 #include <migraphx/gpu/device/types.hpp>
@@ -11,127 +12,82 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
 
-argument logsoftmax(hipStream_t stream,
-                    const migraphx::shape& output_shape,
-                    std::vector<migraphx::argument> args,
-                    int axis)
-{
-    auto lens         = output_shape.lens();
-    auto num_in_batch = lens[axis];
-    auto batch_lens   = lens;
-    batch_lens[axis]  = 1;
-    migraphx::shape batch_shape{output_shape.type(), batch_lens};
-
-    visit_all(args.back(), args.front())([&](auto output, auto input) {
-        const auto* input_ptr = device_cast(input.data());
-        auto* output_ptr      = device_cast(output.data());
-        visit_tensor_size(batch_shape.lens().size(), [&](auto n_dim) {
-            hip_tensor_descriptor<n_dim> desc_batch(batch_shape);
-            hip_tensor_descriptor<n_dim> desc_data(output_shape);
-
-            // use one block for items in one batch.
-            // opt 1, load all data to lds then use the same approach as
-            // the current optimization
-            const size_t block_size = 1024;
-            launch(stream, batch_shape.elements() * block_size, block_size)(
-                [=](auto idx) __device__ {
-                    size_t thr_idx = idx.local;
-                    size_t blk_idx = idx.group;
-                    using type =
-                        device_type<std::remove_cv_t<typename decltype(output)::value_type>>;
-                    // all data can be loaded to the lds once, so all operations are
-                    // done in lds
-                    MIGRAPHX_DEVICE_SHARED type lds_data[block_size + 2];
-                    auto batch_idx = desc_batch.multi(blk_idx);
-                    auto data_idx  = batch_idx;
-                    // load data to lds and compute the batch max
-                    size_t item_num      = num_in_batch;
-                    lds_data[block_size] = input_ptr[0];
-                    for(size_t i = thr_idx; i < num_in_batch; i += block_size)
-                    {
-                        data_idx[axis] = i;
-                        lds_data[i]    = input_ptr[desc_data.linear(data_idx)];
-                        __syncthreads();
-                        auto size   = (item_num > block_size) ? block_size : item_num;
-                        auto stride = (size + 1) / 2;
-                        while(true)
-                        {
-                            if(thr_idx + stride < size)
-                            {
-                                lds_data[thr_idx] = ::max(to_hip_type(lds_data[thr_idx]),
-                                                          to_hip_type(lds_data[thr_idx + stride]));
-                            }
-                            __syncthreads();
-                            size   = stride;
-                            stride = (stride + 1) / 2;
-                            if(size == 1)
-                                break;
-                        }
-                        if(thr_idx == 0)
-                        {
-                            lds_data[block_size] = (lds_data[0] < lds_data[block_size])
-                                                       ? lds_data[block_size]
-                                                       : lds_data[0];
-                        }
-                        __syncthreads();
-                        item_num -= block_size;
-                    }
-                    const size_t block_size1 = block_size + 1;
-                    lds_data[block_size1]    = 0;
-                    item_num                 = num_in_batch;
-                    for(size_t i = thr_idx; i < num_in_batch; i += block_size)
-                    {
-                        data_idx[axis] = i;
-                        lds_data[i] = input_ptr[desc_data.linear(data_idx)] - lds_data[block_size];
-                        lds_data[i] = ::exp(to_hip_type(lds_data[i]));
-                        __syncthreads();
-                        auto size   = (item_num > block_size) ? block_size : item_num;
-                        auto stride = (size + 1) / 2;
-                        while(true)
-                        {
-                            if(thr_idx + stride < size)
-                            {
-                                lds_data[thr_idx] += lds_data[thr_idx + stride];
-                            }
-                            __syncthreads();
-                            size   = stride;
-                            stride = (stride + 1) / 2;
-                            if(size == 1)
-                                break;
-                        }
-                        if(thr_idx == 0)
-                        {
-                            lds_data[block_size1] += lds_data[0];
-                        }
-                        __syncthreads();
-                        item_num -= block_size;
-                    }
-                    auto log_batch_sum =
-                        ::log(to_hip_type(lds_data[block_size1])) + lds_data[block_size];
-                    item_num = num_in_batch;
-                    for(size_t i = thr_idx; i < num_in_batch; i += block_size)
-                    {
-                        data_idx[axis]    = i;
-                        size_t index      = desc_data.linear(data_idx);
-                        output_ptr[index] = input_ptr[index] - log_batch_sum;
-                    }
-                });
-        });
-    });
-    return args.back();
-}
+void logsoftmax(hipStream_t stream, argument result, argument arg, int axis)
+{
+    auto lens           = result.get_shape().lens();
+    auto batch_item_num = lens[axis];
+    auto batch_lens     = lens;
+    batch_lens[axis]    = 1;
+    migraphx::shape batch_shape{result.get_shape().type(), batch_lens};
+
+    hip_visit_all(result, arg, batch_shape)([&](auto output, auto input, auto batch) {
+        // use one block for items in one batch.
+        const size_t max_block_size = 1024;
+        size_t block_size           = 1;
+        while(block_size < max_block_size and block_size < batch_item_num)
+        {
+            block_size *= 2;
+        }
+
+        launch(stream, batch_shape.elements() * block_size, block_size)([=](auto idx) __device__ {
+            size_t thr_idx = idx.local;
+            size_t blk_idx = idx.group;
+            using type     = device_type<std::remove_cv_t<typename decltype(output)::value_type>>;
+            MIGRAPHX_DEVICE_SHARED type lds_data[max_block_size + 1];
+            auto batch_idx = batch.multi(blk_idx);
+            auto data_idx  = batch_idx;
+            // load data to lds and compute the batch max
+            size_t remaining_item_num = batch_item_num;
+            size_t round_item_num = (batch_item_num + block_size - 1) / block_size * block_size;
+            lds_data[block_size]  = input[0];
+            for(size_t i = thr_idx; i < round_item_num; i += block_size)
+            {
+                if(i < batch_item_num)
+                {
+                    data_idx[axis]    = i;
+                    lds_data[thr_idx] = input[data_idx];
+                }
+                __syncthreads();
+                auto item_num = (remaining_item_num > block_size) ? block_size : remaining_item_num;
+                reduce_max(lds_data, block_size, thr_idx, item_num);
+                remaining_item_num -= block_size;
+            }
+            auto batch_max = lds_data[block_size];
+            __syncthreads();
+            lds_data[block_size] = 0;
+            remaining_item_num   = batch_item_num;
+            for(size_t i = thr_idx; i < round_item_num; i += block_size)
+            {
+                if(i < batch_item_num)
+                {
+                    data_idx[axis]    = i;
+                    lds_data[thr_idx] = input[data_idx] - batch_max;
+                    lds_data[thr_idx] = ::exp(to_hip_type(lds_data[thr_idx]));
+                }
+                __syncthreads();
+                auto item_num = (remaining_item_num > block_size) ? block_size : remaining_item_num;
+                reduce_sum(lds_data, block_size, thr_idx, item_num);
+                remaining_item_num -= block_size;
+            }
+            auto log_batch_sum = ::log(to_hip_type(lds_data[block_size])) + batch_max;
+            for(size_t i = thr_idx; i < batch_item_num; i += block_size)
+            {
+                data_idx[axis]   = i;
+                output[data_idx] = input[data_idx] - log_batch_sum;
+            }
+        });
+    });
+}
 
 } // namespace device
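
Per batch, the kernel computes the numerically stable form logsoftmax(x_j) = x_j - (log(sum_i exp(x_i - max)) + max); a scalar reference of the same identity (hypothetical, not from the commit):

    // Hypothetical scalar reference for the per-batch logsoftmax identity.
    #include <algorithm>
    #include <cmath>
    #include <vector>

    std::vector<float> logsoftmax_ref(const std::vector<float>& x)
    {
        float batch_max = *std::max_element(x.begin(), x.end());
        float sum       = 0;
        for(float v : x)
            sum += std::exp(v - batch_max);
        float log_batch_sum = std::log(sum) + batch_max;
        std::vector<float> out;
        out.reserve(x.size());
        for(float v : x)
            out.push_back(v - log_batch_sum);
        return out;
    }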
src/targets/gpu/device/pad.cpp

@@ -15,33 +15,26 @@ argument
 pad(hipStream_t stream,
     argument result,
     argument arg1,
     float value,
     std::vector<std::int64_t> pads)
 {
     std::size_t nelements = arg1.get_shape().elements();
-    visit_all(result)([&](auto output) {
-        auto* outptr = device_cast(output.data());
-        using type   = typename decltype(output)::value_type;
-        device_type<type> device_val = value;
+    hip_visit_all(result, arg1)([&](auto output, auto input) {
+        using type      = typename decltype(output)::value_type;
+        using hip_index = typename decltype(output)::hip_index;
+        type device_val = value;
         if(float_equal(value, std::numeric_limits<float>::lowest()))
         {
             device_val = device_cast(std::numeric_limits<type>::lowest());
         }
-        gs_launch(stream, result.get_shape().elements())([=](auto i) { outptr[i] = device_val; });
-    });
-    visit_all(result, arg1)([&](auto output, auto input) {
-        visit_tensor_size(result.get_shape().lens().size(), [&](auto ndim) {
-            std::size_t offsets[ndim];
-            std::copy(pads.begin(), pads.begin() + ndim, offsets);
-            auto* outptr      = output.data();
-            const auto* inptr = input.data();
-            hip_tensor_descriptor<ndim> desc_input(input.get_shape());
-            hip_tensor_descriptor<ndim> desc_output(output.get_shape());
-            gs_launch(stream, nelements)([=](auto i) {
-                auto idx = desc_input.multi(i);
-                for(std::size_t j = 0; j < ndim; j++)
-                {
-                    idx[j] += offsets[j];
-                }
-                outptr[desc_output.linear(idx)] = inptr[i];
-            });
-        });
+        gs_launch(stream, result.get_shape().elements())(
+            [=](auto i) { output.data()[i] = device_val; });
+        hip_index offsets;
+        std::copy(pads.begin(), pads.begin() + offsets.size(), offsets.begin());
+        gs_launch(stream, nelements)([=](auto i) {
+            auto idx = input.get_shape().multi(i);
+            for(std::size_t j = 0; j < offsets.size(); j++)
+            {
+                idx[j] += offsets[j];
+            }
+            output[idx] = input.data()[i];
+        });
     });
     return result;
src/targets/gpu/device/softmax.cpp

@@ -2,6 +2,7 @@
 #include <migraphx/argument.hpp>
 #include <migraphx/dfor.hpp>
 #include <migraphx/gpu/device/softmax.hpp>
+#include <migraphx/gpu/device/reduce_opers.hpp>
 #include <migraphx/gpu/device/tensor.hpp>
 #include <migraphx/gpu/device/launch.hpp>
 #include <migraphx/gpu/device/types.hpp>
@@ -12,122 +13,82 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
 
-argument softmax(hipStream_t stream,
-                 const migraphx::shape& output_shape,
-                 std::vector<migraphx::argument> args,
-                 int axis)
-{
-    auto lens        = output_shape.lens();
-    auto batch_lens  = lens;
-    size_t n_dims    = lens[axis];
-    batch_lens[axis] = 1;
-    migraphx::shape batch_shape{shape::int32_type, batch_lens};
-
-    visit_all(args.back(), args.front())([&](auto output, auto input) {
-        const auto* input_ptr = device_cast(input.data());
-        auto* output_ptr      = device_cast(output.data());
-        visit_tensor_size(batch_shape.lens().size(), [&](auto n_dim) {
-            hip_tensor_descriptor<n_dim> desc_batch(batch_shape);
-            hip_tensor_descriptor<n_dim> desc_data(output_shape);
-            // use one block for items in one batch.
-            const size_t block_size = 1024;
-            launch(stream, batch_shape.elements() * block_size, block_size)(
-                [=](auto idx) __device__ {
-                    size_t thr_idx = idx.local;
-                    size_t blk_idx = idx.group;
-                    using type =
-                        device_type<std::remove_cv_t<typename decltype(output)::value_type>>;
-                    // all data can be loaded to the lds once, so all operations are
-                    // done in lds
-                    MIGRAPHX_DEVICE_SHARED type lds_data[block_size + 2];
-                    auto batch_idx = desc_batch.multi(blk_idx);
-                    auto data_idx  = batch_idx;
-                    // load data to lds and compute the batch max
-                    size_t item_num      = n_dims;
-                    lds_data[block_size] = input_ptr[0];
-                    for(size_t i = thr_idx; i < n_dims; i += block_size)
-                    {
-                        data_idx[axis] = i;
-                        lds_data[i]    = input_ptr[desc_data.linear(data_idx)];
-                        __syncthreads();
-                        auto size   = (item_num > block_size) ? block_size : item_num;
-                        auto stride = (size + 1) / 2;
-                        while(true)
-                        {
-                            if(thr_idx + stride < size)
-                            {
-                                lds_data[thr_idx] = ::max(to_hip_type(lds_data[thr_idx]),
-                                                          to_hip_type(lds_data[thr_idx + stride]));
-                            }
-                            __syncthreads();
-                            size   = stride;
-                            stride = (stride + 1) / 2;
-                            if(size == 1)
-                                break;
-                        }
-                        if(thr_idx == 0)
-                        {
-                            lds_data[block_size] = (lds_data[0] < lds_data[block_size])
-                                                       ? lds_data[block_size]
-                                                       : lds_data[0];
-                        }
-                        __syncthreads();
-                        item_num -= block_size;
-                    }
-                    const size_t block_size1 = block_size + 1;
-                    lds_data[block_size1]    = 0;
-                    item_num                 = n_dims;
-                    for(size_t i = thr_idx; i < n_dims; i += block_size)
-                    {
-                        data_idx[axis] = i;
-                        lds_data[i] = input_ptr[desc_data.linear(data_idx)] - lds_data[block_size];
-                        lds_data[i] = ::exp(to_hip_type(lds_data[i]));
-                        __syncthreads();
-                        auto size   = (item_num > block_size) ? block_size : item_num;
-                        auto stride = (size + 1) / 2;
-                        while(true)
-                        {
-                            if(thr_idx + stride < size)
-                            {
-                                lds_data[thr_idx] += lds_data[thr_idx + stride];
-                            }
-                            __syncthreads();
-                            size   = stride;
-                            stride = (stride + 1) / 2;
-                            if(size == 1)
-                                break;
-                        }
-                        if(thr_idx == 0)
-                        {
-                            lds_data[block_size1] += lds_data[0];
-                        }
-                        __syncthreads();
-                        item_num -= block_size;
-                    }
-                    for(size_t i = thr_idx; i < n_dims; i += block_size)
-                    {
-                        data_idx[axis]    = i;
-                        size_t index      = desc_data.linear(data_idx);
-                        auto val          = input_ptr[index] - lds_data[block_size];
-                        output_ptr[index] = ::exp(to_hip_type(val)) / lds_data[block_size1];
-                    }
-                });
-        });
-    });
-    return args.back();
-}
+void softmax(hipStream_t stream, argument result, argument arg, int axis)
+{
+    auto lens             = result.get_shape().lens();
+    auto batch_lens       = lens;
+    size_t batch_item_num = lens[axis];
+    batch_lens[axis]      = 1;
+    migraphx::shape batch_shape{result.get_shape().type(), batch_lens};
+
+    hip_visit_all(result, arg, batch_shape)([&](auto output, auto input, auto batch) {
+        // use one block for items in one batch.
+        const size_t max_block_size = 1024;
+        size_t block_size           = 1;
+        while(block_size < max_block_size and block_size < batch_item_num)
+        {
+            block_size *= 2;
+        }
+
+        launch(stream, batch_shape.elements() * block_size, block_size)([=](auto idx) __device__ {
+            size_t thr_idx = idx.local;
+            size_t blk_idx = idx.group;
+            using type     = device_type<std::remove_cv_t<typename decltype(output)::value_type>>;
+            MIGRAPHX_DEVICE_SHARED type lds_data[max_block_size + 1];
+            auto batch_idx = batch.multi(blk_idx);
+            auto data_idx  = batch_idx;
+            // load data to lds and compute the batch max
+            size_t remaining_item_num = batch_item_num;
+            size_t round_item_num = (batch_item_num + block_size - 1) / block_size * block_size;
+            lds_data[block_size]  = input[0];
+            for(size_t i = thr_idx; i < round_item_num; i += block_size)
+            {
+                if(i < batch_item_num)
+                {
+                    data_idx[axis]    = i;
+                    lds_data[thr_idx] = input[data_idx];
+                }
+                __syncthreads();
+                auto item_num = (remaining_item_num > block_size) ? block_size : remaining_item_num;
+                reduce_max(lds_data, block_size, thr_idx, item_num);
+                remaining_item_num -= block_size;
+            }
+            auto batch_max = lds_data[block_size];
+            __syncthreads();
+            lds_data[block_size] = 0;
+            remaining_item_num   = batch_item_num;
+            for(size_t i = thr_idx; i < round_item_num; i += block_size)
+            {
+                if(i < batch_item_num)
+                {
+                    data_idx[axis]    = i;
+                    lds_data[thr_idx] = input[data_idx] - batch_max;
+                    lds_data[thr_idx] = ::exp(to_hip_type(lds_data[thr_idx]));
+                }
+                __syncthreads();
+                auto item_num = (remaining_item_num > block_size) ? block_size : remaining_item_num;
+                reduce_sum(lds_data, block_size, thr_idx, item_num);
+                remaining_item_num -= block_size;
+            }
+            auto batch_sum = lds_data[block_size];
+            for(size_t i = thr_idx; i < batch_item_num; i += block_size)
+            {
+                data_idx[axis]   = i;
+                auto val         = input[data_idx] - batch_max;
+                output[data_idx] = ::exp(to_hip_type(val)) / batch_sum;
+            }
+        });
+    });
+}
 
 } // namespace device
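
One detail worth noting: round_item_num rounds the batch length up to a multiple of block_size so that every thread runs the same number of loop iterations and therefore reaches the same __syncthreads() calls; a sketch of the arithmetic (hypothetical):

    // Hypothetical sketch of the round-up used for uniform loop trip counts.
    #include <cstddef>
    #include <iostream>

    int main()
    {
        std::size_t batch_item_num = 1000, block_size = 256;
        std::size_t round_item_num =
            (batch_item_num + block_size - 1) / block_size * block_size;
        std::cout << round_item_num << '\n'; // 1024: four strides of 256 per thread
        return 0;
    }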
src/targets/gpu/fuse_ops.cpp

@@ -5,6 +5,7 @@
 #include <migraphx/gpu/device/add_relu.hpp>
 #include <migraphx/gpu/device/add.hpp>
 #include <migraphx/instruction.hpp>
+#include <migraphx/array.hpp>
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -122,13 +123,6 @@ MIGRAPHX_PRED_MATCHER(bias_shape, instruction_ref ins)
            s.strides()[1] != 0 and s.strides()[2] == 0 and s.strides()[3] == 0;
 }
 
-// TODO: Move to another header
-template <class T, class... Ts>
-std::array<T, sizeof...(Ts) + 1> make_array(T x, Ts... xs)
-{
-    return {std::move(x), std::move(static_cast<T>(xs))...};
-}
-
 MIGRAPHX_PRED_MATCHER(fusable_conv, instruction_ref ins)
 {
     if(ins->name() != "gpu::convolution")
@@ -408,8 +402,8 @@ void fuse_ops::apply(program& p) const
     // clang-format off
     match::find_matches(p, find_triadd{});
     match::find_matches(p,
-                        find_conv_bias_relu{ctx},
-                        find_conv_bias{ctx},
+                        // find_conv_bias_relu{ctx},
+                        // find_conv_bias{ctx},
                         find_add_relu{});
     // clang-format on
src/targets/gpu/gather.cpp

@@ -12,11 +12,9 @@ shape hip_gather::compute_shape(std::vector<shape> inputs) const
     return op.compute_shape(inputs);
 }
 
-argument hip_gather::compute(context& ctx,
-                             const shape& output_shape,
-                             const std::vector<argument>& args) const
+argument hip_gather::compute(context& ctx, const shape&, const std::vector<argument>& args) const
 {
-    return device::gather(ctx.get_stream().get(), output_shape, args, op.axis);
+    return device::gather(ctx.get_stream().get(), args.back(), args[0], args[1], op.axis);
 }
 
 } // namespace gpu