Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
MIGraphX
Commits
faefeef9
Unverified
Commit
faefeef9
authored
May 25, 2022
by
Charlie Lin
Committed by
GitHub
May 25, 2022
Browse files
Merge branch 'develop' into dyn_shape_update
parents
97a40ac3
bf0a4713
Changes
94
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
685 additions
and
583 deletions
+685
-583
src/reduce_dims.cpp
src/reduce_dims.cpp
+18
-4
src/rewrite_batchnorm.cpp
src/rewrite_batchnorm.cpp
+9
-9
src/rewrite_pooling.cpp
src/rewrite_pooling.cpp
+6
-7
src/rewrite_rnn.cpp
src/rewrite_rnn.cpp
+303
-310
src/schedule.cpp
src/schedule.cpp
+37
-37
src/simplify_algebra.cpp
src/simplify_algebra.cpp
+78
-78
src/simplify_qdq.cpp
src/simplify_qdq.cpp
+1
-1
src/simplify_reshapes.cpp
src/simplify_reshapes.cpp
+37
-37
src/targets/cpu/lowering.cpp
src/targets/cpu/lowering.cpp
+1
-1
src/targets/gpu/CMakeLists.txt
src/targets/gpu/CMakeLists.txt
+1
-0
src/targets/gpu/analyze_streams.cpp
src/targets/gpu/analyze_streams.cpp
+10
-10
src/targets/gpu/compile_hip.cpp
src/targets/gpu/compile_hip.cpp
+11
-0
src/targets/gpu/eliminate_workspace.cpp
src/targets/gpu/eliminate_workspace.cpp
+5
-5
src/targets/gpu/fuse_ops.cpp
src/targets/gpu/fuse_ops.cpp
+96
-50
src/targets/gpu/gemm_impl.cpp
src/targets/gpu/gemm_impl.cpp
+38
-6
src/targets/gpu/include/migraphx/gpu/analyze_streams.hpp
src/targets/gpu/include/migraphx/gpu/analyze_streams.hpp
+1
-1
src/targets/gpu/include/migraphx/gpu/eliminate_workspace.hpp
src/targets/gpu/include/migraphx/gpu/eliminate_workspace.hpp
+1
-1
src/targets/gpu/include/migraphx/gpu/fuse_ops.hpp
src/targets/gpu/include/migraphx/gpu/fuse_ops.hpp
+1
-1
src/targets/gpu/include/migraphx/gpu/gemm.hpp
src/targets/gpu/include/migraphx/gpu/gemm.hpp
+7
-25
src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp
src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp
+24
-0
No files found.
src/reduce_dims.cpp
View file @
faefeef9
...
...
@@ -16,10 +16,8 @@ bool reduce_dim(std::vector<shape>& shapes, std::size_t n)
auto
bstride
=
s
.
strides
()[
n
+
1
];
auto
blen
=
s
.
lens
()[
n
+
1
];
if
(
astride
==
bstride
*
blen
)
{
if
(
astride
==
bstride
*
blen
or
alen
==
1
)
new_lens
.
push_back
(
alen
*
blen
);
}
}
if
(
new_lens
.
size
()
!=
shapes
.
size
())
return
false
;
...
...
@@ -37,10 +35,25 @@ bool reduce_dim(std::vector<shape>& shapes, std::size_t n)
return
true
;
}
void
reduce_dim1
(
std
::
vector
<
shape
>&
shapes
)
{
if
(
std
::
any_of
(
shapes
.
begin
(),
shapes
.
end
(),
[
&
](
const
auto
&
s
)
{
return
s
.
lens
().
size
()
<
2
or
s
.
lens
().
back
()
!=
1
;
}))
return
;
for
(
auto
&
s
:
shapes
)
{
auto
lens
=
s
.
lens
();
auto
strides
=
s
.
strides
();
lens
.
pop_back
();
strides
.
pop_back
();
s
=
shape
{
s
.
type
(),
lens
,
strides
};
}
}
std
::
size_t
reduce_dim_all
(
std
::
vector
<
shape
>&
shapes
,
std
::
size_t
n
)
{
while
(
reduce_dim
(
shapes
,
n
)
and
n
<
shapes
.
size
())
{}
return
n
+
1
;
}
void
reduce_dim_all
(
std
::
vector
<
shape
>&
shapes
)
...
...
@@ -48,6 +61,7 @@ void reduce_dim_all(std::vector<shape>& shapes)
std
::
size_t
n
=
0
;
while
(
n
<
shapes
.
front
().
lens
().
size
()
-
1
)
n
=
reduce_dim_all
(
shapes
,
n
);
reduce_dim1
(
shapes
);
}
std
::
vector
<
std
::
size_t
>
base_lens
(
const
std
::
vector
<
shape
>&
shapes
)
...
...
src/rewrite_batchnorm.cpp
View file @
faefeef9
...
...
@@ -14,9 +14,9 @@
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
void
rewrite_batchnorm
::
apply
(
module
&
p
)
const
void
rewrite_batchnorm
::
apply
(
module
&
m
)
const
{
for
(
auto
ins
:
iterator_for
(
p
))
for
(
auto
ins
:
iterator_for
(
m
))
{
if
(
ins
->
name
()
!=
"batch_norm_inference"
)
continue
;
...
...
@@ -46,13 +46,13 @@ void rewrite_batchnorm::apply(module& p) const
});
auto
broadcast
=
op
::
broadcast
{
1
,
ins
->
get_shape
().
lens
()};
auto
a_ins
=
p
.
add_literal
({
a
.
get_shape
(),
a
.
data
()});
auto
a_broadcast
=
p
.
insert_instruction
(
ins
,
broadcast
,
a_ins
);
auto
mul
=
p
.
insert_instruction
(
ins
,
make_op
(
"mul"
),
ins
->
inputs
().
front
(),
a_broadcast
);
auto
b_ins
=
p
.
add_literal
({
b
.
get_shape
(),
b
.
data
()});
auto
b_broadcast
=
p
.
insert_instruction
(
ins
,
broadcast
,
b_ins
);
auto
add
=
p
.
insert_instruction
(
ins
,
make_op
(
"add"
),
mul
,
b_broadcast
);
p
.
replace_instruction
(
ins
,
add
);
auto
a_ins
=
m
.
add_literal
({
a
.
get_shape
(),
a
.
data
()});
auto
a_broadcast
=
m
.
insert_instruction
(
ins
,
broadcast
,
a_ins
);
auto
mul
=
m
.
insert_instruction
(
ins
,
make_op
(
"mul"
),
ins
->
inputs
().
front
(),
a_broadcast
);
auto
b_ins
=
m
.
add_literal
({
b
.
get_shape
(),
b
.
data
()});
auto
b_broadcast
=
m
.
insert_instruction
(
ins
,
broadcast
,
b_ins
);
auto
add
=
m
.
insert_instruction
(
ins
,
make_op
(
"add"
),
mul
,
b_broadcast
);
m
.
replace_instruction
(
ins
,
add
);
}
}
...
...
src/rewrite_pooling.cpp
View file @
faefeef9
...
...
@@ -12,9 +12,9 @@
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
void
rewrite_pooling
::
apply
(
module
&
prog
)
const
void
rewrite_pooling
::
apply
(
module
&
m
)
const
{
for
(
auto
ins
:
iterator_for
(
prog
))
for
(
auto
ins
:
iterator_for
(
m
))
{
if
(
ins
->
name
()
!=
"pooling"
)
continue
;
...
...
@@ -33,26 +33,25 @@ void rewrite_pooling::apply(module& prog) const
continue
;
std
::
int64_t
n
=
s
.
lens
()[
0
];
std
::
int64_t
c
=
s
.
lens
()[
1
];
auto
reshape
=
prog
.
insert_instruction
(
auto
reshape
=
m
.
insert_instruction
(
ins
,
make_op
(
"reshape"
,
{{
"dims"
,
{
n
*
c
,
-
1
}}}),
ins
->
inputs
().
front
());
instruction_ref
pooling
{};
// average pooling
if
(
op
.
mode
==
op
::
pooling_mode
::
average
)
{
pooling
=
prog
.
insert_instruction
(
ins
,
make_op
(
"reduce_mean"
,
{{
"axes"
,
{
1
}}}),
reshape
);
pooling
=
m
.
insert_instruction
(
ins
,
make_op
(
"reduce_mean"
,
{{
"axes"
,
{
1
}}}),
reshape
);
}
// max pooling
else
{
pooling
=
prog
.
insert_instruction
(
ins
,
make_op
(
"reduce_max"
,
{{
"axes"
,
{
1
}}}),
reshape
);
pooling
=
m
.
insert_instruction
(
ins
,
make_op
(
"reduce_max"
,
{{
"axes"
,
{
1
}}}),
reshape
);
}
std
::
vector
<
int64_t
>
rsp_lens
(
lens
.
size
(),
1
);
rsp_lens
[
0
]
=
n
;
rsp_lens
[
1
]
=
c
;
prog
.
replace_instruction
(
ins
,
make_op
(
"reshape"
,
{{
"dims"
,
rsp_lens
}}),
pooling
);
m
.
replace_instruction
(
ins
,
make_op
(
"reshape"
,
{{
"dims"
,
rsp_lens
}}),
pooling
);
}
}
...
...
src/rewrite_rnn.cpp
View file @
faefeef9
This diff is collapsed.
Click to expand it.
src/schedule.cpp
View file @
faefeef9
...
...
@@ -42,7 +42,7 @@ struct stream_info
std
::
unordered_map
<
instruction_ref
,
std
::
size_t
>
iweights
;
ins_dep_map
mod_implicit_deps
;
void
calc_implicit_deps
(
const
module
&
p
)
{
mod_implicit_deps
=
p
.
calc_implicit_deps
();
}
void
calc_implicit_deps
(
const
module
&
m
)
{
mod_implicit_deps
=
m
.
calc_implicit_deps
();
}
void
accumulate_weights
(
instruction_ref
last
,
const
schedule_model
&
model
)
{
...
...
@@ -116,15 +116,15 @@ struct stream_info
}
};
std
::
size_t
assign_streams
(
module
&
p
,
std
::
size_t
n
)
std
::
size_t
assign_streams
(
module
&
m
,
std
::
size_t
n
)
{
assert
(
n
>
0
);
partition
critical
;
std
::
unordered_map
<
instruction_ref
,
std
::
deque
<
partition
>>
partitions
;
partitions
.
reserve
(
weights
.
size
());
fix
([
&
](
auto
self
,
auto
ins
,
auto
&
part
)
{
assert
(
not
is_end
(
ins
,
p
.
end
()));
if
(
not
p
.
has_instruction
(
ins
))
assert
(
not
is_end
(
ins
,
m
.
end
()));
if
(
not
m
.
has_instruction
(
ins
))
return
;
if
(
contains
(
partitions
,
ins
))
return
;
...
...
@@ -151,8 +151,8 @@ struct stream_info
}
}
// Sort instructions
p
.
move_instruction
(
ins
,
p
.
end
());
})(
std
::
prev
(
p
.
end
()),
critical
);
m
.
move_instruction
(
ins
,
m
.
end
());
})(
std
::
prev
(
m
.
end
()),
critical
);
// Set the critical partition to stream 0
set_stream
(
critical
,
0
);
...
...
@@ -197,13 +197,13 @@ struct stream_info
}
};
void
sort
(
module
&
p
,
std
::
size_t
)
void
sort
(
module
&
m
,
std
::
size_t
)
{
std
::
set
<
weight_ins
,
compare_weight_ins
>
children
;
std
::
unordered_map
<
instruction_ref
,
std
::
size_t
>
visited
;
auto
last
=
std
::
prev
(
p
.
end
());
auto
last
=
std
::
prev
(
m
.
end
());
auto
mw
=
this
->
weights
.
at
(
last
);
auto
nw
=
mw
/
(
p
.
size
()
+
1
);
auto
nw
=
mw
/
(
m
.
size
()
+
1
);
auto
add_child
=
[
&
](
auto
ins
)
{
auto
x
=
1
+
(
mw
-
this
->
weights
.
at
(
ins
))
/
(
nw
+
1
);
auto
w
=
x
*
this
->
iweights
.
at
(
ins
);
...
...
@@ -222,10 +222,10 @@ struct stream_info
// Pop the first element
auto
top
=
children
.
begin
()
->
second
;
children
.
erase
(
children
.
begin
());
p
.
move_instruction
(
top
,
p
.
begin
());
m
.
move_instruction
(
top
,
m
.
begin
());
for
(
auto
ins
:
top
->
inputs
())
{
if
(
not
p
.
has_instruction
(
ins
))
if
(
not
m
.
has_instruction
(
ins
))
continue
;
add_child
(
ins
);
}
...
...
@@ -234,7 +234,7 @@ struct stream_info
{
for
(
auto
ins
:
mod_implicit_deps
.
at
(
top
))
{
assert
(
p
.
has_instruction
(
ins
));
assert
(
m
.
has_instruction
(
ins
));
add_child
(
ins
);
}
}
...
...
@@ -242,12 +242,12 @@ struct stream_info
// move dangling parameter to the front so as not be removed
auto
ins
=
std
::
next
(
last
);
while
(
ins
!=
p
.
end
())
while
(
ins
!=
m
.
end
())
{
auto
next
=
std
::
next
(
ins
);
if
(
ins
->
name
()
==
"@param"
)
{
p
.
move_instruction
(
ins
,
p
.
begin
());
m
.
move_instruction
(
ins
,
m
.
begin
());
}
ins
=
next
;
}
...
...
@@ -364,18 +364,18 @@ struct stream_info
}
std
::
unordered_map
<
instruction_ref
,
std
::
vector
<
std
::
vector
<
instruction_ref
>>>
find_concurrent_instructions
(
module
&
p
)
const
find_concurrent_instructions
(
module
&
m
)
const
{
std
::
unordered_map
<
instruction_ref
,
std
::
vector
<
std
::
vector
<
instruction_ref
>>>
result
;
std
::
unordered_map
<
instruction_ref
,
std
::
unordered_set
<
instruction_ref
>>
merge_from
;
dominator_info
di
=
compute_dominator
(
p
);
result
.
reserve
(
p
.
size
());
merge_from
.
reserve
(
p
.
size
());
for
(
auto
ins
:
reverse_iterator_for
(
p
))
dominator_info
di
=
compute_dominator
(
m
);
result
.
reserve
(
m
.
size
());
merge_from
.
reserve
(
m
.
size
());
for
(
auto
ins
:
reverse_iterator_for
(
m
))
{
for
(
auto
&&
arg
:
ins
->
outputs
())
{
if
(
not
p
.
has_instruction
(
arg
))
if
(
not
m
.
has_instruction
(
arg
))
continue
;
if
(
is_merge_point
(
arg
))
merge_from
[
ins
].
insert
(
arg
);
...
...
@@ -415,18 +415,18 @@ struct stream_info
}
std
::
unordered_map
<
instruction_ref
,
std
::
unordered_set
<
instruction_ref
>>
get_conflicts
(
module
&
p
)
get_conflicts
(
module
&
m
)
{
using
conflict_table_type
=
std
::
unordered_map
<
instruction_ref
,
std
::
unordered_set
<
instruction_ref
>>
;
conflict_table_type
conflict_table
;
auto
concur_ins
=
this
->
find_concurrent_instructions
(
p
);
auto
concur_ins
=
this
->
find_concurrent_instructions
(
m
);
// Compute an index for each instruction
std
::
unordered_map
<
instruction_ref
,
std
::
size_t
>
ins2index
;
std
::
size_t
index_total
=
0
;
for
(
auto
ins
:
iterator_for
(
p
))
for
(
auto
ins
:
iterator_for
(
m
))
ins2index
[
ins
]
=
index_total
++
;
std
::
vector
<
conflict_table_type
>
thread_conflict_tables
(
...
...
@@ -507,21 +507,21 @@ struct stream_info
}
};
void
schedule
::
apply
(
module
&
p
)
const
void
schedule
::
apply
(
module
&
m
)
const
{
if
(
not
enable
)
return
;
stream_info
si
;
si
.
calc_implicit_deps
(
p
);
auto
last
=
std
::
prev
(
p
.
end
());
si
.
calc_implicit_deps
(
m
);
auto
last
=
std
::
prev
(
m
.
end
());
si
.
accumulate_weights
(
last
,
model
);
auto
nstreams
=
si
.
assign_streams
(
p
,
model
.
concurrency
());
si
.
sort
(
p
,
model
.
concurrency
());
auto
nstreams
=
si
.
assign_streams
(
m
,
model
.
concurrency
());
si
.
sort
(
m
,
model
.
concurrency
());
if
(
enabled
(
MIGRAPHX_TRACE_COMPILE
{})
or
enabled
(
MIGRAPHX_TRACE_SCHEDULE
{}))
{
p
.
annotate
(
std
::
cout
,
[
&
](
auto
ins
)
{
m
.
annotate
(
std
::
cout
,
[
&
](
auto
ins
)
{
if
(
ins
->
name
()
==
"@param"
and
not
contains
(
si
.
weights
,
ins
))
return
;
...
...
@@ -548,9 +548,9 @@ void schedule::apply(module& p) const
std
::
unordered_map
<
instruction_ref
,
std
::
size_t
>
ins2wait
;
std
::
unordered_map
<
std
::
size_t
,
std
::
unordered_set
<
std
::
size_t
>>
waited_for
;
std
::
unordered_map
<
instruction_ref
,
std
::
unordered_set
<
std
::
size_t
>>
ins2waited
;
ins2wait
.
reserve
(
p
.
size
());
ins2waited
.
reserve
(
p
.
size
());
for
(
auto
ins
:
iterator_for
(
p
))
ins2wait
.
reserve
(
m
.
size
());
ins2waited
.
reserve
(
m
.
size
());
for
(
auto
ins
:
iterator_for
(
m
))
{
// Only schedule instructions that have a stream
if
(
not
si
.
has_stream
(
ins
))
...
...
@@ -559,7 +559,7 @@ void schedule::apply(module& p) const
// Schedule instruction on the stream
auto
stream
=
si
.
get_stream
(
ins
);
assert
(
stream
<
model
.
concurrency
());
model
.
sched
(
p
,
ins
,
stream
);
model
.
sched
(
m
,
ins
,
stream
);
// Insert wait instructions
if
(
si
.
is_merge_point
(
ins
,
stream
))
{
...
...
@@ -572,14 +572,14 @@ void schedule::apply(module& p) const
if
(
not
contains
(
ins2wait
,
i
))
{
ins2wait
[
i
]
=
wait_id
;
model
.
record
(
p
,
i
,
wait_id
);
model
.
record
(
m
,
i
,
wait_id
);
wait_id
++
;
}
auto
w
=
ins2wait
.
at
(
i
);
// If we already waited for the event on this stream then dont
// insert another wait event
if
(
not
contains
(
waited_for
[
stream
],
w
))
model
.
wait
(
p
,
ins
,
w
);
model
.
wait
(
m
,
ins
,
w
);
// Store the event as waited
waited_for
[
stream
].
insert
(
w
);
// Store all wait events that have been waited on prior to the recorded instruction
...
...
@@ -594,7 +594,7 @@ void schedule::apply(module& p) const
}
// Add memory conflicts
auto
conflict_table
=
si
.
get_conflicts
(
p
);
auto
conflict_table
=
si
.
get_conflicts
(
m
);
for
(
auto
&&
ip
:
conflict_table
)
{
if
(
ip
.
second
.
empty
())
...
...
@@ -602,7 +602,7 @@ void schedule::apply(module& p) const
std
::
vector
<
instruction_ref
>
args
;
args
.
push_back
(
ip
.
first
);
args
.
insert
(
args
.
end
(),
ip
.
second
.
begin
(),
ip
.
second
.
end
());
p
.
insert_instruction
(
std
::
next
(
ip
.
first
),
make_op
(
"identity"
),
args
);
m
.
insert_instruction
(
std
::
next
(
ip
.
first
),
make_op
(
"identity"
),
args
);
}
}
...
...
src/simplify_algebra.cpp
View file @
faefeef9
This diff is collapsed.
Click to expand it.
src/simplify_qdq.cpp
View file @
faefeef9
...
...
@@ -53,7 +53,7 @@ struct match_find_quantizable_ops
match
::
arg
(
1
)(
dequantizelinear_op
(
"x2"
,
"scale2"
)));
}
void
apply
(
module
&
m
,
match
::
matcher_result
r
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
auto
qop
=
r
.
result
;
auto
q1
=
r
.
instructions
[
"x1"
];
...
...
src/simplify_reshapes.cpp
View file @
faefeef9
...
...
@@ -70,19 +70,19 @@ struct find_reshaper
match
::
any_of
[
match
::
outputs
()](
match
::
name
(
reshaper_names
())));
}
void
apply
(
module
&
p
,
const
match
::
matcher_result
&
mr
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
mr
)
const
{
auto
ins
=
mr
.
result
;
std
::
vector
<
instruction_ref
>
reshapes
{
ins
};
while
(
is_reshaper
(
reshapes
.
back
()))
{
assert
(
!
reshapes
.
back
()
->
inputs
().
empty
());
assert
(
p
.
has_instruction
(
reshapes
.
back
()
->
inputs
().
front
()));
assert
(
m
.
has_instruction
(
reshapes
.
back
()
->
inputs
().
front
()));
auto
input
=
reshapes
.
back
()
->
inputs
().
front
();
reshapes
.
push_back
(
input
);
}
std
::
pair
<
instruction_ref
,
instruction_ref
>
r
{
p
.
end
(),
p
.
end
()};
std
::
pair
<
instruction_ref
,
instruction_ref
>
r
{
m
.
end
(),
m
.
end
()};
for
(
auto
start
:
iterator_for
(
reshapes
))
{
auto
last
=
std
::
find_if
(
reshapes
.
rbegin
(),
reshapes
.
rend
(),
[
&
](
auto
&&
i
)
{
...
...
@@ -96,7 +96,7 @@ struct find_reshaper
}
if
(
r
.
first
!=
r
.
second
)
{
p
.
replace_instruction
(
r
.
first
,
r
.
second
);
m
.
replace_instruction
(
r
.
first
,
r
.
second
);
}
}
};
...
...
@@ -117,10 +117,10 @@ struct find_nop_reshapes
return
match
::
name
(
reshapes
)(
match
::
same_shape
(
match
::
arg
(
0
)));
}
void
apply
(
module
&
p
,
const
match
::
matcher_result
&
mr
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
mr
)
const
{
auto
ins
=
mr
.
result
;
p
.
replace_instruction
(
ins
,
ins
->
inputs
().
front
());
m
.
replace_instruction
(
ins
,
ins
->
inputs
().
front
());
}
};
...
...
@@ -132,7 +132,7 @@ struct find_transpose
match
::
skip_output
(
match
::
name
(
"contiguous"
))(
match
::
name
(
"transpose"
))));
}
void
apply
(
module
&
p
,
const
match
::
matcher_result
&
mr
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
mr
)
const
{
auto
ins
=
mr
.
result
;
auto
x
=
ins
;
...
...
@@ -149,11 +149,11 @@ struct find_transpose
return
;
if
(
is_no_transpose
(
dims
))
{
p
.
replace_instruction
(
ins
,
t
->
inputs
().
front
());
m
.
replace_instruction
(
ins
,
t
->
inputs
().
front
());
}
else
{
p
.
replace_instruction
(
m
.
replace_instruction
(
ins
,
make_op
(
"transpose"
,
{{
"permutation"
,
dims
}}),
t
->
inputs
().
front
());
}
}
...
...
@@ -223,7 +223,7 @@ struct find_nested_slice
return
result
;
}
void
apply
(
module
&
p
,
const
match
::
matcher_result
&
mr
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
mr
)
const
{
auto
ins
=
mr
.
result
;
auto
slice
=
ins
->
inputs
().
front
();
...
...
@@ -241,7 +241,7 @@ struct find_nested_slice
op
.
starts
.
push_back
(
pp
.
second
.
first
);
op
.
ends
.
push_back
(
pp
.
second
.
second
);
}
p
.
replace_instruction
(
ins
,
op
,
input
);
m
.
replace_instruction
(
ins
,
op
,
input
);
}
};
...
...
@@ -252,7 +252,7 @@ struct find_concat_transpose
return
match
::
name
(
"concat"
)(
match
::
all_of
[
match
::
inputs
()](
match
::
transpose_shape
()));
}
void
apply
(
module
&
p
,
const
match
::
matcher_result
&
mr
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
mr
)
const
{
auto
ins
=
mr
.
result
;
auto
trans_inputs
=
ins
->
inputs
();
...
...
@@ -279,14 +279,14 @@ struct find_concat_transpose
std
::
vector
<
instruction_ref
>
inputs
;
std
::
transform
(
ins
->
inputs
().
begin
(),
ins
->
inputs
().
end
(),
std
::
back_inserter
(
inputs
),
[
&
](
auto
i
)
{
return
p
.
insert_instruction
(
return
m
.
insert_instruction
(
ins
,
make_op
(
"transpose"
,
{{
"permutation"
,
permutation
}}),
i
);
});
auto
concat
=
p
.
insert_instruction
(
ins
,
op
,
inputs
);
auto
t
=
p
.
insert_instruction
(
auto
concat
=
m
.
insert_instruction
(
ins
,
op
,
inputs
);
auto
t
=
m
.
insert_instruction
(
ins
,
make_op
(
"transpose"
,
{{
"permutation"
,
ipermutation
}}),
concat
);
assert
(
ins
->
get_shape
().
lens
()
==
t
->
get_shape
().
lens
());
p
.
replace_instruction
(
ins
,
t
);
m
.
replace_instruction
(
ins
,
t
);
}
};
...
...
@@ -303,7 +303,7 @@ struct find_nested_concat
return
op
.
axis
;
}
void
apply
(
module
&
p
,
const
match
::
matcher_result
&
mr
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
mr
)
const
{
auto
ins
=
mr
.
result
;
auto
axis
=
get_axis
(
ins
);
...
...
@@ -317,7 +317,7 @@ struct find_nested_concat
args
.
push_back
(
i
);
}
})(
ins
->
inputs
());
p
.
replace_instruction
(
ins
,
ins
->
get_operator
(),
args
);
m
.
replace_instruction
(
ins
,
ins
->
get_operator
(),
args
);
}
};
...
...
@@ -329,7 +329,7 @@ struct find_resize
match
::
args
(
match
::
name
(
"reshape"
).
bind
(
"data"
),
match
::
is_constant
().
bind
(
"ind"
)));
}
void
apply
(
module
&
p
,
match
::
matcher_result
r
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
auto
ins
=
r
.
result
;
auto
ins_rsp
=
r
.
instructions
[
"data"
];
...
...
@@ -417,13 +417,13 @@ struct find_resize
}
auto
in_rsp
=
ins_rsp
->
inputs
().
front
();
auto
rsp_data
=
p
.
insert_instruction
(
auto
rsp_data
=
m
.
insert_instruction
(
ins_rsp
,
migraphx
::
make_op
(
"reshape"
,
{{
"dims"
,
in_dims
}}),
in_rsp
);
auto
mb_rsp
=
p
.
insert_instruction
(
auto
mb_rsp
=
m
.
insert_instruction
(
ins_rsp
,
migraphx
::
make_op
(
"multibroadcast"
,
{{
"out_lens"
,
out_dims
}}),
rsp_data
);
auto
std_mb
=
p
.
insert_instruction
(
ins
,
migraphx
::
make_op
(
"contiguous"
),
mb_rsp
);
auto
std_mb
=
m
.
insert_instruction
(
ins
,
migraphx
::
make_op
(
"contiguous"
),
mb_rsp
);
std
::
vector
<
int64_t
>
rsp_dims
(
out_lens
.
begin
(),
out_lens
.
end
());
p
.
replace_instruction
(
ins
,
migraphx
::
make_op
(
"reshape"
,
{{
"dims"
,
rsp_dims
}}),
std_mb
);
m
.
replace_instruction
(
ins
,
migraphx
::
make_op
(
"reshape"
,
{{
"dims"
,
rsp_dims
}}),
std_mb
);
}
};
...
...
@@ -436,7 +436,7 @@ struct find_where_op
match
::
is_constant
().
bind
(
"ind"
)));
}
void
apply
(
module
&
p
,
match
::
matcher_result
r
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
auto
ins
=
r
.
result
;
auto
concat
=
r
.
instructions
[
"data"
];
...
...
@@ -475,11 +475,11 @@ struct find_where_op
if
(
val
)
{
p
.
replace_instruction
(
ins
,
inputs
.
at
(
0
));
m
.
replace_instruction
(
ins
,
inputs
.
at
(
0
));
}
else
{
p
.
replace_instruction
(
ins
,
inputs
.
at
(
1
));
m
.
replace_instruction
(
ins
,
inputs
.
at
(
1
));
}
}
};
...
...
@@ -496,7 +496,7 @@ struct find_reshape_cont
match
::
any
()));
}
void
apply
(
module
&
p
,
match
::
matcher_result
r
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
auto
ins
=
r
.
result
;
auto
ins_cont
=
r
.
instructions
[
"cont"
];
...
...
@@ -530,11 +530,11 @@ struct find_reshape_cont
else
{
inputs
.
push_back
(
p
.
insert_instruction
(
ins
,
make_op
(
"reshape"
,
{{
"dims"
,
dims
}}),
in
));
m
.
insert_instruction
(
ins
,
make_op
(
"reshape"
,
{{
"dims"
,
dims
}}),
in
));
}
}
auto
out
=
p
.
insert_instruction
(
ins
,
ins
->
get_operator
(),
inputs
);
p
.
replace_instruction
(
ins
,
make_op
(
"reshape"
,
{{
"dims"
,
out_dims
}}),
out
);
auto
out
=
m
.
insert_instruction
(
ins
,
ins
->
get_operator
(),
inputs
);
m
.
replace_instruction
(
ins
,
make_op
(
"reshape"
,
{{
"dims"
,
out_dims
}}),
out
);
}
};
...
...
@@ -564,25 +564,25 @@ struct find_transpose_contiguous_reshaper_unary
match
::
args
(
match_transpose_contiguous_reshaper
()));
}
void
apply
(
module
&
p
,
match
::
matcher_result
r
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
auto
ins
=
r
.
result
;
auto
reshaper_ins
=
r
.
instructions
[
"reshaper_ins"
];
auto
trans_ins
=
r
.
instructions
[
"trans_ins"
];
auto
cont_ins
=
r
.
instructions
[
"cont_ins"
];
auto
unary_op_name
=
ins
->
get_operator
().
name
();
auto
unary_ins
=
p
.
insert_instruction
(
cont_ins
,
make_op
(
unary_op_name
),
trans_ins
);
auto
new_cont_ins
=
p
.
insert_instruction
(
cont_ins
,
make_op
(
"contiguous"
),
unary_ins
);
auto
unary_ins
=
m
.
insert_instruction
(
cont_ins
,
make_op
(
unary_op_name
),
trans_ins
);
auto
new_cont_ins
=
m
.
insert_instruction
(
cont_ins
,
make_op
(
"contiguous"
),
unary_ins
);
// older cont and reshape are removed by deadcode elimination
p
.
replace_instruction
(
ins
,
reshaper_ins
->
get_operator
(),
new_cont_ins
);
m
.
replace_instruction
(
ins
,
reshaper_ins
->
get_operator
(),
new_cont_ins
);
}
};
void
simplify_reshapes
::
apply
(
module
&
p
)
const
void
simplify_reshapes
::
apply
(
module
&
m
)
const
{
for
(
int
i
=
0
;
i
<
2
;
i
++
)
{
match
::
find_matches
(
p
,
match
::
find_matches
(
m
,
find_where_op
{},
find_resize
{},
find_reshape_cont
{},
...
...
@@ -594,7 +594,7 @@ void simplify_reshapes::apply(module& p) const
find_nested_slice
{},
find_nested_concat
{},
find_transpose_contiguous_reshaper_unary
{});
dead_code_elimination
{}.
apply
(
p
);
dead_code_elimination
{}.
apply
(
m
);
}
}
...
...
src/targets/cpu/lowering.cpp
View file @
faefeef9
...
...
@@ -352,7 +352,7 @@ struct cpu_apply
std
::
transform
(
bind_inputs
.
begin
(),
bind_inputs
.
end
(),
std
::
back_inserter
(
inputs
),
[
&
](
const
auto
&
s
)
{
return
r
.
instructions
.
at
(
s
)
;
});
[
&
](
const
auto
&
s
)
{
return
r
.
instructions
[
s
]
;
});
inputs
.
push_back
(
this
->
insert_allocation
(
ins
,
ins
->
get_shape
()));
modl
->
replace_instruction
(
ins
,
op
,
inputs
);
});
...
...
src/targets/gpu/CMakeLists.txt
View file @
faefeef9
...
...
@@ -158,6 +158,7 @@ add_library(migraphx_gpu
nonzero.cpp
pack_args.cpp
pack_int8_args.cpp
prefuse_ops.cpp
pad.cpp
pooling.cpp
quant_convolution.cpp
...
...
src/targets/gpu/analyze_streams.cpp
View file @
faefeef9
...
...
@@ -28,30 +28,30 @@ struct hip_stream_model
bool
is_wait
(
migraphx
::
instruction_ref
ins
)
const
{
return
ins
->
name
()
==
"gpu::wait_event"
;
}
};
stream_model
make_stream_model
(
const
module
&
p
)
stream_model
make_stream_model
(
const
module
&
m
)
{
hip_stream_model
m
;
hip_stream_model
hs
m
;
std
::
size_t
stream
=
0
;
for
(
auto
ins
:
iterator_for
(
p
))
for
(
auto
ins
:
iterator_for
(
m
))
{
if
(
ins
->
name
()
==
"gpu::set_stream"
)
{
auto
v
=
ins
->
get_operator
().
to_value
();
stream
=
v
[
"stream"
].
to
<
std
::
size_t
>
();
m
.
max_stream
=
std
::
max
(
stream
,
m
.
max_stream
);
auto
v
=
ins
->
get_operator
().
to_value
();
stream
=
v
[
"stream"
].
to
<
std
::
size_t
>
();
hs
m
.
max_stream
=
std
::
max
(
stream
,
hs
m
.
max_stream
);
}
if
(
ins
->
get_operator
().
is_context_free
())
continue
;
if
(
contains
({
"hip::hip_allocate_memory"
,
"hip::hip_copy_literal"
,
"@param"
},
ins
->
name
()))
continue
;
m
.
ins2stream
[
ins
]
=
stream
;
hs
m
.
ins2stream
[
ins
]
=
stream
;
}
return
m
;
return
hs
m
;
}
std
::
vector
<
stream_race
>
analyze_streams
(
const
module
&
p
)
std
::
vector
<
stream_race
>
analyze_streams
(
const
module
&
m
)
{
return
migraphx
::
analyze_streams
(
p
,
make_stream_model
(
p
));
return
migraphx
::
analyze_streams
(
m
,
make_stream_model
(
m
));
}
}
// namespace gpu
...
...
src/targets/gpu/compile_hip.cpp
View file @
faefeef9
...
...
@@ -22,6 +22,7 @@ namespace gpu {
MIGRAPHX_DECLARE_ENV_VAR
(
MIGRAPHX_GPU_DEBUG
);
MIGRAPHX_DECLARE_ENV_VAR
(
MIGRAPHX_GPU_OPTIMIZE
);
MIGRAPHX_DECLARE_ENV_VAR
(
MIGRAPHX_GPU_DUMP_ASM
);
MIGRAPHX_DECLARE_ENV_VAR
(
MIGRAPHX_GPU_DUMP_SRC
);
#if MIGRAPHX_USE_HIPRTC
...
...
@@ -247,6 +248,16 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
MIGRAPHX_THROW
(
"Missing hsaco"
);
};
if
(
enabled
(
MIGRAPHX_GPU_DUMP_SRC
{}))
{
for
(
const
auto
&
src
:
srcs
)
{
if
(
src
.
path
.
extension
()
!=
".cpp"
)
continue
;
std
::
cout
<<
std
::
string
(
src
.
content
.
first
,
src
.
len
())
<<
std
::
endl
;
}
}
if
(
enabled
(
MIGRAPHX_GPU_DUMP_ASM
{}))
{
...
...
src/targets/gpu/eliminate_workspace.cpp
View file @
faefeef9
...
...
@@ -11,11 +11,11 @@ namespace migraphx {
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
void
eliminate_workspace
::
apply
(
module
&
p
)
const
void
eliminate_workspace
::
apply
(
module
&
m
)
const
{
std
::
size_t
n
=
0
;
std
::
vector
<
instruction_ref
>
allocs
;
for
(
auto
ins
:
iterator_for
(
p
))
for
(
auto
ins
:
iterator_for
(
m
))
{
if
(
ins
->
outputs
().
size
()
!=
1
)
continue
;
...
...
@@ -30,11 +30,11 @@ void eliminate_workspace::apply(module& p) const
}
if
(
n
>
0
)
{
auto
ws
=
p
.
add_parameter
(
"workspace"
,
shape
{
shape
::
int8_type
,
{
n
}});
auto
ws
=
m
.
add_parameter
(
"workspace"
,
shape
{
shape
::
int8_type
,
{
n
}});
for
(
auto
&&
a
:
allocs
)
{
p
.
replace_instruction
(
a
,
ws
);
p
.
remove_instruction
(
a
);
m
.
replace_instruction
(
a
,
ws
);
m
.
remove_instruction
(
a
);
}
}
}
...
...
src/targets/gpu/fuse_ops.cpp
View file @
faefeef9
...
...
@@ -316,7 +316,7 @@ struct find_layernorm
{
auto
matcher
()
const
{
return
match
::
layernorm
(
&
gpu_name
);
}
void
apply
(
module
&
p
,
match
::
matcher_result
r
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
auto
ins
=
r
.
result
;
auto
x_ins
=
r
.
instructions
[
"x"
];
...
...
@@ -331,7 +331,7 @@ struct find_layernorm
if
(
relements
>
1024
or
(
relements
%
4
!=
0
and
relements
>
256
))
return
;
p
.
replace_instruction
(
ins
,
hip_layernorm
{},
x_ins
,
args
.
back
());
m
.
replace_instruction
(
ins
,
hip_layernorm
{},
x_ins
,
args
.
back
());
}
};
...
...
@@ -343,11 +343,11 @@ struct find_triadd_layernorm
match
::
used_once
(),
match
::
all_of
[
match
::
inputs
()](
match
::
standard_shape
()))));
}
void
apply
(
module
&
p
,
const
match
::
matcher_result
&
r
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
auto
ins
=
r
.
result
;
auto
triadd
=
ins
->
inputs
().
front
();
p
.
replace_instruction
(
ins
,
hip_triadd_layernorm
{},
triadd
->
inputs
());
m
.
replace_instruction
(
ins
,
hip_triadd_layernorm
{},
triadd
->
inputs
());
}
};
...
...
@@ -355,13 +355,13 @@ struct find_gelu
{
auto
matcher
()
const
{
return
match
::
gelu_erf
(
&
gpu_name
);
}
void
apply
(
module
&
p
,
match
::
matcher_result
r
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
auto
ins
=
r
.
result
;
auto
x_ins
=
r
.
instructions
[
"x"
];
auto
args
=
ins
->
inputs
();
p
.
replace_instruction
(
ins
,
hip_gelu
{},
x_ins
,
args
.
back
());
m
.
replace_instruction
(
ins
,
hip_gelu
{},
x_ins
,
args
.
back
());
}
};
...
...
@@ -372,7 +372,7 @@ struct find_add_gelu
return
match
::
name
(
"gpu::gelu"
)(
match
::
arg
(
0
)(
match
::
name
(
"gpu::add"
).
bind
(
"add"
)));
}
void
apply
(
module
&
p
,
match
::
matcher_result
r
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
auto
add_ins
=
r
.
instructions
[
"add"
];
auto
ins
=
r
.
result
;
...
...
@@ -381,7 +381,7 @@ struct find_add_gelu
move_broadcasted_back
(
args
);
args
.
back
()
=
ins
->
inputs
().
back
();
p
.
replace_instruction
(
ins
,
hip_add_gelu
{},
args
);
m
.
replace_instruction
(
ins
,
hip_add_gelu
{},
args
);
}
};
...
...
@@ -391,16 +391,16 @@ struct find_gelu_new
auto
matcher
()
const
{
return
match
::
gelu_tanh
(
&
gpu_name
);
}
void
apply
(
module
&
p
,
match
::
matcher_result
r
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
auto
ins
=
r
.
result
;
auto
x_ins
=
r
.
instructions
[
"x"
];
auto
args
=
ins
->
inputs
();
if
(
fast_math
)
p
.
replace_instruction
(
ins
,
hip_gelu
{},
x_ins
,
args
.
back
());
m
.
replace_instruction
(
ins
,
hip_gelu
{},
x_ins
,
args
.
back
());
else
p
.
replace_instruction
(
ins
,
hip_gelu_new
{},
x_ins
,
args
.
back
());
m
.
replace_instruction
(
ins
,
hip_gelu_new
{},
x_ins
,
args
.
back
());
}
};
...
...
@@ -411,7 +411,7 @@ struct find_add_gelu_new
return
match
::
name
(
"gpu::gelu_new"
)(
match
::
arg
(
0
)(
match
::
name
(
"gpu::add"
).
bind
(
"add"
)));
}
void
apply
(
module
&
p
,
match
::
matcher_result
r
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
auto
add_ins
=
r
.
instructions
[
"add"
];
auto
ins
=
r
.
result
;
...
...
@@ -420,7 +420,7 @@ struct find_add_gelu_new
move_broadcasted_back
(
args
);
args
.
back
()
=
ins
->
inputs
().
back
();
p
.
replace_instruction
(
ins
,
hip_add_gelu_new
{},
args
);
m
.
replace_instruction
(
ins
,
hip_add_gelu_new
{},
args
);
}
};
...
...
@@ -435,7 +435,7 @@ struct find_add_clip
.
bind
(
"add"
)));
}
void
apply
(
module
&
p
,
match
::
matcher_result
r
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
auto
add_ins
=
r
.
instructions
[
"add"
];
auto
ins
=
r
.
result
;
...
...
@@ -448,9 +448,9 @@ struct find_add_clip
add_args
.
pop_back
();
add_args
.
insert
(
add_args
.
end
(),
std
::
next
(
ins_args
.
begin
()),
ins_args
.
end
());
if
(
add_ins
->
name
()
==
"gpu::add"
)
p
.
replace_instruction
(
ins
,
hip_add_clip
{},
add_args
);
m
.
replace_instruction
(
ins
,
hip_add_clip
{},
add_args
);
else
if
(
add_ins
->
name
()
==
"gpu::triadd"
)
p
.
replace_instruction
(
ins
,
hip_triadd_clip
{},
add_args
);
m
.
replace_instruction
(
ins
,
hip_triadd_clip
{},
add_args
);
}
};
...
...
@@ -470,7 +470,7 @@ struct find_add_unary
.
bind
(
"add"
)));
}
void
apply
(
module
&
p
,
match
::
matcher_result
r
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
auto
add_ins
=
r
.
instructions
[
"add"
];
auto
ins
=
r
.
result
;
...
...
@@ -481,9 +481,9 @@ struct find_add_unary
// Use the allocation from the relu operator
args
.
back
()
=
ins
->
inputs
().
back
();
if
(
add_ins
->
name
()
==
"gpu::add"
)
p
.
replace_instruction
(
ins
,
binary_add_op
,
args
);
m
.
replace_instruction
(
ins
,
binary_add_op
,
args
);
else
if
(
add_ins
->
name
()
==
"gpu::triadd"
)
p
.
replace_instruction
(
ins
,
ternary_add_op
,
args
);
m
.
replace_instruction
(
ins
,
ternary_add_op
,
args
);
}
};
...
...
@@ -498,7 +498,7 @@ struct find_triadd
.
bind
(
"input"
)));
}
void
apply
(
module
&
p
,
match
::
matcher_result
r
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
auto
add_ins
=
r
.
instructions
[
"add"
];
auto
input_ins
=
r
.
instructions
[
"input"
];
...
...
@@ -513,7 +513,7 @@ struct find_triadd
move_broadcasted_back
(
args
);
args
.
back
()
=
ins
->
inputs
().
back
();
p
.
replace_instruction
(
ins
,
hip_triadd
{},
args
);
m
.
replace_instruction
(
ins
,
hip_triadd
{},
args
);
}
};
...
...
@@ -525,7 +525,7 @@ struct find_mul_add
match
::
name
(
"gpu::mul"
)(
match
::
used_once
()).
bind
(
"mul"
),
match
::
any
().
bind
(
"b"
)));
}
void
apply
(
module
&
p
,
match
::
matcher_result
r
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
auto
mul_ins
=
r
.
instructions
[
"mul"
];
auto
b_ins
=
r
.
instructions
[
"b"
];
...
...
@@ -538,7 +538,7 @@ struct find_mul_add
args
.
insert
(
std
::
prev
(
args
.
end
()),
b_ins
);
args
.
back
()
=
ins
->
inputs
().
back
();
p
.
replace_instruction
(
ins
,
hip_mul_add
{},
args
);
m
.
replace_instruction
(
ins
,
hip_mul_add
{},
args
);
}
};
...
...
@@ -550,7 +550,7 @@ struct find_mul_add_relu
match
::
arg
(
0
)(
match
::
name
(
"gpu::mul_add"
)(
match
::
used_once
()).
bind
(
"mul_add"
)));
}
void
apply
(
module
&
p
,
match
::
matcher_result
r
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
auto
mul_add_ins
=
r
.
instructions
[
"mul_add"
];
auto
ins
=
r
.
result
;
...
...
@@ -558,7 +558,7 @@ struct find_mul_add_relu
// Use the allocation from the relu operator
args
.
back
()
=
ins
->
inputs
().
back
();
p
.
replace_instruction
(
ins
,
hip_mul_add_relu
{},
args
);
m
.
replace_instruction
(
ins
,
hip_mul_add_relu
{},
args
);
}
};
...
...
@@ -783,7 +783,7 @@ auto conv_bias(Ms... ms)
}
template
<
class
Op
>
void
apply_conv_bias
(
context
&
ctx
,
module
&
p
,
match
::
matcher_result
r
)
void
apply_conv_bias
(
context
&
ctx
,
module
&
m
,
const
match
::
matcher_result
&
r
)
{
auto
conv_ins
=
r
.
instructions
[
"conv"
];
auto
bias_ins
=
r
.
instructions
[
"bias"
];
...
...
@@ -798,7 +798,7 @@ void apply_conv_bias(context& ctx, module& p, match::matcher_result r)
// TODO: Insert ws allocation
auto
ws
=
cb
.
get_workspace
(
ctx
);
(
void
)
ws
;
p
.
replace_instruction
(
ins
,
cb
,
input_ins
,
weights_ins
,
old_ws_ins
,
bias_ins
,
alloc_ins
);
m
.
replace_instruction
(
ins
,
cb
,
input_ins
,
weights_ins
,
old_ws_ins
,
bias_ins
,
alloc_ins
);
}
inline
auto
precompile_name
(
std
::
string
s
)
// NOLINT
...
...
@@ -829,9 +829,9 @@ struct find_conv_bias
match
::
output
(
match
::
name
(
std
::
unordered_set
<
std
::
string
>
{
"gpu::relu"
}))));
}
void
apply
(
module
&
p
,
match
::
matcher_result
r
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
apply_conv_bias
<
miopen_conv_bias
>
(
*
ctx
,
p
,
std
::
move
(
r
)
);
apply_conv_bias
<
miopen_conv_bias
>
(
*
ctx
,
m
,
r
);
}
};
...
...
@@ -840,9 +840,9 @@ struct find_conv_bias_relu
context
*
ctx
=
nullptr
;
auto
matcher
()
const
{
return
match
::
name
(
"gpu::relu"
)(
match
::
arg
(
0
)(
conv_bias
()));
}
void
apply
(
module
&
p
,
match
::
matcher_result
r
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
apply_conv_bias
<
miopen_conv_bias_relu
>
(
*
ctx
,
p
,
std
::
move
(
r
)
);
apply_conv_bias
<
miopen_conv_bias_relu
>
(
*
ctx
,
m
,
r
);
}
};
...
...
@@ -857,7 +857,7 @@ struct find_conv_pointwise
fusable_conv
(
match
::
used_once
()).
bind
(
"conv"
)));
}
void
apply
(
module
&
m
,
match
::
matcher_result
r
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
auto
conv_ins
=
r
.
instructions
[
"conv"
];
auto
bias_ins
=
r
.
instructions
[
"bias"
];
...
...
@@ -896,7 +896,7 @@ struct find_gemm_add
match
::
name
(
"gpu::gemm"
)(
match
::
nargs
(
3
)).
bind
(
"gemm"
)));
}
void
apply
(
module
&
p
,
match
::
matcher_result
r
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
auto
ins
=
r
.
result
;
auto
gemm_ins
=
r
.
instructions
[
"gemm"
];
...
...
@@ -908,26 +908,68 @@ struct find_gemm_add
if
(
not
float_equal
(
gemm
.
beta
,
0
))
return
;
if
(
std
::
any_of
(
ins
->
inputs
().
begin
(),
ins
->
inputs
().
end
(),
[](
auto
i
)
{
return
not
i
->
get_shape
().
standard
();
}))
return
;
auto
inputs
=
gemm_ins
->
inputs
();
inputs
.
pop_back
();
auto
copy_ins
=
c_ins
;
// Insert copy
if
(
ins
==
p
.
end
()
or
c_ins
->
outputs
().
size
()
>
1
or
c_ins
->
inputs
().
empty
())
if
(
ins
==
m
.
end
()
or
c_ins
->
outputs
().
size
()
>
1
or
c_ins
->
inputs
().
empty
())
{
copy_ins
=
p
.
insert_instruction
(
ins
,
hip_copy
{},
c_ins
,
ins
->
inputs
().
back
());
copy_ins
=
m
.
insert_instruction
(
ins
,
hip_copy
{},
c_ins
,
ins
->
inputs
().
back
());
}
inputs
.
push_back
(
copy_ins
);
inputs
.
push_back
(
copy_ins
);
gemm
.
beta
=
1
;
p
.
replace_instruction
(
ins
,
gemm
,
inputs
);
m
.
replace_instruction
(
ins
,
gemm
,
inputs
);
}
};
auto
pointwise_name
(
const
std
::
string
&
s
)
{
return
precompile_name
(
"pointwise"
)(
match
::
make_basic_pred_matcher
([
=
](
auto
ins
)
{
module_ref
pm
=
ins
->
module_inputs
().
front
();
auto
n
=
std
::
count_if
(
pm
->
begin
(),
pm
->
end
(),
[
&
](
auto
&
i
)
{
return
i
.
name
()
==
s
;
});
if
(
n
!=
1
)
return
false
;
return
std
::
all_of
(
pm
->
begin
(),
pm
->
end
(),
[
&
](
auto
&
i
)
{
return
starts_with
(
i
.
name
(),
"@"
)
or
i
.
name
()
==
s
;
});
}));
}
struct
find_gemm_pointwise
{
auto
matcher
()
const
{
return
pointwise_name
(
"add"
)(
match
::
nargs
(
3
),
match
::
all_of
[
match
::
inputs
()](
match
::
standard_shape
()),
match
::
either_arg
(
0
,
1
)(
match
::
used_once
().
bind
(
"c"
),
match
::
name
(
"gpu::gemm"
)(
match
::
nargs
(
3
)).
bind
(
"gemm"
)));
}
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
auto
ins
=
r
.
result
;
auto
gemm_ins
=
r
.
instructions
[
"gemm"
];
auto
c_ins
=
r
.
instructions
[
"c"
];
auto
gemm
=
any_cast
<
rocblas_gemm
<
op
::
dot
>>
(
gemm_ins
->
get_operator
());
// Already fused gemm
if
(
not
float_equal
(
gemm
.
beta
,
0
))
return
;
auto
inputs
=
gemm_ins
->
inputs
();
inputs
.
pop_back
();
inputs
.
push_back
(
c_ins
);
inputs
.
push_back
(
gemm_ins
->
inputs
().
back
());
gemm
.
beta
=
1
;
m
.
replace_instruction
(
ins
,
gemm
,
inputs
);
}
};
...
...
@@ -938,22 +980,22 @@ struct find_commutative_broadcast
return
match
::
name
(
"gpu::add"
,
"gpu::mul"
)(
match
::
arg
(
1
)(
match
::
broadcast_shape
()));
}
void
apply
(
module
&
p
,
const
match
::
matcher_result
&
r
)
const
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
auto
ins
=
r
.
result
;
auto
args
=
ins
->
inputs
();
move_broadcasted_back
(
args
);
p
.
replace_instruction
(
ins
,
ins
->
get_operator
(),
args
);
m
.
replace_instruction
(
ins
,
ins
->
get_operator
(),
args
);
}
};
void
fuse_ops
::
apply
(
module
&
p
)
const
void
fuse_ops
::
apply
(
module
&
m
)
const
{
match
::
find_matches
(
p
,
find_gelu
{},
find_gelu_new
{
fast_math
});
run_passes
(
p
,
{
dead_code_elimination
{}});
match
::
find_matches
(
p
,
find_triadd
{});
match
::
find_matches
(
p
,
match
::
find_matches
(
m
,
find_gelu
{},
find_gelu_new
{
fast_math
});
run_passes
(
m
,
{
dead_code_elimination
{}});
match
::
find_matches
(
m
,
find_triadd
{});
match
::
find_matches
(
m
,
find_layernorm
{},
find_conv_pointwise
{
ctx
},
find_conv_bias_relu
{
ctx
},
...
...
@@ -966,8 +1008,12 @@ void fuse_ops::apply(module& p) const
find_add_unary
{
"gpu::sigmoid"
,
hip_add_sigmoid
{},
hip_triadd_sigmoid
{}},
find_add_unary
{
"gpu::tanh"
,
hip_add_tanh
{},
hip_triadd_tanh
{}},
find_add_clip
{});
run_passes
(
p
,
{
dead_code_elimination
{}});
match
::
find_matches
(
p
,
find_triadd_layernorm
{},
find_gemm_add
{},
find_commutative_broadcast
{});
run_passes
(
m
,
{
dead_code_elimination
{}});
match
::
find_matches
(
m
,
find_triadd_layernorm
{},
find_gemm_add
{},
find_gemm_pointwise
{},
find_commutative_broadcast
{});
}
}
// namespace gpu
...
...
src/targets/gpu/gemm_impl.cpp
View file @
faefeef9
#include <rocblas.h>
#include <migraphx/gpu/gemm_impl.hpp>
#include <migraphx/reduce_dims.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
...
...
@@ -27,6 +28,22 @@ rocblas_datatype get_type(shape::type_t type)
MIGRAPHX_THROW
(
"ROCBLAS_GEMM: data type not supported!"
);
}
void
blas_shape
(
const
shape
&
s
)
{
if
(
s
.
lens
().
size
()
<
2
)
return
;
if
(
std
::
none_of
(
s
.
strides
().
end
()
-
2
,
s
.
strides
().
end
(),
[
&
](
auto
i
)
{
return
i
==
1
;
}))
MIGRAPHX_THROW
(
"GPU_GEMM: needs to have one matrix stride as 1"
);
if
(
s
.
lens
().
size
()
<
3
)
return
;
shape
batch_shape
{
s
.
type
(),
{
s
.
lens
().
begin
(),
s
.
lens
().
end
()
-
2
},
{
s
.
strides
().
begin
(),
s
.
strides
().
end
()
-
2
}};
auto
batch_shapes
=
reduce_dims
({
batch_shape
});
if
(
batch_shapes
.
front
().
lens
().
size
()
!=
1
)
MIGRAPHX_THROW
(
"GPU_GEMM: Batch dimension is not collapsible"
);
}
template
<
class
R
,
class
...
Ts
,
class
...
Us
>
R
rocblas_invoke
(
R
(
*
f
)(
Ts
...),
Us
...
xs
)
{
...
...
@@ -36,6 +53,18 @@ R rocblas_invoke(R (*f)(Ts...), Us... xs)
return
f
(
xs
...,
nullptr
,
nullptr
);
}
static
bool
is_transposed
(
const
shape
&
s
)
{
if
(
not
s
.
transposed
())
return
false
;
return
s
.
strides
().
back
()
!=
1
;
}
static
rocblas_int
get_batch_stride
(
const
argument
&
a
)
{
return
a
.
get_shape
().
strides
()[
a
.
get_shape
().
strides
().
size
()
-
3
];
}
template
<
class
T
>
void
gemm_impl
(
context
&
ctx
,
const
shape
&
output_shape
,
...
...
@@ -45,8 +74,8 @@ void gemm_impl(context& ctx,
bool
int8_x4_format
,
bool
compute_fp32
)
{
bool
transa
=
args
[
0
].
get_shape
()
.
transposed
(
);
bool
transb
=
args
[
1
].
get_shape
()
.
transposed
(
);
bool
transa
=
is_transposed
(
args
[
0
].
get_shape
());
bool
transb
=
is_transposed
(
args
[
1
].
get_shape
());
auto
n_dim
=
output_shape
.
lens
().
size
();
auto
dim_1
=
n_dim
-
1
;
auto
dim_0
=
n_dim
-
2
;
...
...
@@ -142,6 +171,9 @@ void gemm_impl(context& ctx,
}
else
{
auto
a_stride
=
get_batch_stride
(
args
[
0
]);
auto
b_stride
=
get_batch_stride
(
args
[
1
]);
auto
c_stride
=
get_batch_stride
(
args
[
2
]);
rocblas_invoke
(
&
rocblas_gemm_strided_batched_ex
,
ctx
.
get_stream
().
get_rocblas
(),
transb
?
rocblas_operation_transpose
:
rocblas_operation_none
,
...
...
@@ -153,20 +185,20 @@ void gemm_impl(context& ctx,
to_pointer
(
args
.
at
(
1
)),
arg_type
,
ldb
,
k
*
n
,
b_stride
,
to_pointer
(
args
.
at
(
0
)),
arg_type
,
lda
,
m
*
k
,
a_stride
,
beta_v
,
to_pointer
(
args
[
2
]),
output_type
,
ldc
,
m
*
n
,
c_stride
,
is_3inputs
?
to_pointer
(
args
[
3
])
:
to_pointer
(
args
[
2
]),
output_type
,
ldc
,
m
*
n
,
c_stride
,
num_matrices
,
compute_type
,
rocblas_gemm_algo_standard
,
...
...
src/targets/gpu/include/migraphx/gpu/analyze_streams.hpp
View file @
faefeef9
...
...
@@ -11,7 +11,7 @@ struct module;
namespace
gpu
{
std
::
vector
<
stream_race
>
analyze_streams
(
const
module
&
p
);
std
::
vector
<
stream_race
>
analyze_streams
(
const
module
&
m
);
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
...
...
src/targets/gpu/include/migraphx/gpu/eliminate_workspace.hpp
View file @
faefeef9
...
...
@@ -14,7 +14,7 @@ namespace gpu {
struct
eliminate_workspace
{
std
::
string
name
()
const
{
return
"eliminate_workspace"
;
}
void
apply
(
module
&
p
)
const
;
void
apply
(
module
&
m
)
const
;
};
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
...
...
src/targets/gpu/include/migraphx/gpu/fuse_ops.hpp
View file @
faefeef9
...
...
@@ -16,7 +16,7 @@ struct fuse_ops
context
*
ctx
=
nullptr
;
bool
fast_math
=
true
;
std
::
string
name
()
const
{
return
"gpu::fuse_ops"
;
}
void
apply
(
module
&
p
)
const
;
void
apply
(
module
&
m
)
const
;
};
}
// namespace gpu
...
...
src/targets/gpu/include/migraphx/gpu/gemm.hpp
View file @
faefeef9
...
...
@@ -18,6 +18,8 @@ namespace gpu {
struct
context
;
void
blas_shape
(
const
shape
&
s
);
template
<
class
Op
>
struct
rocblas_gemm
{
...
...
@@ -50,13 +52,14 @@ struct rocblas_gemm
std
::
vector
<
shape
>
in_shapes
(
inputs
);
in_shapes
.
pop_back
();
check_shapes
{
in_shapes
,
*
this
}.
not_broadcasted
();
b
atch_not_transposed
(
inputs
[
0
].
strides
()
);
b
atch_not_transposed
(
inputs
[
1
].
strides
()
);
b
las_shape
(
inputs
[
0
]
);
b
las_shape
(
inputs
[
1
]
);
// if gemm and add are fused
if
(
not
float_equal
(
beta
,
0
)
)
if
(
in_shapes
.
size
()
>
2
)
{
auto
cmat_shape
=
in_shapes
.
back
();
in_shapes
.
pop_back
();
blas_shape
(
cmat_shape
);
auto
op_out_shape
=
op
.
compute_shape
(
in_shapes
);
if
(
cmat_shape
.
lens
()
!=
op_out_shape
.
lens
())
{
...
...
@@ -71,6 +74,7 @@ struct rocblas_gemm
to_string
(
cmat_shape
.
type
())
+
", it must be: "
+
to_string
(
op_out_shape
.
type
()));
}
return
op_out_shape
;
}
return
op
.
compute_shape
(
in_shapes
);
...
...
@@ -96,28 +100,6 @@ struct rocblas_gemm
return
args
.
back
();
}
void
batch_not_transposed
(
const
std
::
vector
<
std
::
size_t
>&
strides
)
const
{
if
(
strides
.
size
()
<=
2
)
return
;
auto
dim_0
=
strides
.
size
()
-
2
;
auto
matrix_size
=
std
::
max
(
strides
[
dim_0
],
strides
[
dim_0
+
1
]);
std
::
vector
<
std
::
size_t
>
batch
(
strides
.
begin
(),
strides
.
begin
()
+
dim_0
);
if
(
std
::
all_of
(
batch
.
begin
(),
batch
.
end
(),
[
&
](
auto
i
)
{
return
(
i
<
matrix_size
);
}))
{
MIGRAPHX_THROW
(
"GPU_GEMM: matrix size and batch size {"
+
to_string_range
(
strides
)
+
"} are transposed!"
);
}
if
(
std
::
adjacent_find
(
batch
.
begin
(),
batch
.
end
(),
[
&
](
auto
i
,
auto
j
)
{
return
(
i
<
j
or
i
<
matrix_size
or
j
<
matrix_size
);
})
!=
batch
.
end
())
{
MIGRAPHX_THROW
(
"GPU_GEMM: batch size {"
+
to_string_range
(
strides
)
+
"} is transposed!"
);
}
}
std
::
ptrdiff_t
output_alias
(
const
std
::
vector
<
shape
>&
shapes
)
const
{
return
shapes
.
size
()
-
1
;
...
...
src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp
0 → 100644
View file @
faefeef9
#ifndef MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP
#define MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP
#include <migraphx/config.hpp>
#include <migraphx/gpu/context.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
struct
module
;
namespace
gpu
{
struct
prefuse_ops
{
std
::
string
name
()
const
{
return
"gpu::prefuse_ops"
;
}
void
apply
(
module
&
m
)
const
;
};
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment