gaoqiong / MIGraphX / Commits

Commit eb0d8fee, authored Jun 04, 2019 by Paul

Merge branch 'develop' into driver

Parents: 65ef35cd, 0d796941
Changes: 320

Showing 20 changed files with 1503 additions and 368 deletions (+1503 -3683)
src/py/migraphx_py.cpp                            +53   -3
src/quantization.cpp                              +107  -0
src/rewrite_rnn.cpp                               +526  -12
src/schedule.cpp                                  +434  -0
src/shape.cpp                                     +1    -1
src/simplify_algebra.cpp                          +1    -1
src/simplify_reshapes.cpp                         +7    -9
src/targets/cpu/gemm.cpp                          +42   -15
src/targets/cpu/include/migraphx/cpu/target.hpp   +1    -0
src/targets/cpu/lowering.cpp                      +209  -288
src/targets/cpu/target.cpp                        +4    -2
src/targets/gpu/CMakeLists.txt                    +7    -0
src/targets/gpu/abs.cpp                           +3    -6
src/targets/gpu/adjust_allocation.cpp             +37   -0
src/targets/gpu/batchnorm.cpp                     +1    -4
src/targets/gpu/clip.cpp                          +23   -0
src/targets/gpu/concat.cpp                        +1    -4
src/targets/gpu/contiguous.cpp                    +2    -4
src/targets/gpu/convolution.cpp                   +22   -19
src/targets/gpu/device/clip.cpp                   +22   -0
src/py/migraphx_py.cpp
View file @ eb0d8fee

@@ -2,10 +2,16 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <migraphx/program.hpp>
+#include <migraphx/quantization.hpp>
 #include <migraphx/generate.hpp>
 #include <migraphx/cpu/target.hpp>
-#include <migraphx/onnx.hpp>
+#include <migraphx/stringutils.hpp>
+#ifdef ENABLE_TF
+#include <migraphx/tf.hpp>
+#else
+#include <migraphx/onnx.hpp>
+#endif
 #ifdef HAVE_GPU
 #include <migraphx/gpu/target.hpp>
 #include <migraphx/gpu/hip.hpp>

@@ -28,6 +34,11 @@ struct throw_half
     {
         throw std::runtime_error("Half not supported in python yet.");
     }
+
+    void operator()(migraphx::tensor_view<migraphx::half>) const
+    {
+        throw std::runtime_error("Half not supported in python yet.");
+    }
 };

 template <class F>

@@ -42,6 +53,8 @@ struct skip_half
     }
     void operator()(migraphx::shape::as<migraphx::half>) const {}
+    void operator()(migraphx::tensor_view<migraphx::half>) const {}
+
 };

 template <class F>

@@ -50,6 +63,12 @@ void visit_type(const migraphx::shape& s, F f)
     s.visit_type(throw_half<F>{f});
 }

+template <class T, class F>
+void visit(const migraphx::raw_data<T>& x, F f)
+{
+    x.visit(throw_half<F>{f});
+}
+
 template <class F>
 void visit_types(F f)
 {

@@ -60,6 +79,9 @@ template <class T>
 py::buffer_info to_buffer_info(T& x)
 {
     migraphx::shape s = x.get_shape();
+    auto strides      = s.strides();
+    std::transform(
+        strides.begin(), strides.end(), strides.begin(), [&](auto i) { return i * s.type_size(); });
     py::buffer_info b;
     visit_type(s, [&](auto as) {
         b = py::buffer_info(x.data(),

@@ -67,7 +89,7 @@ py::buffer_info to_buffer_info(T& x)
                             py::format_descriptor<decltype(as())>::format(),
                             s.lens().size(),
                             s.lens(),
-                            s.strides());
+                            strides);
     });
     return b;
 }

@@ -75,11 +97,20 @@ py::buffer_info to_buffer_info(T& x)
 migraphx::shape to_shape(const py::buffer_info& info)
 {
     migraphx::shape::type_t t;
+    std::size_t n = 0;
     visit_types([&](auto as) {
         if(info.format == py::format_descriptor<decltype(as())>::format())
+        {
             t = as.type_enum();
+            n = sizeof(as());
+        }
     });
+    auto strides = info.strides;
+    std::transform(strides.begin(), strides.end(), strides.begin(), [&](auto i) -> std::size_t {
+        return n > 0 ? i / n : 0;
+    });
+
-    return migraphx::shape{t, info.shape, info.strides};
+    return migraphx::shape{t, info.shape, strides};
 }

 PYBIND11_MODULE(migraphx, m)

@@ -108,6 +139,13 @@ PYBIND11_MODULE(migraphx, m)
             py::buffer_info info = b.request();
             new(&x) migraphx::argument(to_shape(info), info.ptr);
         })
         .def("get_shape", &migraphx::argument::get_shape)
+        .def("tolist",
+             [](migraphx::argument& x) {
+                 py::list l{x.get_shape().elements()};
+                 visit(x, [&](auto data) { l = py::cast(data.to_vector()); });
+                 return l;
+             })
         .def("__eq__", std::equal_to<migraphx::argument>{})
         .def("__ne__", std::not_equal_to<migraphx::argument>{})
         .def("__repr__", [](const migraphx::argument& x) { return migraphx::to_string(x); });

@@ -123,8 +161,16 @@ PYBIND11_MODULE(migraphx, m)
         .def("__ne__", std::not_equal_to<migraphx::program>{})
         .def("__repr__", [](const migraphx::program& p) { return migraphx::to_string(p); });

+#ifdef ENABLE_TF
+    m.def("parse_tf",
+          &migraphx::parse_tf,
+          "Parse tf protobuf (default format is nhwc)",
+          py::arg("filename"),
+          py::arg("is_nhwc") = true);
+#else
     m.def("parse_onnx", &migraphx::parse_onnx);
+#endif

     m.def("get_target", [](const std::string& name) -> migraphx::target {
         if(name == "cpu")
             return migraphx::cpu::target{};

@@ -136,6 +182,10 @@ PYBIND11_MODULE(migraphx, m)
     });

     m.def("generate_argument", &migraphx::generate_argument, py::arg("s"), py::arg("seed") = 0);
+    m.def("quantize", [](migraphx::program& p, std::vector<std::string>& ins_names) {
+        migraphx::quantize(p, ins_names);
+    });
+    m.def("quantize", [](migraphx::program& p) { migraphx::quantize(p, {"all"}); });
 #ifdef HAVE_GPU
     m.def("allocate_gpu", &migraphx::gpu::allocate_gpu, py::arg("s"), py::arg("host") = false);
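The two stride transforms above exist because pybind11 buffers express strides in bytes, while migraphx::shape stores them as element counts: to_buffer_info multiplies by type_size() on the way out, and to_shape divides by the element size on the way in. Below is a minimal standalone sketch of the incoming conversion; the function name to_element_strides is introduced here for illustration and is not part of the patch.

#include <algorithm>
#include <cstddef>
#include <vector>

// Convert numpy/pybind11 byte strides into element strides.
// n is the element size in bytes; n == 0 covers buffers whose
// format matched none of the visited types, mirroring the guard above.
std::vector<std::size_t> to_element_strides(std::vector<std::size_t> strides, std::size_t n)
{
    std::transform(strides.begin(), strides.end(), strides.begin(), [&](auto i) -> std::size_t {
        return n > 0 ? i / n : 0;
    });
    return strides;
}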
src/quantization.cpp
new file mode 100644
View file @ eb0d8fee

#include <migraphx/quantization.hpp>
#include <migraphx/program.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/op/convert.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/ranges.hpp>
#include <utility>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {

instruction_ref insert_fp16(program& prog,
                            instruction_ref& ins,
                            shape::type_t type,
                            std::unordered_map<instruction_ref, instruction_ref>& map_fp16)
{
    if(map_fp16.count(ins) > 0)
    {
        return map_fp16[ins];
    }

    assert(ins->get_shape().type() == shape::float_type ||
           ins->get_shape().type() == shape::double_type);
    instruction_ref ins_fp16{};
    ins_fp16      = prog.insert_instruction(std::next(ins), op::convert{type}, ins);
    map_fp16[ins] = ins_fp16;

    return ins_fp16;
}

void quantize(program& prog, const std::vector<std::string>& ins_names)
{
    std::unordered_map<instruction_ref, instruction_ref> map_fp16;
    for(auto ins : iterator_for(prog))
    {
        // "all" indicates every instruction is converted
        if((not contains(ins_names, "all")) and (not contains(ins_names, ins->name())))
        {
            continue;
        }

        shape::type_t orig_type = ins->get_shape().type();

        // process all inputs; if an input is fp32 or fp64, convert it
        // to fp16 by adding a convert operator.
        auto inputs = ins->inputs();
        std::vector<instruction_ref> converted_inputs;
        for(auto input : inputs)
        {
            auto s = input->get_shape();
            if(s.type() == shape::float_type || s.type() == shape::double_type)
            {
                // if the input is a convert operator, use its input
                // as the current input
                instruction_ref input_fp16{};
                if(input->name() == "convert")
                {
                    input_fp16 = input->inputs().front();
                }
                else
                {
                    input_fp16 = insert_fp16(prog, input, shape::half_type, map_fp16);
                }
                converted_inputs.push_back(input_fp16);
            }
            else
            {
                converted_inputs.push_back(input);
            }
        }

        // no change for the inputs, go to the next instruction
        if(inputs == converted_inputs)
        {
            continue;
        }

        auto op        = ins->get_operator();
        auto ins_shape = compute_shape(op, converted_inputs);
        if(ins_shape.type() != orig_type)
        {
            // insert another convert instruction to convert it back
            if(ins == std::prev(prog.end()))
            {
                prog.add_instruction(op::convert{orig_type}, ins);
            }
            else
            {
                // check the dead code case to avoid assert
                bool output_empty = ins->outputs().empty();
                auto ins_orig_type =
                    prog.insert_instruction(std::next(ins), op::convert{orig_type}, ins);
                if(!output_empty)
                {
                    prog.replace_instruction(ins, ins_orig_type);
                }
            }
        }

        prog.replace_instruction(ins, op, converted_inputs);
    }
}

void quantize(program& prog) { quantize(prog, {"all"}); }

} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
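For orientation, a minimal sketch of how this new pass might be driven from C++. The model file name and the op-name list are placeholders, not part of the patch:

#include <migraphx/onnx.hpp>
#include <migraphx/quantization.hpp>

int main()
{
    // "model.onnx" is a placeholder path for this sketch.
    migraphx::program p = migraphx::parse_onnx("model.onnx");

    // Convert only the named instructions to fp16: a convert{half_type} is
    // inserted on each fp32/fp64 input, and a convert back to the original
    // type is appended after the instruction when its output type changes.
    migraphx::quantize(p, {"dot", "convolution"});

    // Or convert every instruction; equivalent to quantize(p, {"all"}):
    // migraphx::quantize(p);
    return 0;
}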
src/rewrite_rnn.cpp
View file @ eb0d8fee

@@ -4,6 +4,7 @@
 #include <migraphx/operators.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/dfor.hpp>
+#include <migraphx/op/common.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

@@ -16,11 +17,14 @@ void rewrite_rnn::apply(program& prog) const
         {
             apply_vanilla_rnn(prog, ins);
         }
-        if(ins->name() == "gru")
+        else if(ins->name() == "gru")
         {
             apply_gru(prog, ins);
         }
+        else if(ins->name() == "lstm")
+        {
+            apply_lstm(prog, ins);
+        }
     }
 }

@@ -210,7 +214,7 @@ std::vector<instruction_ref> rewrite_rnn::vanilla_rnn_cell(bool is_forward,
         auto wb = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sbias);
         auto rb = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sbias);
         auto b  = prog.insert_instruction(ins, op::add{}, wb, rb);
-        bias    = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape()}, b);
+        bias    = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape().lens()}, b);
     }

     instruction_ref hidden_out = prog.end();

@@ -517,25 +521,26 @@ std::vector<instruction_ref> rewrite_rnn::gru_cell(bool is_forward,
     instruction_ref brcst_bh{};
     if(bias != prog.end())
     {
-        auto sbias = prog.insert_instruction(ins, op::squeeze{{0}}, bias);
-        auto wbz   = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sbias);
-        auto wbr   = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sbias);
-        auto wbh   = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, sbias);
-        brcst_wbh  = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape()}, wbh);
+        auto broadcast_lens = sih->get_shape().lens();
+        auto sbias          = prog.insert_instruction(ins, op::squeeze{{0}}, bias);
+        auto wbz            = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sbias);
+        auto wbr            = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sbias);
+        auto wbh = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, sbias);
+        brcst_wbh = prog.insert_instruction(ins, op::broadcast{1, broadcast_lens}, wbh);

         auto rbz = prog.insert_instruction(ins, op::slice{{0}, {3 * hs}, {4 * hs}}, sbias);
         auto rbr = prog.insert_instruction(ins, op::slice{{0}, {4 * hs}, {5 * hs}}, sbias);
         auto rbh = prog.insert_instruction(ins, op::slice{{0}, {5 * hs}, {6 * hs}}, sbias);
-        brcst_rbh = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape()}, rbh);
+        brcst_rbh = prog.insert_instruction(ins, op::broadcast{1, broadcast_lens}, rbh);

         auto bz  = prog.insert_instruction(ins, op::add{}, wbz, rbz);
-        brcst_bz = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape()}, bz);
+        brcst_bz = prog.insert_instruction(ins, op::broadcast{1, broadcast_lens}, bz);

         auto br  = prog.insert_instruction(ins, op::add{}, wbr, rbr);
-        brcst_br = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape()}, br);
+        brcst_br = prog.insert_instruction(ins, op::broadcast{1, broadcast_lens}, br);

         auto bh  = prog.insert_instruction(ins, op::add{}, wbh, rbh);
-        brcst_bh = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape()}, bh);
+        brcst_bh = prog.insert_instruction(ins, op::broadcast{1, broadcast_lens}, bh);
     }

     for(long i = 0; i < seq_len; i++)

@@ -664,5 +669,514 @@ std::vector<operation> rewrite_rnn::gru_actv_funcs(instruction_ref ins) const
    }
}

// for lstm operators
void rewrite_rnn::apply_lstm(program& prog, instruction_ref ins) const
{
    assert(ins->name() == "lstm");
    auto args = ins->inputs();

    shape seq_shape         = args[0]->get_shape();
    std::size_t hidden_size = args[2]->get_shape().lens()[2];
    std::size_t batch_size  = seq_shape.lens()[1];
    shape::type_t type      = seq_shape.type();
    migraphx::shape ihc_shape{type, {1, batch_size, hidden_size}};
    std::vector<float> ihc_data(ihc_shape.elements(), 0.0);
    migraphx::shape pph_shape{type, {1, 3 * hidden_size}};
    std::vector<float> pph_data(pph_shape.elements(), 0.0);

    auto actv_funcs         = lstm_actv_funcs(ins);
    auto lstm_op            = any_cast<op::lstm>(ins->get_operator());
    op::rnn_direction dirct = lstm_op.direction;

    instruction_ref last_output{};
    instruction_ref last_cell_output{};
    if(dirct == op::rnn_direction::bidirectional)
    {
        // input weight matrix
        auto w_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[1]);
        auto w_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[1]);

        // hidden state weight matrix
        auto r_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[2]);
        auto r_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[2]);

        // process bias
        instruction_ref bias_forward = prog.end();
        instruction_ref bias_reverse = prog.end();
        if(args.size() >= 4 && args[3]->name() != "undefined")
        {
            bias_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[3]);
            bias_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[3]);
        }

        // process initial hidden state, it is the 6th argument
        instruction_ref ih_forward{};
        instruction_ref ih_reverse{};
        if(args.size() >= 6 && args[5]->name() != "undefined")
        {
            ih_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[5]);
            ih_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[5]);
        }
        else
        {
            ih_forward = prog.add_literal(migraphx::literal{ihc_shape, ihc_data});
            ih_reverse = prog.add_literal(migraphx::literal{ihc_shape, ihc_data});
        }

        // process initial cell value
        instruction_ref ic_forward{};
        instruction_ref ic_reverse{};
        if(args.size() >= 7 && args[6]->name() != "undefined")
        {
            ic_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[6]);
            ic_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[6]);
        }
        else
        {
            ic_forward = prog.add_literal(migraphx::literal{ihc_shape, ihc_data});
            ic_reverse = prog.add_literal(migraphx::literal{ihc_shape, ihc_data});
        }

        // process weight of the peephole
        instruction_ref pph_forward = prog.end();
        instruction_ref pph_reverse = prog.end();
        if(args.size() == 8 && args[7]->name() != "undefined")
        {
            pph_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[7]);
            pph_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[7]);
        }

        auto ret_forward =
            lstm_cell(true,
                      prog,
                      ins,
                      {args[0], w_forward, r_forward, bias_forward, ih_forward, ic_forward, pph_forward},
                      actv_funcs.at(0),
                      actv_funcs.at(1),
                      actv_funcs.at(2));
        auto ret_reverse =
            lstm_cell(false,
                      prog,
                      ins,
                      {args[0], w_reverse, r_reverse, bias_reverse, ih_reverse, ic_reverse, pph_reverse},
                      actv_funcs.at(3),
                      actv_funcs.at(4),
                      actv_funcs.at(5));

        auto concat_output =
            prog.insert_instruction(ins, op::concat{1}, ret_forward[1], ret_reverse[1]);
        last_output = prog.insert_instruction(ins, op::squeeze{{0}}, concat_output);

        // last cell output
        last_cell_output =
            prog.insert_instruction(ins, op::concat{0}, ret_forward[2], ret_reverse[2]);

        // the following logic is to ensure the last instruction is a concat
        if(ret_forward[0] == prog.end())
        {
            prog.replace_instruction(ins, op::concat{1}, ret_forward[1], ret_reverse[1]);
        }
        else
        {
            ret_forward[0] =
                prog.insert_instruction(ins, op::concat{0}, ret_forward[0], ret_forward[1]);
            ret_reverse[0] =
                prog.insert_instruction(ins, op::concat{0}, ret_reverse[1], ret_reverse[0]);
            prog.replace_instruction(ins, op::concat{1}, {ret_forward[0], ret_reverse[0]});
        }
    }
    else
    {
        bool is_forward = (dirct == op::rnn_direction::forward);
        // weight matrices
        auto w = args[1];
        auto r = args[2];

        // bias
        instruction_ref bias = prog.end();
        if(args.size() >= 4 && args[3]->name() != "undefined")
        {
            bias = args[3];
        }

        // initial hidden state
        instruction_ref ih{};
        if(args.size() >= 6 && args[5]->name() != "undefined")
        {
            ih = args[5];
        }
        else
        {
            ih = prog.add_literal(migraphx::literal{ihc_shape, ihc_data});
        }

        // initial cell value
        instruction_ref ic{};
        if(args.size() >= 7 && args[6]->name() != "undefined")
        {
            ic = args[6];
        }
        else
        {
            ic = prog.add_literal(migraphx::literal{ihc_shape, ihc_data});
        }

        // process weight of the peephole
        instruction_ref pph = prog.end();
        if(args.size() == 8 && args[7]->name() != "undefined")
        {
            pph = args[7];
        }

        auto ret = lstm_cell(is_forward,
                             prog,
                             ins,
                             {args[0], w, r, bias, ih, ic, pph},
                             actv_funcs.at(0),
                             actv_funcs.at(1),
                             actv_funcs.at(2));

        last_output      = prog.insert_instruction(ins, op::squeeze{{0}}, ret[1]);
        last_cell_output = ret[2];
        if(ret[0] == prog.end())
        {
            prog.replace_instruction(ins, op::concat{0}, ret[1]);
        }
        else
        {
            auto concat_arg0 = is_forward ? ret[0] : ret[1];
            auto concat_arg1 = is_forward ? ret[1] : ret[0];
            prog.replace_instruction(ins, op::concat{0}, concat_arg0, concat_arg1);
        }
    }

    // replace the corresponding lstm_last_output instruction
    // with the last_output, and the lstm_last_cell_output with
    // the last_cell_output. The while loop is to handle the case
    // of multiple lstm_last_output and lstm_last_cell_output
    // operators
    auto last_output_it = ins->outputs().begin();
    while(last_output_it != ins->outputs().end())
    {
        last_output_it = std::find_if(last_output_it, ins->outputs().end(), [](auto i) {
            return i->name() == "rnn_last_output";
        });

        if(last_output_it != ins->outputs().end())
        {
            prog.replace_instruction(*last_output_it, last_output);
            last_output_it++;
        }
    }

    auto last_cell_output_it = ins->outputs().begin();
    while(last_cell_output_it != ins->outputs().end())
    {
        last_cell_output_it = std::find_if(last_cell_output_it, ins->outputs().end(), [](auto i) {
            return i->name() == "lstm_last_cell_output";
        });

        if(last_cell_output_it != ins->outputs().end())
        {
            prog.replace_instruction(*last_cell_output_it, last_cell_output);
            last_cell_output_it++;
        }
    }
}

std::vector<instruction_ref> rewrite_rnn::lstm_cell(bool is_forward,
                                                    program& prog,
                                                    instruction_ref ins,
                                                    std::vector<instruction_ref> inputs,
                                                    const operation& actv_func1,
                                                    const operation& actv_func2,
                                                    const operation& actv_func3) const
{
    // must have 7 args in the input vector
    assert(inputs.size() == 7);
    auto seq  = inputs.at(0);
    auto w    = inputs.at(1);
    auto r    = inputs.at(2);
    auto bias = inputs.at(3);
    auto ih   = inputs.at(4);
    auto ic   = inputs.at(5);
    auto pph  = inputs.at(6);

    instruction_ref hidden_states = prog.end();
    instruction_ref last_output{};
    instruction_ref last_cell_output{};

    migraphx::shape seq_shape = seq->get_shape();
    migraphx::shape r_shape   = r->get_shape();
    long seq_len              = static_cast<long>(seq_shape.lens()[0]);
    long hs                   = static_cast<long>(r_shape.lens()[2]);

    std::vector<int64_t> perm{1, 0};

    // w matrix
    auto sw      = prog.insert_instruction(ins, op::squeeze{{0}}, w);
    auto wi      = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sw);
    auto tran_wi = prog.insert_instruction(ins, op::transpose{perm}, wi);

    auto wo      = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sw);
    auto tran_wo = prog.insert_instruction(ins, op::transpose{perm}, wo);

    auto wf      = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, sw);
    auto tran_wf = prog.insert_instruction(ins, op::transpose{perm}, wf);

    auto wc      = prog.insert_instruction(ins, op::slice{{0}, {3 * hs}, {4 * hs}}, sw);
    auto tran_wc = prog.insert_instruction(ins, op::transpose{perm}, wc);

    // r matrix
    auto sr      = prog.insert_instruction(ins, op::squeeze{{0}}, r);
    auto ri      = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sr);
    auto tran_ri = prog.insert_instruction(ins, op::transpose{perm}, ri);

    auto ro      = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sr);
    auto tran_ro = prog.insert_instruction(ins, op::transpose{perm}, ro);

    auto rf      = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, sr);
    auto tran_rf = prog.insert_instruction(ins, op::transpose{perm}, rf);

    auto rc      = prog.insert_instruction(ins, op::slice{{0}, {3 * hs}, {4 * hs}}, sr);
    auto tran_rc = prog.insert_instruction(ins, op::transpose{perm}, rc);

    // initial hidden state
    auto sih = prog.insert_instruction(ins, op::squeeze{{0}}, ih);
    // initial cell state
    auto sic     = prog.insert_instruction(ins, op::squeeze{{0}}, ic);
    auto ic_lens = sic->get_shape().lens();

    // bias
    instruction_ref bi_brcst{};
    instruction_ref bo_brcst{};
    instruction_ref bf_brcst{};
    instruction_ref bc_brcst{};
    if(bias != prog.end())
    {
        auto sbias = prog.insert_instruction(ins, op::squeeze{{0}}, bias);
        auto bxi   = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sbias);
        auto bhi   = prog.insert_instruction(ins, op::slice{{0}, {4 * hs}, {5 * hs}}, sbias);
        auto bi    = prog.insert_instruction(ins, op::add{}, bxi, bhi);
        bi_brcst   = prog.insert_instruction(ins, op::broadcast{1, ic_lens}, bi);

        auto bxo = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sbias);
        auto bho = prog.insert_instruction(ins, op::slice{{0}, {5 * hs}, {6 * hs}}, sbias);
        auto bo  = prog.insert_instruction(ins, op::add{}, bxo, bho);
        bo_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_lens}, bo);

        auto bxf = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, sbias);
        auto bhf = prog.insert_instruction(ins, op::slice{{0}, {6 * hs}, {7 * hs}}, sbias);
        auto bf  = prog.insert_instruction(ins, op::add{}, bxf, bhf);
        bf_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_lens}, bf);

        auto bxc = prog.insert_instruction(ins, op::slice{{0}, {3 * hs}, {4 * hs}}, sbias);
        auto bhc = prog.insert_instruction(ins, op::slice{{0}, {7 * hs}, {8 * hs}}, sbias);
        auto bc  = prog.insert_instruction(ins, op::add{}, bxc, bhc);
        bc_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_lens}, bc);
    }

    // peep hole
    instruction_ref pphi_brcst{};
    instruction_ref ppho_brcst{};
    instruction_ref pphf_brcst{};
    if(pph != prog.end())
    {
        auto spph  = prog.insert_instruction(ins, op::squeeze{{0}}, pph);
        auto pphi  = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, spph);
        pphi_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_lens}, pphi);

        auto ppho  = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, spph);
        ppho_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_lens}, ppho);

        auto pphf  = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, spph);
        pphf_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_lens}, pphf);
    }

    for(long i = 0; i < seq_len; ++i)
    {
        long seq_index = is_forward ? i : (seq_len - 1 - i);
        auto xt = prog.insert_instruction(ins, op::slice{{0}, {seq_index}, {seq_index + 1}}, seq);
        xt      = prog.insert_instruction(ins, op::squeeze{{0}}, xt);

        // equation it = f(Xt*(Wi^T) + Ht-1*(Ri^T) + Pi (.) Ct-1 + Wbi + Rbi)
        auto xt_wi          = prog.insert_instruction(ins, op::dot{}, xt, tran_wi);
        auto ht_ri          = prog.insert_instruction(ins, op::dot{}, sih, tran_ri);
        auto it_before_actv = prog.insert_instruction(ins, op::add{}, xt_wi, ht_ri);
        if(pph != prog.end())
        {
            auto pphi_ct   = prog.insert_instruction(ins, op::mul{}, pphi_brcst, sic);
            it_before_actv = prog.insert_instruction(ins, op::add{}, it_before_actv, pphi_ct);
        }
        if(bias != prog.end())
        {
            it_before_actv = prog.insert_instruction(ins, op::add{}, it_before_actv, bi_brcst);
        }
        auto it = prog.insert_instruction(ins, actv_func1, it_before_actv);

        // equation ft = f(Xt*(Wf^T) + Ht-1*(Rf^T) + Pf (.) Ct-1 + Wbf + Rbf)
        auto xt_wf          = prog.insert_instruction(ins, op::dot{}, xt, tran_wf);
        auto ht_rf          = prog.insert_instruction(ins, op::dot{}, sih, tran_rf);
        auto ft_before_actv = prog.insert_instruction(ins, op::add{}, xt_wf, ht_rf);
        if(pph != prog.end())
        {
            auto pphf_ct   = prog.insert_instruction(ins, op::mul{}, pphf_brcst, sic);
            ft_before_actv = prog.insert_instruction(ins, op::add{}, ft_before_actv, pphf_ct);
        }
        if(bias != prog.end())
        {
            ft_before_actv = prog.insert_instruction(ins, op::add{}, ft_before_actv, bf_brcst);
        }
        auto ft = prog.insert_instruction(ins, actv_func1, ft_before_actv);

        // equation ct = g(Xt*(Wc^T) + Ht-1*(Rc^T) + Wbc + Rbc)
        auto xt_wc          = prog.insert_instruction(ins, op::dot{}, xt, tran_wc);
        auto ht_rc          = prog.insert_instruction(ins, op::dot{}, sih, tran_rc);
        auto ct_before_actv = prog.insert_instruction(ins, op::add{}, xt_wc, ht_rc);
        if(bias != prog.end())
        {
            ct_before_actv = prog.insert_instruction(ins, op::add{}, ct_before_actv, bc_brcst);
        }
        auto ct = prog.insert_instruction(ins, actv_func2, ct_before_actv);

        // equation Ct = ft (.) Ct-1 + it (.) ct
        auto ft_cell     = prog.insert_instruction(ins, op::mul{}, ft, sic);
        auto it_ct       = prog.insert_instruction(ins, op::mul{}, it, ct);
        auto cellt       = prog.insert_instruction(ins, op::add{}, ft_cell, it_ct);
        last_cell_output = cellt;

        // ot = f(Xt*(Wo^T) + Ht-1*(Ro^T) + Po (.) Ct + Wbo + Rbo)
        auto xt_wo          = prog.insert_instruction(ins, op::dot{}, xt, tran_wo);
        auto ht_ro          = prog.insert_instruction(ins, op::dot{}, sih, tran_ro);
        auto ot_before_actv = prog.insert_instruction(ins, op::add{}, xt_wo, ht_ro);
        if(pph != prog.end())
        {
            auto ppho_cellt = prog.insert_instruction(ins, op::mul{}, ppho_brcst, cellt);
            ot_before_actv  = prog.insert_instruction(ins, op::add{}, ot_before_actv, ppho_cellt);
        }
        if(bias != prog.end())
        {
            ot_before_actv = prog.insert_instruction(ins, op::add{}, ot_before_actv, bo_brcst);
        }
        auto ot = prog.insert_instruction(ins, actv_func1, ot_before_actv);

        // Ht = ot (.) h(Ct)
        auto h_cellt = prog.insert_instruction(ins, actv_func3, cellt);
        auto ht      = prog.insert_instruction(ins, op::mul{}, ot, h_cellt);
        sic          = cellt;
        sih          = ht;

        last_output = prog.insert_instruction(ins, op::unsqueeze{{0, 1}}, ht);
        if(i < seq_len - 1)
        {
            if(i == 0)
            {
                hidden_states = last_output;
            }
            else
            {
                auto concat_arg0 = is_forward ? hidden_states : last_output;
                auto concat_arg1 = is_forward ? last_output : hidden_states;
                hidden_states =
                    prog.insert_instruction(ins, op::concat{0}, concat_arg0, concat_arg1);
            }
        }
    }

    last_cell_output = prog.insert_instruction(ins, op::unsqueeze{{0}}, last_cell_output);

    return {hidden_states, last_output, last_cell_output};
}

std::vector<operation> rewrite_rnn::lstm_actv_funcs(instruction_ref ins) const
{
    auto lstm_op = any_cast<op::lstm>(ins->get_operator());
    // before rewriting the lstm operator, we need to ensure
    // we have 6 actv funcs, even though a user does not
    // specify any actv func. If fewer than 6 are given, use the
    // algorithm in parse_lstm to make 6 actv functions
    const auto& actv_funcs     = lstm_op.actv_funcs;
    std::size_t num_actv_funcs = actv_funcs.size();
    if(lstm_op.direction == op::rnn_direction::bidirectional)
    {
        switch(num_actv_funcs)
        {
        case 0:
            return {op::sigmoid{}, op::tanh{}, op::tanh{}, op::sigmoid{}, op::tanh{}, op::tanh{}};
        case 1:
            return {actv_funcs.at(0),
                    actv_funcs.at(0),
                    actv_funcs.at(0),
                    actv_funcs.at(0),
                    actv_funcs.at(0),
                    actv_funcs.at(0)};
        case 2:
            return {actv_funcs.at(0),
                    actv_funcs.at(1),
                    actv_funcs.at(1),
                    actv_funcs.at(0),
                    actv_funcs.at(1),
                    actv_funcs.at(1)};
        case 3:
            return {actv_funcs.at(0),
                    actv_funcs.at(1),
                    actv_funcs.at(2),
                    actv_funcs.at(0),
                    actv_funcs.at(1),
                    actv_funcs.at(2)};
        case 4:
            return {actv_funcs.at(0),
                    actv_funcs.at(1),
                    actv_funcs.at(2),
                    actv_funcs.at(3),
                    actv_funcs.at(3),
                    actv_funcs.at(3)};
        case 5:
            return {actv_funcs.at(0),
                    actv_funcs.at(1),
                    actv_funcs.at(2),
                    actv_funcs.at(3),
                    actv_funcs.at(4),
                    actv_funcs.at(4)};
        default: return actv_funcs;
        }
    }
    else
    {
        switch(num_actv_funcs)
        {
        case 0: return {op::sigmoid{}, op::tanh{}, op::tanh{}};
        case 1: return {actv_funcs.at(0), actv_funcs.at(0), actv_funcs.at(0)};
        case 2: return {actv_funcs.at(0), actv_funcs.at(1), actv_funcs.at(1)};
        default: return actv_funcs;
        }
    }
}

namespace op {
std::ostream& operator<<(std::ostream& os, rnn_direction v)
{
    std::vector<std::string> rnn_direction_str = {"forward", "reverse", "bidirectional"};
    os << rnn_direction_str[static_cast<std::underlying_type<rnn_direction>::type>(v)];
    return os;
}
} // namespace op

} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
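For reference, the gate equations cited in the lstm_cell comments above (ONNX LSTM semantics), gathered in one place; f, g, h are actv_func1 through actv_func3 and \odot denotes element-wise multiplication:

\begin{aligned}
i_t &= f(X_t W_i^T + H_{t-1} R_i^T + P_i \odot C_{t-1} + W_{bi} + R_{bi})\\
f_t &= f(X_t W_f^T + H_{t-1} R_f^T + P_f \odot C_{t-1} + W_{bf} + R_{bf})\\
c_t &= g(X_t W_c^T + H_{t-1} R_c^T + W_{bc} + R_{bc})\\
C_t &= f_t \odot C_{t-1} + i_t \odot c_t\\
o_t &= f(X_t W_o^T + H_{t-1} R_o^T + P_o \odot C_t + W_{bo} + R_{bo})\\
H_t &= o_t \odot h(C_t)
\end{aligned}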
src/schedule.cpp
new file mode 100644
View file @ eb0d8fee

#include <migraphx/schedule.hpp>
#include <migraphx/program.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/op/identity.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/functional.hpp>
#include <migraphx/ranges.hpp>
#include <unordered_map>
#include <unordered_set>
#include <set>
#include <deque>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {

auto get_inputs()
{
    return [](auto i) { return i->inputs(); };
}

auto get_outputs()
{
    return [](auto i) { return i->outputs(); };
}

struct stream_info
{
    std::unordered_map<instruction_ref, std::size_t> ins2stream;
    std::unordered_map<instruction_ref, std::size_t> weights;
    std::unordered_map<instruction_ref, std::size_t> iweights;

    void accumulate_weights(instruction_ref last, const schedule_model& model)
    {
        fix<std::size_t>([&](auto self, auto ins) -> std::size_t {
            if(not contains(weights, ins))
            {
                std::size_t weight = 0;
                auto&& op          = ins->get_operator();
                if(not is_context_free(op) and op.name()[0] != '@')
                    weight = model.weight(op);
                iweights[ins] = weight;
                weights[ins] =
                    std::accumulate(ins->inputs().begin(),
                                    ins->inputs().end(),
                                    weight,
                                    [&](std::size_t w, instruction_ref i) { return w + self(i); });
            }
            return weights[ins];
        })(last);
    }

    std::vector<instruction_ref>::iterator sort_args(std::vector<instruction_ref>& args)
    {
        if(args.size() < 2)
        {
            return args.end();
        }

        const std::size_t min_partition_threshold = 2;
        auto compare = by(std::greater<>{}, [&](auto x) {
            return std::make_tuple(this->weights[x], x->inputs().size());
        });
        std::sort(args.begin(), args.end(), compare);

        auto it = std::lower_bound(std::next(args.begin()),
                                   args.end(),
                                   min_partition_threshold,
                                   [&](auto i, std::size_t w) { return this->weights[i] > w; });
        assert(it == args.end() or this->weights[*it] <= min_partition_threshold);
        assert(it == args.end() or std::prev(it) == args.begin() or
               this->weights[*std::prev(it)] > min_partition_threshold);
        return it;
    }

    struct partition
    {
        std::size_t weight = 0;
        std::vector<instruction_ref> instructions{};

        void add(instruction_ref ins, std::size_t w)
        {
            weight += w;
            instructions.push_back(ins);
        }
    };

    void assign_streams(program& p, std::size_t n)
    {
        partition critical;
        std::unordered_map<instruction_ref, std::deque<partition>> partitions;
        partitions.reserve(weights.size());
        fix([&](auto self, auto ins, auto& part) {
            assert(ins != p.end());
            if(contains(partitions, ins))
                return;
            assert(p.has_instruction(ins));
            // Add an entry so we know the instruction was visited
            partitions[ins];
            part.add(ins, this->iweights[ins]);

            auto args         = ins->inputs();
            auto threshold_it = this->sort_args(args);

            if(not args.empty())
            {
                assert(threshold_it != args.begin());
                self(args.front(), part);
                for(auto i : range(std::next(args.begin()), threshold_it))
                {
                    partitions[ins].emplace_back();
                    self(i, partitions[ins].back());
                }
                for(auto i : range(threshold_it, args.end()))
                {
                    self(i, part);
                }
            }
            // Sort instructions
            p.move_instruction(ins, p.end());
        })(std::prev(p.end()), critical);

        // Set the critical partition to stream 0
        set_stream(critical, 0);

        std::vector<std::size_t> streams(n - 1);
        // Assign streams for the other partitions
        for(auto&& ins_part : partitions)
        {
            std::sort(ins_part.second.begin(),
                      ins_part.second.end(),
                      by(std::greater<>{}, [](auto&& x) {
                          return std::make_tuple(x.weight, x.instructions.size());
                      }));
            for(auto&& part : ins_part.second)
            {
                auto stream = std::min_element(streams.begin(), streams.end()) - streams.begin();
                set_stream(part, stream + 1);
                streams[stream] += part.weight;
            }
        }
    }

    void set_stream(const partition& p, std::size_t n)
    {
        for(auto ins : p.instructions)
            if(iweights[ins] > 0)
                set_stream(ins, n);
    }

    void set_stream(instruction_ref ins, std::size_t n)
    {
        assert(iweights[ins] > 0);
        ins2stream[ins] = n;
    }

    std::size_t get_stream(instruction_ref ins) const { return ins2stream.at(ins); }

    bool has_stream(instruction_ref ins) const { return contains(ins2stream, ins); }

    template <class F>
    bool different(F f, std::size_t stream) const
    {
        bool result = false;
        f([&](auto s) {
            if(s != stream)
            {
                result = true;
                return false;
            }
            // cppcheck-suppress uselessAssignmentArg
            stream = s;
            return true;
        });
        return result;
    }

    template <class F>
    bool different(F f) const
    {
        bool result = false;
        f([&](auto s) {
            result = this->different(f, s);
            return false;
        });
        return result;
    }

    template <class Selector>
    auto get_streams_from(instruction_ref start, Selector select) const
    {
        return [=](auto f) {
            return fix<bool>([&](auto self, auto ins) {
                for(auto i : select(ins))
                {
                    if(iweights.at(i) == 0)
                    {
                        if(not self(i))
                            return false;
                    }
                    else
                    {
                        if(not f(this->get_stream(i)))
                            return false;
                    }
                }
                return true;
            })(start);
        };
    }

    std::unordered_set<std::size_t> get_streams(instruction_ref ins) const
    {
        if(has_stream(ins))
            return {get_stream(ins)};
        std::unordered_set<std::size_t> result;
        get_streams_from(ins, get_inputs())([&](auto s) {
            result.insert(s);
            return true;
        });
        return result;
    }

    template <class... Ts>
    bool is_merge_point(instruction_ref ins, Ts... xs) const
    {
        return different(get_streams_from(ins, get_inputs()), xs...);
    }

    template <class... Ts>
    bool is_split_point(instruction_ref ins, Ts... xs) const
    {
        return different(get_streams_from(ins, get_outputs()), xs...);
    }

    std::vector<instruction_ref> get_recorded_instructions(instruction_ref start)
    {
        std::vector<instruction_ref> result;
        std::unordered_map<std::size_t, instruction_ref> m;
        fix([&](auto self, auto ins) {
            for(auto i : ins->inputs())
            {
                if(iweights.at(i) == 0)
                {
                    self(i);
                    continue;
                }
                auto stream = this->get_stream(i);
                if(not contains(m, stream))
                    m[stream] = i;
                else
                    m[stream] = std::min(m[stream], i, by(std::less<>{}, [&](auto x) {
                                             return std::distance(x, start);
                                         }));
            }
        })(start);
        std::transform(
            m.begin(), m.end(), std::back_inserter(result), [](auto&& p) { return p.second; });
        return result;
    }

    std::unordered_map<instruction_ref, std::vector<std::vector<instruction_ref>>>
    find_concurrent_instructions(program& p)
    {
        std::unordered_map<instruction_ref, std::vector<std::vector<instruction_ref>>> result;
        std::unordered_map<instruction_ref, std::unordered_set<instruction_ref>> merge_from;
        result.reserve(p.size());
        merge_from.reserve(p.size());
        for(auto ins : reverse_iterator_for(p))
        {
            for(auto&& arg : ins->outputs())
            {
                if(is_merge_point(arg))
                    merge_from[ins].insert(arg);
                merge_from[ins].insert(merge_from[arg].begin(), merge_from[arg].end());
            }

            auto streams = this->get_streams(ins);
            // Collect concur instructions for each merge point.
            for(auto& merge : merge_from[ins])
            {
                for(auto stream : streams)
                {
                    if(result[merge].size() <= stream)
                        result[merge].resize(stream + 1);
                    auto&& r = result[merge][stream];
                    r.push_back(ins);
                    // Copy inputs if they don't have a stream (and are not a builtin and context
                    // free). Inputs without a stream can have an implicit dependency
                    std::copy_if(ins->inputs().begin(),
                                 ins->inputs().end(),
                                 std::back_inserter(r),
                                 [&](auto x) {
                                     return not this->has_stream(x) and
                                            not is_context_free(x->get_operator()) and
                                            x->name().front() != '@';
                                 });
                }
            }
        }
        return result;
    }

    std::unordered_map<instruction_ref, std::unordered_set<instruction_ref>>
    get_conflicts(program& p)
    {
        std::unordered_map<instruction_ref, std::unordered_set<instruction_ref>> conflict_table;
        auto concur_ins = this->find_concurrent_instructions(p);
        for(auto&& merge : concur_ins)
        {
            dfor(merge.second.size(), merge.second.size())([&](auto i, auto j) {
                if(i == j)
                    return;
                for(auto ins1 : merge.second[i])
                {
                    auto p1 = std::distance(ins1, merge.first);
                    for(auto ins2 : merge.second[j])
                    {
                        if(ins1 == ins2)
                            continue;
                        auto p2 = std::distance(ins2, merge.first);
                        // The smaller distance means the instruction occurs later
                        if(p1 > p2)
                            conflict_table[ins2].insert(ins1);
                        else
                            conflict_table[ins1].insert(ins2);
                    }
                }
            });
        }
        // Remove duplicates
        for(auto&& ip : conflict_table)
        {
            auto ins1 = ip.first;
            for(auto ins2 : ip.second)
                if(contains(conflict_table[ins2], ins1))
                    conflict_table[ins2].erase(ins1);
        }
        return conflict_table;
    }
};

void schedule::apply(program& p) const
{
    if(not enable)
        return;
    stream_info si;
    auto last = std::prev(p.end());
    si.accumulate_weights(last, model);
    si.assign_streams(p, model.concurrency());

    if(enabled(MIGRAPHX_TRACE_COMPILE{}))
    {
        p.annotate(std::cout, [&](auto ins) {
            std::cout << ":";
            std::cout << " weight=" << si.weights.at(ins);
            std::cout << " input={";
            si.get_streams_from(ins, get_inputs())([&](auto s) {
                std::cout << s << ",";
                return true;
            });
            std::cout << "}";
            if(si.has_stream(ins))
                std::cout << " stream=" << si.get_stream(ins);
        });
        std::cout << std::endl;
    }

    // Schedule instructions
    std::size_t wait_id = 0;
    std::unordered_map<instruction_ref, std::size_t> ins2wait;
    std::unordered_map<std::size_t, std::unordered_set<std::size_t>> waited_for;
    std::unordered_map<instruction_ref, std::unordered_set<std::size_t>> ins2waited;
    ins2wait.reserve(p.size());
    ins2waited.reserve(p.size());
    for(auto ins : iterator_for(p))
    {
        // Only schedule instructions that have a stream
        if(not si.has_stream(ins))
            continue;
        assert(si.weights[ins] > 0);
        // Schedule instruction on the stream
        auto stream = si.get_stream(ins);
        assert(stream < model.concurrency());
        model.sched(p, ins, stream);
        // Insert wait instructions
        if(si.is_merge_point(ins, stream))
        {
            for(auto i : si.get_recorded_instructions(ins))
            {
                if(not si.has_stream(i))
                    continue;
                auto istream = si.get_stream(i);
                if(stream == istream)
                    continue;
                // Create a new event if it hasn't been recorded
                if(not contains(ins2wait, i))
                {
                    ins2wait[i] = wait_id;
                    model.record(p, i, wait_id);
                    wait_id++;
                }
                auto w = ins2wait.at(i);
                // If we already waited for the event on this stream then don't
                // insert another wait event
                if(not contains(waited_for[stream], w))
                    model.wait(p, ins, w);
                // Store the event as waited
                waited_for[stream].insert(w);
                // Store all wait events that have been waited on prior to the recorded instruction
                waited_for[stream].insert(ins2waited[i].begin(), ins2waited[i].end());
            }
        }
        // Store wait events that have already been waited on
        if(si.is_split_point(ins, stream))
        {
            ins2waited[ins] = waited_for[stream];
        }
    }

    // Add memory conflicts
    auto conflict_table = si.get_conflicts(p);
    for(auto&& ip : conflict_table)
    {
        if(ip.second.empty())
            continue;
        std::vector<instruction_ref> args;
        args.push_back(ip.first);
        args.insert(args.end(), ip.second.begin(), ip.second.end());
        p.insert_instruction(std::next(ip.first), op::identity{}, args);
    }
}

} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
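A minimal standalone sketch of the load-balancing rule assign_streams applies above: stream 0 is reserved for the critical partition, and each remaining partition is placed on whichever other stream currently carries the least accumulated weight. The helper name pick_stream is introduced here for illustration only:

#include <algorithm>
#include <cstddef>
#include <vector>

// load[k] holds the total weight already assigned to stream k + 1
// (stream 0 is the critical path and is not tracked in this vector).
std::size_t pick_stream(std::vector<std::size_t>& load, std::size_t part_weight)
{
    auto it = std::min_element(load.begin(), load.end());
    *it += part_weight;             // account for the newly placed partition
    return (it - load.begin()) + 1; // offset past the critical stream 0
}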
src/shape.cpp
View file @ eb0d8fee

@@ -19,7 +19,7 @@ struct shape_impl
     shape_impl() : m_type(shape::float_type), m_standard(false) {}
-    shape_impl(shape::type_t t) : m_type(t), m_lens({1}), m_strides({1}), m_standard(true) {}
+    shape_impl(shape::type_t t) : m_type(t), m_lens({1}), m_strides({0}), m_standard(true) {}
     shape_impl(shape::type_t t, std::vector<std::size_t> l)
         : m_type(t), m_lens(std::move(l)), m_standard(true)
     {
src/simplify_algebra.cpp
View file @ eb0d8fee

 #include <migraphx/simplify_algebra.hpp>
 #include <migraphx/program.hpp>
-#include <migraphx/operators.hpp>
+#include <migraphx/op/add.hpp>
 #include <migraphx/matcher.hpp>
 #include <migraphx/literal.hpp>
src/simplify_reshapes.cpp
View file @ eb0d8fee

 #include <migraphx/simplify_reshapes.hpp>
 #include <migraphx/program.hpp>
 #include <migraphx/instruction.hpp>
-#include <migraphx/operators.hpp>
+#include <migraphx/op/as_shape.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/ranges.hpp>
 #include <unordered_set>

@@ -14,7 +14,9 @@ bool is_reshaper(instruction_ref ins)
     // clang-format off
     static const std::unordered_set<std::string> names = {
         "reshape",
-        "contiguous"
+        "contiguous",
+        "squeeze",
+        "unsqueeze"
     };
     // clang-format on
     return contains(names, ins->name());

@@ -45,6 +47,9 @@ void simplify_reshapes::apply(program& p) const
     auto end = std::prev(p.end());
     for(auto ins : iterator_for(p))
     {
         if(ins == end and ins->name() == "contiguous")
             continue;
+        // Skip possible dead instructions
+        if(ins->outputs().empty() and ins != end)
+            continue;
         if(is_reshaper(ins))

@@ -94,13 +99,6 @@ void simplify_reshapes::apply(program& p) const
             p.replace_instruction(ins, t->inputs().front());
         }
     }
-
-    // Replace all reshapes with as_shape
-    for(auto ins : iterator_for(p))
-    {
-        if(ins->name() != "reshape")
-            continue;
-        p.replace_instruction(ins, op::as_shape{ins->get_shape()}, ins->inputs());
-    }
 }

 } // namespace MIGRAPHX_INLINE_NS
src/targets/cpu/gemm.cpp
View file @ eb0d8fee

 #include <migraphx/cpu/gemm.hpp>
 #include <migraphx/dfor.hpp>
 #include <migraphx/requires.hpp>
+#include <migraphx/shape_for_each.hpp>
 #include <blaze/math/CustomMatrix.h>

 namespace migraphx {

@@ -14,10 +15,13 @@ template <class T>
 static auto make_mat(tensor_view<T> x)
 {
     const auto& s = x.get_shape();
-    assert(s.lens().size() == 2);
+    // assert(s.lens().size() == 2);
+    std::size_t n_dims = s.lens().size();
+    std::size_t dim_0  = n_dims - 2;
+    std::size_t dim_1  = n_dims - 1;
     if(s.transposed())
-        return matrix<T>{x.data(), s.lens()[1], s.lens()[0], s.strides()[1]};
-    return matrix<T>{x.data(), s.lens()[0], s.lens()[1], s.strides()[0]};
+        return matrix<T>{x.data(), s.lens()[dim_1], s.lens()[dim_0], s.strides()[dim_1]};
+    return matrix<T>{x.data(), s.lens()[dim_0], s.lens()[dim_1], s.strides()[dim_0]};
 }

 template <class T, class F>

@@ -51,7 +55,13 @@ void migemm_impl(tensor_view<T> cmat,
     visit_mat(amat, [&](const auto& a) {
         visit_mat(bmat, [&](const auto& b) {
             auto c = make_mat(cmat);
-            c      = (a * b) * alpha + beta * c;
+            c      = beta * c;
+            // This is a simple optimization to avoid
+            // computing A * B if alpha is 0.0
+            if(alpha != 0.0)
+            {
+                c = c + alpha * a * b;
+            }
         });
     });
 }

@@ -64,18 +74,24 @@ void migemm_impl(tensor_view<T> cmat,
                  float beta,
                  std::false_type)
 {
-    auto m = cmat.get_shape().lens()[0];
-    auto n = cmat.get_shape().lens()[1];
-    auto k = amat.get_shape().lens()[1];
+    std::size_t n_dims = cmat.get_shape().lens().size();
+    std::size_t dim_0  = n_dims - 2;
+    std::size_t dim_1  = n_dims - 1;
+    auto k             = amat.get_shape().lens()[dim_1];

-    assert(amat.get_shape().lens()[1] == bmat.get_shape().lens()[0]);
-    assert(m == amat.get_shape().lens()[0]);
-    assert(n == bmat.get_shape().lens()[1]);
+    assert(amat.get_shape().lens()[dim_1] == bmat.get_shape().lens()[dim_0]);
+    assert(cmat.get_shape().lens()[dim_0] == amat.get_shape().lens()[dim_0]);
+    assert(cmat.get_shape().lens()[dim_1] == bmat.get_shape().lens()[dim_1]);

-    dfor(m, n)([&](auto ii, auto jj) {
-        double s = cmat(ii, jj) * beta;
-        dfor(k)([&](auto kk) { s += amat(ii, kk) * bmat(kk, jj); });
-        cmat(ii, jj) = alpha * s;
-    });
+    shape_for_each(cmat.get_shape(), [&](const auto& c_idx) {
+        auto a_idx = c_idx;
+        auto b_idx = c_idx;
+        double s   = 0.0;
+        dfor(k)([&](auto kk) {
+            a_idx[dim_1] = b_idx[dim_0] = kk;
+            s += amat(a_idx.begin(), a_idx.end()) * bmat(b_idx.begin(), b_idx.end());
+        });
+        cmat(c_idx.begin(), c_idx.end()) = alpha * s + cmat(c_idx.begin(), c_idx.end()) * beta;
+    });
 }

@@ -83,7 +99,18 @@ template <class T>
 void migemm_impl(
     tensor_view<T> cmat, tensor_view<T> amat, tensor_view<T> bmat, float alpha, float beta)
 {
-    migemm_impl(cmat, amat, bmat, alpha, beta, is_fast_gemm_type<T>{});
+    auto lens      = amat.get_shape().lens();
+    bool batch_mul = std::accumulate(lens.rbegin() + 2,
+                                     lens.rend(),
+                                     std::size_t{1},
+                                     std::multiplies<std::size_t>()) == 1;
+    if(batch_mul)
+    {
+        migemm_impl(cmat, amat, bmat, alpha, beta, is_fast_gemm_type<T>{});
+    }
+    else
+    {
+        migemm_impl(cmat, amat, bmat, alpha, beta, std::false_type{});
+    }
 }

 void migemm(
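Both CPU paths above compute the same update, written out below; the blaze path now skips the product entirely when \alpha = 0, and the new shape_for_each path applies the same update element-wise so that batched (rank > 2) views fall through to it:

C \leftarrow \alpha\,(A B) + \beta\,C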
src/targets/cpu/include/migraphx/cpu/target.hpp
View file @ eb0d8fee

@@ -7,6 +7,7 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

+struct pass;
 namespace cpu {

 struct target
src/targets/cpu/lowering.cpp
View file @
eb0d8fee
...
...
@@ -48,6 +48,12 @@ struct cpu_batch_norm_inference
{
op
::
batch_norm_inference
op
;
template
<
class
Self
,
class
F
>
static
auto
reflect
(
Self
&
self
,
F
f
)
{
return
migraphx
::
reflect
(
self
.
op
,
f
);
}
std
::
string
name
()
const
{
return
"cpu::batch_norm_inference"
;
}
shape
compute_shape
(
const
std
::
vector
<
shape
>&
inputs
)
const
{
return
op
.
compute_shape
(
inputs
);
}
...
...
@@ -75,10 +81,10 @@ struct cpu_batch_norm_inference
par_dfor
(
num_batch
,
num_channels
,
image_height
,
image_width
)(
[
&
](
std
::
size_t
n
,
std
::
size_t
c
,
std
::
size_t
h
,
std
::
size_t
w
)
{
assert
((
variance
(
c
)
+
epsilon
)
>
0
);
result
(
n
,
c
,
h
,
w
)
=
gamma
(
c
)
*
(
buffer
(
n
,
c
,
h
,
w
)
-
mean
(
c
)
)
/
std
::
sqrt
(
variance
(
c
)
+
epsilon
)
+
bias
(
c
)
;
assert
((
variance
[
c
]
+
epsilon
)
>
0
);
result
(
n
,
c
,
h
,
w
)
=
gamma
[
c
]
*
(
buffer
(
n
,
c
,
h
,
w
)
-
mean
[
c
]
)
/
std
::
sqrt
(
variance
[
c
]
+
epsilon
)
+
bias
[
c
]
;
});
});
}
...
...
@@ -107,6 +113,12 @@ struct cpu_lrn
{
op
::
lrn
op
;
template
<
class
Self
,
class
F
>
static
auto
reflect
(
Self
&
self
,
F
f
)
{
return
migraphx
::
reflect
(
self
.
op
,
f
);
}
std
::
string
name
()
const
{
return
"cpu::lrn"
;
}
shape
compute_shape
(
const
std
::
vector
<
shape
>&
inputs
)
const
{
return
op
.
compute_shape
(
inputs
);
}
argument
compute
(
context
&
,
shape
output_shape
,
std
::
vector
<
argument
>
args
)
const
...
...
@@ -117,7 +129,7 @@ struct cpu_lrn
int
channels
=
output_shape
.
lens
()[
1
];
int
height
=
output_shape
.
lens
()[
2
];
int
width
=
output_shape
.
lens
()[
3
];
float
alphaoverarea
=
op
.
alpha
/
op
.
size
;
float
alphaoverarea
=
op
.
alpha
/
float
(
op
.
size
)
;
int
radius
=
(
op
.
size
-
1
)
/
2
;
par_dfor
(
n_batch
,
height
,
width
)([
&
](
int
b
,
int
h
,
int
w
)
{
...
...
@@ -144,6 +156,12 @@ struct cpu_convolution
{
op
::
convolution
op
;
template
<
class
Self
,
class
F
>
static
auto
reflect
(
Self
&
self
,
F
f
)
{
return
migraphx
::
reflect
(
self
.
op
,
f
);
}
std
::
string
name
()
const
{
return
"cpu::convolution"
;
}
shape
compute_shape
(
const
std
::
vector
<
shape
>&
inputs
)
const
{
return
op
.
compute_shape
(
inputs
);
}
argument
compute
(
context
&
,
shape
output_shape
,
std
::
vector
<
argument
>
args
)
const
...
...
@@ -165,15 +183,15 @@ struct cpu_convolution
output_shape
.
lens
()[
2
],
output_shape
.
lens
()[
3
])(
[
&
](
std
::
size_t
o
,
std
::
size_t
w
,
std
::
size_t
i
,
std
::
size_t
j
)
{
const
int
start_x
=
i
*
op
.
stride
[
0
]
-
op
.
padding
[
0
];
const
int
start_y
=
j
*
op
.
stride
[
1
]
-
op
.
padding
[
1
];
const
int
group_id
=
w
/
(
wei_n
/
op
.
group
);
const
auto
start_x
=
i
*
op
.
stride
[
0
]
-
op
.
padding
[
0
];
const
auto
start_y
=
j
*
op
.
stride
[
1
]
-
op
.
padding
[
1
];
const
auto
group_id
=
w
/
(
wei_n
/
op
.
group
);
double
acc
=
0
;
dfor
(
wei_c
,
wei_h
,
wei_w
)([
&
](
std
::
size_t
k
,
std
::
size_t
x
,
std
::
size_t
y
)
{
const
int
in_x
=
start_x
+
x
;
const
int
in_y
=
start_y
+
y
;
const
int
in_ch
=
group_id
*
wei_c
+
k
;
const
auto
in_x
=
start_x
+
x
;
const
auto
in_y
=
start_y
+
y
;
const
auto
in_ch
=
group_id
*
wei_c
+
k
;
if
(
in_x
>=
0
&&
in_x
<
in_h
&&
in_y
>=
0
&&
in_y
<
in_w
)
{
acc
+=
input
(
o
,
in_ch
,
in_x
,
in_y
)
*
weights
(
w
,
k
,
x
,
y
);
...
...
@@ -190,6 +208,12 @@ struct cpu_im2col
{
op
::
im2col
op
;
template
<
class
Self
,
class
F
>
static
auto
reflect
(
Self
&
self
,
F
f
)
{
return
migraphx
::
reflect
(
self
.
op
,
f
);
}
static
std
::
string
name
()
{
return
"cpu::im2col"
;
}
shape
compute_shape
(
const
std
::
vector
<
shape
>&
inputs
)
const
{
return
op
.
compute_shape
(
inputs
);
}
...
...
@@ -209,10 +233,8 @@ struct cpu_im2col
const
std
::
size_t
&
stride_h
=
op
.
stride
[
0
];
const
std
::
size_t
&
stride_w
=
op
.
stride
[
1
];
int
kdiv2_h
;
int
kdiv2_w
;
kdiv2_h
=
kernel_h
/
2
;
kdiv2_w
=
kernel_w
/
2
;
auto
kdiv2_h
=
kernel_h
/
2
;
auto
kdiv2_w
=
kernel_w
/
2
;
// calculate output sizes
const
std
::
size_t
col_height
=
(
height
-
kernel_h
+
2
*
pad_h
)
/
stride_h
+
1
;
const
std
::
size_t
col_width
=
(
width
-
kernel_w
+
2
*
pad_w
)
/
stride_w
+
1
;
...
...
@@ -230,8 +252,8 @@ struct cpu_im2col
dfor
(
channels
,
kernel_h
,
kernel_w
)([
&
](
std
::
size_t
c
,
std
::
size_t
koffset
,
std
::
size_t
loffset
)
{
int
idx
=
iinput
+
koffset
-
kdiv2_h
;
int
jdx
=
jinput
+
loffset
-
kdiv2_w
;
auto
idx
=
iinput
+
koffset
-
kdiv2_h
;
auto
jdx
=
jinput
+
loffset
-
kdiv2_w
;
col
(
ldx
,
p
)
=
((
idx
>=
0
)
&&
(
idx
<
height
)
&&
(
jdx
>=
0
)
&&
(
jdx
<
width
))
?
input
(
0
,
c
,
idx
,
jdx
)
:
0
;
...
...
@@ -273,6 +295,12 @@ struct cpu_pooling
{
op
::
pooling
op
;
template
<
class
Self
,
class
F
>
static
auto
reflect
(
Self
&
self
,
F
f
)
{
return
migraphx
::
reflect
(
self
.
op
,
f
);
}
std
::
string
name
()
const
{
return
"cpu::pooling_"
+
Op
::
name
();
}
shape
compute_shape
(
const
std
::
vector
<
shape
>&
inputs
)
const
{
return
op
.
compute_shape
(
inputs
);
}
argument
compute
(
context
&
,
const
shape
&
output_shape
,
std
::
vector
<
argument
>
args
)
const
...
...
@@ -317,20 +345,35 @@ struct cpu_pooling
}
};
struct
cpu_
contiguous
struct
cpu_
op
{
op
::
contiguous
op
;
std
::
string
name
()
const
{
return
"cpu::
contiguous"
;
}
op
eration
op
;
std
::
string
name
()
const
{
return
"cpu::
"
+
op
.
name
()
;
}
shape
compute_shape
(
const
std
::
vector
<
shape
>&
inputs
)
const
{
return
op
.
compute_shape
(
inputs
);
}
argument
compute
(
context
&
,
const
shape
&
output_shape
,
std
::
vector
<
argument
>
args
)
const
argument
compute
(
context
&
,
const
shape
&
output_shape
,
const
std
::
vector
<
argument
>&
args
)
const
{
return
op
.
compute
(
output_shape
,
args
);
}
friend
bool
operator
==
(
const
cpu_op
&
x
,
const
cpu_op
&
y
)
{
return
x
.
op
==
y
.
op
;
}
friend
bool
operator
==
(
const
cpu_op
&
x
,
const
operation
&
y
)
{
return
op
.
compute
(
output_shape
,
std
::
move
(
args
));
if
(
x
.
name
()
!=
y
.
name
())
return
false
;
return
x
==
any_cast
<
cpu_op
>
(
y
);
}
friend
bool
operator
==
(
const
operation
&
x
,
const
cpu_op
&
y
)
{
return
y
==
x
;
}
};
struct
cpu_pad
{
op
::
pad
op
;
template
<
class
Self
,
class
F
>
static
auto
reflect
(
Self
&
self
,
F
f
)
{
return
migraphx
::
reflect
(
self
.
op
,
f
);
}
std
::
string
name
()
const
{
return
"cpu::contiguous"
;
}
shape
compute_shape
(
const
std
::
vector
<
shape
>&
inputs
)
const
{
return
op
.
compute_shape
(
inputs
);
}
argument
compute
(
context
&
,
const
shape
&
output_shape
,
std
::
vector
<
argument
>
args
)
const
...
...
@@ -354,184 +397,54 @@ struct cpu_pad
 }
 };

-struct cpu_concat
-{
-    op::concat op;
-    std::string name() const { return "cpu::concat"; }
-    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
-    {
-        return op.compute(output_shape, std::move(args));
-    }
-};
-
 struct cpu_gemm
 {
     op::dot op;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
     std::string name() const { return "cpu::dot"; }
-    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
+    shape compute_shape(const std::vector<shape>& inputs) const
+    {
+        if(inputs.size() == 3)
+        {
+            auto c_shape = inputs.at(2);
+            check_shapes{{c_shape}}.not_broadcasted();
+        }
+        return op.compute_shape(inputs);
+    }
     argument compute(context&, const shape& output_shape, std::vector<argument> args) const
     {
         argument result{output_shape};
-        migemm(result, args[0], args[1], op.alpha, op.beta);
+        // 3 inputs, it is alpha * A * B + beta * C, then
+        // A and B are matrics, and C is broadcastable to A * B
+        if(args.size() == 3)
+        {
+            // no need to consider the value of args[2]
+            if(op.beta == 0.0f)
+            {
+                result.visit([&](auto output) { std::fill(output.begin(), output.end(), 0); });
+            }
+            else
+            {
+                visit_all(result, args[2])([&](auto output, auto input) {
+                    std::copy(input.begin(), input.end(), output.begin());
+                });
+            }
+            migemm(result, args[0], args[1], op.alpha, op.beta);
+            return result;
+        }
+        // 2 input arguments
+        migemm(result, args[0], args[1], op.alpha, 0.0f);
         return result;
     }
 };

-struct cpu_gather
-{
-    op::gather op;
-    std::string name() const { return "cpu::gather"; }
-    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
-    {
-        return op.compute(output_shape, std::move(args));
-    }
-};
-
-struct identity_op
-{
-    std::string name() const { return "cpu::identity"; }
-    auto fcn() const
-    {
-        return [](auto x) { return x; };
-    }
-};
-
-struct abs_op
-{
-    std::string name() const { return "cpu::abs"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::abs(make_signed(x)); };
-    }
-};
-
-struct exp_op
-{
-    std::string name() const { return "cpu::exp"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::exp(x); };
-    }
-};
-
-struct log_op
-{
-    std::string name() const { return "cpu::log"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::log(x); };
-    }
-};
-
-struct sin_op
-{
-    std::string name() const { return "cpu::sin"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::sin(x); };
-    }
-};
-
-struct cos_op
-{
-    std::string name() const { return "cpu::cos"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::cos(x); };
-    }
-};
-
-struct tan_op
-{
-    std::string name() const { return "cpu::tan"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::tan(x); };
-    }
-};
-
-struct asin_op
-{
-    std::string name() const { return "cpu::asin"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::asin(x); };
-    }
-};
-
-struct acos_op
-{
-    std::string name() const { return "cpu::acos"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::acos(x); };
-    }
-};
-
-struct atan_op
-{
-    std::string name() const { return "cpu::atan"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::atan(x); };
-    }
-};
-
-struct sinh_op
-{
-    std::string name() const { return "cpu::sinh"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::sinh(x); };
-    }
-};
-
-struct cosh_op
-{
-    std::string name() const { return "cpu::cosh"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::cosh(x); };
-    }
-};
-
-struct tanh_op
-{
-    std::string name() const { return "cpu::tanh"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::tanh(x); };
-    }
-};
-
-struct sigmoid_op
-{
-    std::string name() const { return "cpu::sigmoid"; }
-    auto fcn() const
-    {
-        return [](auto x) { return 1.f / (1.f + std::exp(-x)); };
-    }
-};
-
-struct neg_op
-{
-    std::string name() const { return "cpu::neg"; }
-    auto fcn() const
-    {
-        return [](auto x) { return -x; };
-    }
-};
-
-struct relu_op
-{
-    std::string name() const { return "cpu::relu"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::max(decltype(x){0}, x); };
-    }
-};
...
...
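The three-input branch of cpu_gemm::compute implements result = alpha * A * B + beta * C by first seeding the output with C (or zero-filling it when beta == 0) and then letting migemm accumulate in place. A worked 1x1 case with assumed values:

    // alpha = 2, beta = 3, A = [1], B = [4], C = [5]
    // seed:   result <- C                   => result = 5
    // migemm: result = 2*(1*4) + 3*result   => result = 8 + 15 = 23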
@@ -561,16 +474,45 @@ template <typename Op>
 struct cpu_unary
 {
     Op op;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op.op, f);
+    }
     std::string name() const { return op.name(); }
-    shape compute_shape(const std::vector<shape>& inputs) const { return inputs.front(); }
+    shape compute_shape(const std::vector<shape>& inputs) const
+    {
+        check_shapes{inputs}.has(1);
+        auto s = inputs.at(0);
+        if(s.packed())
+        {
+            return s;
+        }
+        else
+        {
+            return {s.type(), s.lens()};
+        }
+    }
     argument compute(context&, const shape& output_shape, std::vector<argument> args) const
     {
         argument result{output_shape};
         result.visit([&](auto output) {
             args[0].visit([&](auto input) {
-                std::transform(input.begin(), input.end(), output.begin(), op.fcn());
+                if(input.get_shape().standard())
+                {
+                    std::transform(input.begin(), input.end(), output.begin(), op.fcn());
+                }
+                else
+                {
+                    shape_for_each(output.get_shape(), [&](const auto& idx) {
+                        output(idx.begin(), idx.end()) = op.fcn()(input(idx.begin(), idx.end()));
+                    });
+                }
             });
         });
         return result;
     }
 };
...
...
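The new cpu_unary::compute_shape keeps a packed input shape unchanged but rebuilds standard strides for a non-packed one, so an elementwise op's output is always densely laid out; compute then picks std::transform for standard inputs and falls back to index-wise shape_for_each otherwise. A sketch of the two shape cases (the concrete shapes are assumptions for illustration):

    shape packed{shape::float_type, {2, 3}};          // strides {3, 1}: returned as-is
    shape strided{shape::float_type, {2, 3}, {6, 2}}; // non-packed:
    // compute_shape returns {float_type, {2, 3}}, i.e. standard strides {3, 1}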
@@ -590,20 +532,20 @@ struct softmax2d
         auto nw = input.get_shape().lens()[3];
         dfor(nb, nh, nw)([&](std::size_t b, std::size_t i, std::size_t j) {
             value_type cmax = std::numeric_limits<value_type>::lowest();
-            for(int c = 0; c < nc; c++)
+            for(std::size_t c = 0; c < nc; c++)
             {
                 cmax = std::max(cmax, input(b, c, i, j));
             }
-            for(int c = 0; c < nc; c++)
+            for(std::size_t c = 0; c < nc; c++)
             {
                 output(b, c, i, j) = std::exp(input(b, c, i, j) - cmax);
             }
             value_type sum = value_type(0);
-            for(int c = 0; c < nc; c++)
+            for(std::size_t c = 0; c < nc; c++)
             {
                 sum += output(b, c, i, j);
             }
-            for(int c = 0; c < nc; c++)
+            for(std::size_t c = 0; c < nc; c++)
             {
                 output(b, c, i, j) = output(b, c, i, j) / sum;
             }
...
...
@@ -613,83 +555,78 @@ struct softmax2d
 }
 };

-struct add_op
-{
-    std::string name() const { return "add"; }
-    auto fcn() const
-    {
-        return [](auto x, auto y) { return x + y; };
-    }
-};
-
-struct sub_op
-{
-    std::string name() const { return "sub"; }
-    auto fcn() const
-    {
-        return [](auto x, auto y) { return x - y; };
-    }
-};
-
-struct mul_op
-{
-    std::string name() const { return "mul"; }
-    auto fcn() const
-    {
-        return [](auto x, auto y) { return x * y; };
-    }
-};
-
-struct div_op
-{
-    std::string name() const { return "div"; }
-    auto fcn() const
-    {
-        return [](auto x, auto y) { return x / y; };
-    }
-};
-
-struct max_op
-{
-    std::string name() const { return "max"; }
-    auto fcn() const
-    {
-        return [](auto x, auto y) { return std::max(x, y); };
-    }
-};
-
-struct min_op
-{
-    std::string name() const { return "min"; }
-    auto fcn() const
-    {
-        return [](auto x, auto y) { return std::min(x, y); };
-    }
-};
-
-template <typename Op>
-struct cpu_binary
-{
-    Op op;
-    std::string name() const { return op.name(); }
-    shape compute_shape(const std::vector<shape>& inputs) const { return inputs.front(); }
-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
-    {
-        argument result{output_shape};
-        visit_all(result, args[0], args[1])([&](auto output, auto input1, auto input2) {
-            if(input1.get_shape().packed() and input2.get_shape().packed())
-            {
-                std::transform(
-                    input1.begin(), input1.end(), input2.begin(), output.begin(), op.fcn());
-            }
-            else
-            {
-                shape_for_each(output.get_shape(), [&](const auto& idx) {
-                    output(idx.begin(), idx.end()) =
-                        op.fcn()(input1(idx.begin(), idx.end()), input2(idx.begin(), idx.end()));
-                });
-            }
-        });
-        return result;
-    }
-};
+struct cpu_logsoftmax
+{
+    op::logsoftmax op;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
+    std::string name() const { return "cpu::logsoftmax"; }
+    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
+
+    template <typename T>
+    std::size_t compute_batch_index(const T& idx, shape& batch_shape, int axis) const
+    {
+        if(axis == 0)
+        {
+            return 0;
+        }
+        else
+        {
+            std::vector<std::size_t> batch_idx(idx.begin(), idx.begin() + axis);
+            return batch_shape.index(batch_idx.begin(), batch_idx.end());
+        }
+    }
+
+    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
+    {
+        argument result{output_shape};
+        auto lens = output_shape.lens();
+        std::vector<std::size_t> batch_lens{};
+        if(op.axis == 0)
+        {
+            batch_lens.push_back(1);
+        }
+        else
+        {
+            batch_lens.insert(batch_lens.begin(), lens.begin(), lens.begin() + op.axis);
+        }
+        shape batch_shape{migraphx::shape::uint32_type, batch_lens};
+
+        visit_all(result, args[0])([&](auto output, auto input) {
+            using value_type = typename decltype(input)::value_type;
+            std::vector<value_type> batch_max(batch_shape.elements(),
+                                              std::numeric_limits<value_type>::lowest());
+            shape_for_each(output_shape, [&](auto idx) {
+                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
+                batch_max[index] = std::max(batch_max[index], input(idx.begin(), idx.end()));
+            });
+
+            shape_for_each(output_shape, [&](auto idx) {
+                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
+                output(idx.begin(), idx.end()) = input(idx.begin(), idx.end()) - batch_max[index];
+            });
+
+            std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
+            shape_for_each(output_shape, [&](auto idx) {
+                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
+                batch_sum[index] += std::exp(output(idx.begin(), idx.end()));
+            });
+
+            for(std::size_t i = 0; i < batch_sum.size(); ++i)
+            {
+                batch_sum[i] = std::log(batch_sum[i]);
+            }
+
+            shape_for_each(output_shape, [&](auto idx) {
+                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
+                output(idx.begin(), idx.end()) -= batch_sum[index];
+            });
+        });
+
+        return result;
+    }
+};
...
...
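cpu_logsoftmax::compute is the numerically stable log-softmax: with m the maximum over each batch slice along op.axis, the four shape_for_each passes compute

    logsoftmax(x_i) = (x_i - m) - log( sum_j exp(x_j - m) ),   where m = max_j x_j

that is, find m per batch, subtract it, accumulate the exp-sum, then subtract its log; subtracting m first keeps std::exp from overflowing.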
@@ -713,42 +650,17 @@ struct cpu_apply
 void init()
 {
-    apply_map["im2col"]      = extend_op<cpu_im2col, op::im2col>();
-    apply_map["convolution"] = extend_op<cpu_convolution, op::convolution>();
-    apply_map["dot"]         = extend_op<cpu_gemm, op::dot>();
-    apply_map["batch_norm_inference"] =
-        extend_op<cpu_batch_norm_inference, op::batch_norm_inference>();
-    apply_map["lrn"]        = extend_op<cpu_lrn, op::lrn>();
-    apply_map["contiguous"] = extend_op<cpu_contiguous, op::contiguous>();
-    apply_map["pad"]        = extend_op<cpu_pad, op::pad>();
-    apply_map["concat"]     = extend_op<cpu_concat, op::concat>();
-    apply_map["gather"]     = extend_op<cpu_gather, op::gather>();
-    apply_map["leaky_relu"] = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
-    apply_map["elu"]        = extend_op<cpu_unary<elu_op>, op::elu>();
-    apply_map["identity"]   = simple_op<cpu_unary<identity_op>>();
-    apply_map["abs"]        = simple_op<cpu_unary<abs_op>>();
-    apply_map["sinh"]       = simple_op<cpu_unary<sinh_op>>();
-    apply_map["cosh"]       = simple_op<cpu_unary<cosh_op>>();
-    apply_map["tanh"]       = simple_op<cpu_unary<tanh_op>>();
-    apply_map["sigmoid"]    = simple_op<cpu_unary<sigmoid_op>>();
-    apply_map["exp"]        = simple_op<cpu_unary<exp_op>>();
-    apply_map["log"]        = simple_op<cpu_unary<log_op>>();
-    apply_map["neg"]        = simple_op<cpu_unary<neg_op>>();
-    apply_map["sin"]        = simple_op<cpu_unary<sin_op>>();
-    apply_map["cos"]        = simple_op<cpu_unary<cos_op>>();
-    apply_map["tan"]        = simple_op<cpu_unary<tan_op>>();
-    apply_map["asin"]       = simple_op<cpu_unary<asin_op>>();
-    apply_map["acos"]       = simple_op<cpu_unary<acos_op>>();
-    apply_map["atan"]       = simple_op<cpu_unary<atan_op>>();
-    apply_map["relu"]       = simple_op<cpu_unary<relu_op>>();
-    apply_map["add"]        = simple_op<cpu_binary<add_op>>();
-    apply_map["sub"]        = simple_op<cpu_binary<sub_op>>();
-    apply_map["mul"]        = simple_op<cpu_binary<mul_op>>();
-    apply_map["div"]        = simple_op<cpu_binary<div_op>>();
-    apply_map["max"]        = simple_op<cpu_binary<max_op>>();
-    apply_map["min"]        = simple_op<cpu_binary<min_op>>();
-    apply_map["softmax"]    = simple_op<softmax2d>();
+    apply_map["batch_norm_inference"] =
+        extend_op<cpu_batch_norm_inference, op::batch_norm_inference>();
+    apply_map["convolution"] = extend_op<cpu_convolution, op::convolution>();
+    apply_map["dot"]         = extend_op<cpu_gemm, op::dot>();
+    apply_map["elu"]         = extend_op<cpu_unary<elu_op>, op::elu>();
+    apply_map["im2col"]      = extend_op<cpu_im2col, op::im2col>();
+    apply_map["leaky_relu"]  = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
+    apply_map["logsoftmax"]  = extend_op<cpu_logsoftmax, op::logsoftmax>();
+    apply_map["lrn"]         = extend_op<cpu_lrn, op::lrn>();
+    apply_map["pad"]         = extend_op<cpu_pad, op::pad>();
+    apply_map["softmax"]     = simple_op<softmax2d>();
 }

 void apply()
...
...
@@ -764,9 +676,18 @@ struct cpu_apply
         {
             apply_map.at(it->name())(it);
         }
+        else if(is_context_free(it->get_operator()))
+        {
+            apply_cpu_op(it);
+        }
     }
 }

+void apply_cpu_op(instruction_ref ins)
+{
+    prog->replace_instruction(ins, cpu_op{ins->get_operator()}, ins->inputs());
+}
+
 template <class T>
 void apply_simple_op(instruction_ref ins)
 {
...
...
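With this change apply() keeps apply_map only for kernels that need a hand-written CPU implementation; every other operation whose compute is context-free is rewritten in place to the generic cpu_op wrapper. A sketch of the effect on a program (a minimal sketch; the exact add_literal call and values are assumptions for illustration):

    program p;
    auto one = p.add_literal(1.0f);
    auto two = p.add_literal(2.0f);
    p.add_instruction(op::add{}, one, two);
    p.compile(cpu::target{}); // "add" is not in apply_map, so it is lowered to cpu_op{op::add{}}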
src/targets/cpu/target.cpp
View file @
eb0d8fee
#include <migraphx/cpu/target.hpp>
#include <migraphx/cpu/lowering.hpp>
#include <migraphx/pass.hpp>
#include <migraphx/auto_contiguous.hpp>
#include <migraphx/rewrite_rnn.hpp>
#include <migraphx/dead_code_elimination.hpp>
...
...
@@ -13,8 +14,9 @@ std::string target::name() const { return "cpu"; }
 std::vector<pass> target::get_passes(migraphx::context&) const
 {
-    return {auto_contiguous{},
-            rewrite_rnn{},
+    return {rewrite_rnn{},
+            dead_code_elimination{},
+            auto_contiguous{},
             dead_code_elimination{},
             lowering{},
             dead_code_elimination{}};
...
...
src/targets/gpu/CMakeLists.txt
View file @
eb0d8fee
...
...
@@ -26,11 +26,14 @@ add_library(migraphx_device
device/atan.cpp
device/add_relu.cpp
device/contiguous.cpp
device/logsoftmax.cpp
device/convert.cpp
device/mul.cpp
device/concat.cpp
device/pad.cpp
device/gather.cpp
device/sub.cpp
device/clip.cpp
)
set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
rocm_clang_tidy_check(migraphx_device)
...
...
@@ -48,6 +51,7 @@ add_library(migraphx_gpu
pooling.cpp
convolution.cpp
softmax.cpp
logsoftmax.cpp
contiguous.cpp
concat.cpp
relu.cpp
...
...
@@ -62,6 +66,9 @@ add_library(migraphx_gpu
pad.cpp
gather.cpp
lrn.cpp
schedule_model.cpp
adjust_allocation.cpp
clip.cpp
)
set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
rocm_clang_tidy_check(migraphx_gpu)
...
...
src/targets/gpu/abs.cpp
View file @
eb0d8fee
#include <migraphx/gpu/abs.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <utility>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
...
...
@@ -10,8 +7,8 @@ namespace gpu {
 shape miopen_abs::compute_shape(const std::vector<shape>& inputs) const
 {
-    check_shapes{inputs, *this}.has(2).not_broadcasted();
-    return inputs.at(1);
+    check_shapes{inputs, *this}.has(2).packed();
+    return inputs.at(0);
 }

 argument miopen_abs::compute(context& ctx,
...
...
src/targets/gpu/adjust_allocation.cpp
0 → 100644
View file @
eb0d8fee
#include <migraphx/gpu/adjust_allocation.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/program.hpp>
#include <migraphx/iterator_for.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

void adjust_allocation::apply(program& p) const
{
    for(auto ins : iterator_for(p))
    {
        // skip instruction with no input
        if(ins->inputs().empty())
            continue;
        if(ins->name() == "load")
            continue;

        auto alias_ins = instruction::get_output_alias(ins, true);
        if(alias_ins->name() == "hip::allocate")
        {
            // shape allocated is different from actual shape
            // of the instruction, reallocate and replace the previous one
            if(alias_ins->get_shape() != ins->get_shape())
            {
                auto alloc_ins = p.insert_instruction(ins, hip_allocate{ins->get_shape()});
                p.replace_instruction(alias_ins, alloc_ins);
            }
        }
    }
}

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
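The new pass fixes a mismatch that rewrites can leave behind: an instruction whose backing hip::allocate was created for a different shape. It follows each instruction's output alias back to its allocation and, when the shapes disagree, inserts a fresh hip_allocate with the instruction's actual shape and replaces the old one. A before/after sketch (instruction labels and the op name are illustrative):

    // before: @1 = hip::allocate       -> float, {4, 4}
    //         @2 = gpu::xform(@0, @1)  -> float, {2, 8}   (shape mismatch)
    // after:  @3 = hip::allocate       -> float, {2, 8}   (inserted before @2)
    //         @2 = gpu::xform(@0, @3)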
src/targets/gpu/batchnorm.cpp
View file @
eb0d8fee
#include <migraphx/gpu/batchnorm.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <utility>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
...
...
src/targets/gpu/clip.cpp
0 → 100644
View file @
eb0d8fee
#include <migraphx/gpu/clip.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/clip.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

shape hip_clip::compute_shape(std::vector<shape> inputs) const
{
    inputs.pop_back();
    return op.compute_shape(inputs);
}

argument hip_clip::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
    device::clip(ctx.get_stream().get(), args.back(), args.front(), op.max_val, op.min_val);
    return args.back();
}

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
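hip_clip follows the GPU calling convention in which the final argument is the pre-allocated output buffer: compute_shape pops that trailing shape before asking op::clip to validate the real inputs, and compute writes through args.back() and returns it. Restated with illustrative shapes:

    // inputs = { {float, {2, 3}},     // x
    //            {float, {2, 3}} }    // pre-allocated output buffer
    // compute_shape forwards only { {float, {2, 3}} } to op.compute_shape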
src/targets/gpu/concat.cpp
View file @
eb0d8fee
#include <migraphx/gpu/concat.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/concat.hpp>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
...
...
src/targets/gpu/contiguous.cpp
View file @
eb0d8fee
#include <migraphx/gpu/contiguous.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <utility>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/contiguous.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
...
...
src/targets/gpu/convolution.cpp
View file @
eb0d8fee
#include <migraphx/gpu/convolution.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <utility>
#include <migraphx/gpu/context.hpp>
#include <migraphx/generate.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
...
...
@@ -23,19 +21,21 @@ argument miopen_convolution::compute(context& ctx,
     float alpha = 1;
     float beta  = 0;
-    miopenConvolutionForward(ctx.get_stream().get_miopen(),
-                             &alpha,
-                             x_desc.get(),
-                             args[0].implicit(),
-                             w_desc.get(),
-                             args[1].implicit(),
-                             cd.get(),
-                             algo,
-                             &beta,
-                             y_desc.get(),
-                             args[3].implicit(),
-                             args[2].implicit(),
-                             args[2].get_shape().bytes());
+    auto status = miopenConvolutionForward(ctx.get_stream().get_miopen(),
+                                           &alpha,
+                                           x_desc.get(),
+                                           args[0].implicit(),
+                                           w_desc.get(),
+                                           args[1].implicit(),
+                                           cd.get(),
+                                           algo,
+                                           &beta,
+                                           y_desc.get(),
+                                           args[3].implicit(),
+                                           args[2].implicit(),
+                                           args[2].get_shape().bytes());
+    if(status != miopenStatusSuccess)
+        MIGRAPHX_THROW("Running convolution failed");
     return args[3];
 }
...
...
@@ -91,8 +91,11 @@ void miopen_convolution::finalize(context& ctx,
 {
     if(handle == ctx.get_stream().get_miopen())
         return;
-    // TODO: Check that workspace hasn't changed
-    compile(ctx, output_shape, std::move(inputs));
+    // Check that workspace hasn't changed
+    auto size = inputs.at(2).bytes();
+    auto ws   = compile(ctx, output_shape, std::move(inputs));
+    if(ws.bytes() > size)
+        MIGRAPHX_THROW("Workspace has changed during finalization.");
 }

 } // namespace gpu
...
...
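finalize re-selects the convolution algorithm whenever the MIOpen handle changes, and the new guard makes the implicit assumption explicit: the workspace buffer was sized at compile time (inputs.at(2)), so if the re-compiled algorithm asks for a larger workspace the op throws rather than overrunning the buffer. The check, restated:

    // size = bytes of the workspace argument allocated at compile time
    // ws   = workspace shape chosen by the fresh compile(...)
    // ws.bytes() > size  ==>  MIGRAPHX_THROW("Workspace has changed during finalization.")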
src/targets/gpu/device/clip.cpp
0 → 100644
View file @
eb0d8fee
#include <migraphx/gpu/device/clip.hpp>
#include <migraphx/gpu/device/nary.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

void clip(hipStream_t stream,
          const argument& result,
          const argument& arg1,
          const float max,
          const float min)
{
    nary(stream, result, arg1)([max, min](auto x) {
        return std::min<decltype(x)>(std::max<decltype(x)>(min, x), max);
    });
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
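device::clip builds a single pointwise kernel from the nary launcher; the lambda clamps each element to [min, max], with the explicit template arguments keeping the comparisons in the tensor's element type. A scalar restatement of the lambda (illustrative, not part of the commit):

    template <class T>
    T clip_value(T x, float min_val, float max_val)
    {
        return std::min<T>(std::max<T>(min_val, x), max_val);
    }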