Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
MIGraphX
Commits
98fd5e1d
Commit
98fd5e1d
authored
Aug 27, 2019
by
Paul
Browse files
Merge branch 'develop' into eliminate-more-contiguous
parents
f7a6d87f
a1c7e7a5
Changes
76
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
522 additions
and
109 deletions
+522
-109
src/include/migraphx/verify.hpp
src/include/migraphx/verify.hpp
+1
-0
src/onnx/onnx.cpp
src/onnx/onnx.cpp
+49
-18
src/opt/memory_coloring_impl.cpp
src/opt/memory_coloring_impl.cpp
+3
-0
src/opt/memory_coloring_impl.hpp
src/opt/memory_coloring_impl.hpp
+1
-1
src/program.cpp
src/program.cpp
+1
-1
src/py/migraphx_py.cpp
src/py/migraphx_py.cpp
+1
-0
src/quantization.cpp
src/quantization.cpp
+127
-27
src/rewrite_rnn.cpp
src/rewrite_rnn.cpp
+0
-1
src/simplify_algebra.cpp
src/simplify_algebra.cpp
+7
-5
src/targets/cpu/gemm.cpp
src/targets/cpu/gemm.cpp
+26
-19
src/targets/cpu/include/migraphx/cpu/gemm.hpp
src/targets/cpu/include/migraphx/cpu/gemm.hpp
+5
-0
src/targets/cpu/lowering.cpp
src/targets/cpu/lowering.cpp
+143
-18
src/targets/gpu/CMakeLists.txt
src/targets/gpu/CMakeLists.txt
+9
-1
src/targets/gpu/convert.cpp
src/targets/gpu/convert.cpp
+24
-0
src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp
...targets/gpu/device/include/migraphx/gpu/device/reduce.hpp
+7
-18
src/targets/gpu/device/include/migraphx/gpu/device/tensor.hpp
...targets/gpu/device/include/migraphx/gpu/device/tensor.hpp
+1
-0
src/targets/gpu/device/int8_gemm_pack.cpp
src/targets/gpu/device/int8_gemm_pack.cpp
+77
-0
src/targets/gpu/device/round.cpp
src/targets/gpu/device/round.cpp
+18
-0
src/targets/gpu/device/sigmoid.cpp
src/targets/gpu/device/sigmoid.cpp
+18
-0
src/targets/gpu/gemm.cpp
src/targets/gpu/gemm.cpp
+4
-0
No files found.
src/include/migraphx/verify.hpp
View file @
98fd5e1d
...
...
@@ -168,6 +168,7 @@ bool verify_range(R1&& r1, R2&& r2, double tolerance = 80, double* out_error = n
{
double
threshold
=
std
::
numeric_limits
<
range_value
<
R1
>>::
epsilon
()
*
tolerance
;
auto
error
=
rms_range
(
r1
,
r2
);
// cppcheck-suppress uninitvar
if
(
out_error
!=
nullptr
)
*
out_error
=
error
;
return
error
<=
threshold
;
...
...
src/onnx/onnx.cpp
View file @
98fd5e1d
...
...
@@ -55,6 +55,7 @@ struct onnx_parser
add_generic_op
(
"Acos"
,
op
::
acos
{});
add_generic_op
(
"Atan"
,
op
::
atan
{});
add_generic_op
(
"Sqrt"
,
op
::
sqrt
{});
add_generic_op
(
"Round"
,
op
::
round
{});
add_generic_op
(
"Sign"
,
op
::
sign
{});
add_binary_op
(
"Add"
,
op
::
add
{});
...
...
@@ -206,6 +207,16 @@ struct onnx_parser
return
out_lens
;
}
// Returns `ins` itself when its shape is already standard; otherwise adds an
// op::contiguous instruction taking `ins` as input to `prog` and returns that
// new instruction.
instruction_ref make_contiguous(instruction_ref ins)
{
    const bool already_standard = ins->get_shape().standard();
    return already_standard ? ins : prog.add_instruction(op::contiguous{}, ins);
}
template
<
class
T
>
instruction_ref
add_broadcastable_binary_op
(
instruction_ref
arg0
,
instruction_ref
arg1
,
T
x
)
{
...
...
@@ -313,7 +324,11 @@ struct onnx_parser
{
if
(
contains
(
attributes
,
"auto_pad"
))
{
MIGRAPHX_THROW
(
"auto_pad and padding cannot be specified simultaneously"
);
auto
s
=
attributes
[
"auto_pad"
].
s
();
if
(
contains
(
attributes
,
"pads"
)
and
to_upper
(
s
)
!=
"NOTSET"
)
{
MIGRAPHX_THROW
(
"auto_pad and padding cannot be specified simultaneously"
);
}
}
std
::
vector
<
std
::
int64_t
>
padding
;
copy
(
attributes
[
"pads"
].
ints
(),
std
::
back_inserter
(
padding
));
...
...
@@ -361,7 +376,7 @@ struct onnx_parser
if
(
args
.
size
()
==
3
)
{
uint64_t
axis
=
1
;
auto
l1
=
prog
.
add_instruction
(
op
,
args
[
0
]
,
args
[
1
]);
auto
l1
=
prog
.
add_instruction
(
op
,
l0
,
args
[
1
]);
auto
l2
=
prog
.
add_instruction
(
op
::
broadcast
{
axis
,
l1
->
get_shape
().
lens
()},
args
[
2
]);
return
prog
.
add_instruction
(
op
::
add
{},
l1
,
l2
);
}
...
...
@@ -437,12 +452,7 @@ struct onnx_parser
s
.
visit
([
&
](
auto
v
)
{
copy
(
v
,
std
::
back_inserter
(
op
.
dims
));
});
}
if
(
!
args
[
0
]
->
get_shape
().
standard
())
{
args
[
0
]
=
prog
.
add_instruction
(
op
::
contiguous
{},
args
[
0
]);
}
return
prog
.
add_instruction
(
op
,
args
[
0
]);
return
prog
.
add_instruction
(
op
,
make_contiguous
(
args
[
0
]));
}
instruction_ref
...
...
@@ -490,23 +500,41 @@ struct onnx_parser
{
axis
=
parse_value
(
attributes
.
at
(
"axis"
)).
at
<
int
>
();
}
op
::
gather
op
{
axis
};
return
prog
.
add_instruction
(
op
,
std
::
move
(
args
));
return
prog
.
add_instruction
(
op
,
make_contiguous
(
args
[
0
]),
make_contiguous
(
args
[
1
]
));
}
instruction_ref
parse_slice
(
const
std
::
string
&
,
attribute_map
attributes
,
std
::
vector
<
instruction_ref
>
args
)
{
op
::
slice
op
;
std
::
vector
<
size_t
>
dims
=
args
[
0
]
->
get_shape
().
lens
();
size_t
num_dims
=
dims
.
size
();
if
(
contains
(
attributes
,
"axes"
))
{
literal
s
=
parse_value
(
attributes
.
at
(
"axes"
));
s
.
visit
([
&
](
auto
v
)
{
copy
(
v
,
std
::
back_inserter
(
op
.
axes
));
});
}
else
{
op
.
axes
=
std
::
vector
<
int64_t
>
(
num_dims
);
std
::
iota
(
op
.
axes
.
begin
(),
op
.
axes
.
end
(),
0
);
}
if
(
contains
(
attributes
,
"ends"
))
{
literal
s
=
parse_value
(
attributes
.
at
(
"ends"
));
s
.
visit
([
&
](
auto
v
)
{
copy
(
v
,
std
::
back_inserter
(
op
.
ends
));
});
for
(
size_t
i
=
0
;
i
<
num_dims
;
i
++
)
{
if
(
static_cast
<
size_t
>
(
op
.
ends
[
i
])
>
dims
[
i
])
{
op
.
ends
[
i
]
=
dims
[
i
];
}
}
}
if
(
contains
(
attributes
,
"starts"
))
{
literal
s
=
parse_value
(
attributes
.
at
(
"starts"
));
s
.
visit
([
&
](
auto
v
)
{
copy
(
v
,
std
::
back_inserter
(
op
.
starts
));
});
...
...
@@ -1011,9 +1039,10 @@ struct onnx_parser
}
std
::
vector
<
operation
>
vec_actv_funcs
(
vec_names
.
size
());
std
::
transform
(
vec_names
.
begin
(),
vec_names
.
end
(),
vec_actv_funcs
.
begin
(),
[
&
](
auto
&
fn
)
{
return
map_actv_funcs
[
fn
];
});
std
::
transform
(
vec_names
.
begin
(),
vec_names
.
end
(),
vec_actv_funcs
.
begin
(),
[
&
](
const
auto
&
fn
)
{
return
map_actv_funcs
[
fn
];
});
// To be added later
float
clip
=
0.0
;
...
...
@@ -1127,9 +1156,10 @@ struct onnx_parser
}
std
::
vector
<
operation
>
vec_actv_funcs
(
vec_names
.
size
());
std
::
transform
(
vec_names
.
begin
(),
vec_names
.
end
(),
vec_actv_funcs
.
begin
(),
[
&
](
auto
&
name
)
{
return
map_actv_funcs
[
name
];
});
std
::
transform
(
vec_names
.
begin
(),
vec_names
.
end
(),
vec_actv_funcs
.
begin
(),
[
&
](
const
auto
&
name
)
{
return
map_actv_funcs
[
name
];
});
float
clip
=
0.0
;
if
(
contains
(
attributes
,
"clip"
))
...
...
@@ -1299,9 +1329,10 @@ struct onnx_parser
}
std
::
vector
<
operation
>
vec_actv_funcs
(
vec_names
.
size
());
std
::
transform
(
vec_names
.
begin
(),
vec_names
.
end
(),
vec_actv_funcs
.
begin
(),
[
&
](
auto
&
name
)
{
return
map_actv_funcs
[
name
];
});
std
::
transform
(
vec_names
.
begin
(),
vec_names
.
end
(),
vec_actv_funcs
.
begin
(),
[
&
](
const
auto
&
name
)
{
return
map_actv_funcs
[
name
];
});
float
clip
=
0.0
;
if
(
contains
(
attributes
,
"clip"
))
...
...
src/opt/memory_coloring_impl.cpp
View file @
98fd5e1d
...
...
@@ -85,6 +85,9 @@ bool memory_coloring_impl::allocate(interval_ptr interval)
offset
+=
(
element_size
-
(
offset
%
element_size
));
conflict_queue
.
pop
();
}
// when int8 type is used, the offset could be any number
// if not 4-byte aligned, miopen int8 convolution can crash
offset
=
(
offset
+
3
)
/
4
*
4
;
segment
.
offset
=
offset
;
MIGRAPHX_DEBUG
(
segment
.
dump
());
required_bytes
=
std
::
max
(
required_bytes
,
offset
+
segment
.
size
);
...
...
src/opt/memory_coloring_impl.hpp
View file @
98fd5e1d
...
...
@@ -107,7 +107,7 @@ struct memory_coloring_impl
return
ins
->
name
()
==
"check_context"
;
}
static
bool
is_disjoin
(
live_range
&
range1
,
live_range
&
range2
)
static
bool
is_disjoin
(
const
live_range
&
range1
,
const
live_range
&
range2
)
{
if
((
range1
.
size
==
0
)
||
(
range2
.
size
==
0
))
return
false
;
...
...
src/program.cpp
View file @
98fd5e1d
...
...
@@ -241,7 +241,7 @@ instruction_ref program::remove_instructions(instruction_ref first, instruction_
// TODO: Check every element
assert
(
has_instruction
(
first
));
std
::
for_each
(
first
,
last
,
[
&
](
instruction
&
ins
)
{
ins
.
clear_arguments
();
});
assert
(
std
::
all_of
(
first
,
last
,
[
&
](
instruction
&
ins
)
{
return
ins
.
outputs
().
empty
();
}));
assert
(
std
::
all_of
(
first
,
last
,
[
&
](
const
instruction
&
ins
)
{
return
ins
.
outputs
().
empty
();
}));
return
impl
->
instructions
.
erase
(
first
,
last
);
}
...
...
src/py/migraphx_py.cpp
View file @
98fd5e1d
...
...
@@ -156,6 +156,7 @@ PYBIND11_MODULE(migraphx, m)
py
::
class_
<
migraphx
::
target
>
(
m
,
"target"
);
py
::
class_
<
migraphx
::
program
>
(
m
,
"program"
)
.
def
(
"clone"
,
[](
migraphx
::
program
&
p
)
{
return
*
(
new
migraphx
::
program
(
p
));
})
.
def
(
"get_parameter_shapes"
,
&
migraphx
::
program
::
get_parameter_shapes
)
.
def
(
"get_shape"
,
&
migraphx
::
program
::
get_shape
)
.
def
(
"compile"
,
[](
migraphx
::
program
&
p
,
const
migraphx
::
target
&
t
)
{
p
.
compile
(
t
);
})
...
...
src/quantization.cpp
View file @
98fd5e1d
...
...
@@ -3,32 +3,53 @@
#include <migraphx/instruction.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/op/convert.hpp>
#include <migraphx/op/dot.hpp>
#include <migraphx/op/mul.hpp>
#include <migraphx/op/add.hpp>
#include <migraphx/op/quant_dot.hpp>
#include <migraphx/op/capture.hpp>
#include <migraphx/op/convolution.hpp>
#include <migraphx/op/quant_convolution.hpp>
#include <migraphx/op/multibroadcast.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/ranges.hpp>
#include <utility>
#include <iomanip>
#include <fstream>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
instruction_ref
insert_
fp16
(
program
&
prog
,
instruction_ref
&
ins
,
shape
::
type_t
type
,
std
::
unordered_map
<
instruction_ref
,
instruction_ref
>&
map_
fp16
)
instruction_ref
insert_
quant_ins
(
program
&
prog
,
instruction_ref
&
ins
,
shape
::
type_t
type
,
std
::
unordered_map
<
instruction_ref
,
instruction_ref
>&
map_
ins
)
{
if
(
map_
fp16
.
count
(
ins
)
>
0
)
if
(
map_
ins
.
count
(
ins
)
>
0
)
{
return
map_fp16
[
ins
];
return
map_ins
[
ins
];
}
if
(
ins
->
name
()
==
"undefined"
)
{
return
ins
;
}
assert
(
ins
->
get_shape
().
type
()
==
shape
::
float_type
||
ins
->
get_shape
().
type
()
==
shape
::
double_type
);
instruction_ref
ins_fp16
{};
ins_fp16
=
prog
.
insert_instruction
(
std
::
next
(
ins
),
op
::
convert
{
type
},
ins
);
map_fp16
[
ins
]
=
ins_fp16
;
ins
->
get_shape
().
type
()
==
shape
::
double_type
||
ins
->
get_shape
().
type
()
==
shape
::
int32_type
);
instruction_ref
quant_ins
{};
quant_ins
=
prog
.
insert_instruction
(
std
::
next
(
ins
),
op
::
convert
{
type
},
ins
);
map_ins
[
ins
]
=
quant_ins
;
return
ins_fp16
;
return
quant_ins
;
}
// This function converts the instructions specified in the input
// from double or float to float16 by inserting a convert operator.
// The conversion may overflow, but that is very rare in the area
// of deep learning, so we simply truncate the input to get the
// fp16 value.
void
quantize
(
program
&
prog
,
const
std
::
vector
<
std
::
string
>&
ins_names
)
{
std
::
unordered_map
<
instruction_ref
,
instruction_ref
>
map_fp16
;
...
...
@@ -53,13 +74,14 @@ void quantize(program& prog, const std::vector<std::string>& ins_names)
// if the input is a convert operator, uses its input
// as its current input
instruction_ref
input_fp16
{};
if
(
input
->
name
()
==
"convert"
)
if
(
input
->
name
()
==
"convert"
and
input
->
inputs
().
front
()
->
get_shape
().
type
()
==
shape
::
half_type
)
{
input_fp16
=
input
->
inputs
().
front
();
}
else
{
input_fp16
=
insert_
fp16
(
prog
,
input
,
shape
::
half_type
,
map_fp16
);
input_fp16
=
insert_
quant_ins
(
prog
,
input
,
shape
::
half_type
,
map_fp16
);
}
converted_inputs
.
push_back
(
input_fp16
);
}
...
...
@@ -79,21 +101,13 @@ void quantize(program& prog, const std::vector<std::string>& ins_names)
auto
ins_shape
=
compute_shape
(
op
,
converted_inputs
);
if
(
ins_shape
.
type
()
!=
orig_type
)
{
// insert another convert instruction to convert it back
if
(
ins
==
std
::
prev
(
prog
.
end
()))
// check the dead code case to avoid assert
bool
output_empty
=
ins
->
outputs
().
empty
();
auto
ins_orig_type
=
prog
.
insert_instruction
(
std
::
next
(
ins
),
op
::
convert
{
orig_type
},
ins
);
if
(
!
output_empty
)
{
prog
.
add_instruction
(
op
::
convert
{
orig_type
},
ins
);
}
else
{
// check the dead code case to avoid assert
bool
output_empty
=
ins
->
outputs
().
empty
();
auto
ins_orig_type
=
prog
.
insert_instruction
(
std
::
next
(
ins
),
op
::
convert
{
orig_type
},
ins
);
if
(
!
output_empty
)
{
prog
.
replace_instruction
(
ins
,
ins_orig_type
);
}
prog
.
replace_instruction
(
ins
,
ins_orig_type
);
}
}
...
...
@@ -103,5 +117,91 @@ void quantize(program& prog, const std::vector<std::string>& ins_names)
// Convenience overload: forwards to the name-list overload with the single
// entry "all".
void quantize(program& prog)
{
    const std::vector<std::string> every_ins{"all"};
    quantize(prog, every_ins);
}
// Puts an op::capture instruction between every selected instruction and each of
// its inputs, so that `func` is attached to the values feeding those instructions
// (used to compute the quantization scale and shift).
//
// prog      - program that is modified in place
// ins_names - names of the instructions whose inputs are captured; only "dot"
//             and "convolution" are accepted, any other name throws
// func      - callback stored in every inserted capture operator
//
// Returns the number of capture instructions inserted. Each one is constructed
// with a distinct index counting up from 0, and an input shared by several
// instructions is captured only once.
std::size_t capture_arguments(program& prog,
                              const std::vector<std::string>& ins_names,
                              const std::function<void(std::size_t, std::vector<argument>)>& func)
{
    // int8 quantization is only supported for dot and convolution
    const std::vector<std::string> supported_ops = {"dot", "convolution"};
    const bool has_unsupported =
        std::any_of(ins_names.begin(), ins_names.end(), [&](const auto& name) {
            return std::find(supported_ops.begin(), supported_ops.end(), name) ==
                   supported_ops.end();
        });
    if(has_unsupported)
    {
        MIGRAPHX_THROW("CAPTURE_ARGUMENTS: input operator is not supported");
    }

    size_t capture_count = 0;
    // original input -> capture instruction already inserted for it
    std::unordered_map<instruction_ref, instruction_ref> captured;
    for(auto ins : iterator_for(prog))
    {
        if(contains(ins_names, ins->name()))
        {
            // work on a copy of the input list, since `ins` is rewritten below
            auto old_args = ins->inputs();
            std::vector<instruction_ref> new_args;
            for(auto arg : old_args)
            {
                auto found = captured.find(arg);
                if(found == captured.end())
                {
                    // first use of this input: place its capture right after it
                    auto cap = prog.insert_instruction(
                        std::next(arg), op::capture{capture_count++, func}, arg);
                    found = captured.emplace(arg, cap).first;
                }
                new_args.push_back(found->second);
            }
            instruction::replace(ins, ins->get_operator(), ins->get_shape(), new_args);
        }
    }

    return capture_count;
}
// Inserts capture instructions for the inputs of the instructions named in
// `ins_names` and returns the shared table of int8 quantization parameters that
// those captures fill in.
//
// Each table entry is a (scale, shift) pair, initialised to (64.0f, 0.0f). Every
// time a capture with index i is invoked, the running maximum absolute value seen
// at i is updated and entry i becomes (127 / running_max, 0). The callback keeps
// the tables alive by holding copies of the shared pointers.
//
// prog      - program that is modified in place
// ins_names - names of the instructions to capture (see the three-argument
//             overload for the accepted names)
std::shared_ptr<std::vector<std::pair<float, float>>>
capture_arguments(program& prog, const std::vector<std::string>& ins_names)
{
    std::shared_ptr<std::vector<std::pair<float, float>>> int8_quant_params =
        std::make_shared<std::vector<std::pair<float, float>>>();
    std::shared_ptr<std::vector<float>> max_abs_vals = std::make_shared<std::vector<float>>();

    auto calc_quant_params = [int8_quant_params, max_abs_vals](
                                 std::size_t ins_index, std::vector<migraphx::argument> args) {
        std::pair<float, float> param_pair{64.0f, 0.0f};

        // scale and shift are needed only for the int8 type, and we do not
        // consider shift, so shift is set to 0
        std::vector<float> vec_val;
        args.front().visit([&](auto output) { vec_val.assign(output.begin(), output.end()); });

        // an empty tensor has no extrema; dereferencing max_element/min_element
        // on an empty range is undefined, so keep the stored parameters as they are
        if(vec_val.empty())
        {
            return;
        }

        auto max_val = *std::max_element(vec_val.begin(), vec_val.end());
        auto min_val = *std::min_element(vec_val.begin(), vec_val.end());
        auto max_abs = std::max(std::fabs(max_val), std::fabs(min_val));
        max_abs_vals->at(ins_index) = std::max(max_abs_vals->at(ins_index), max_abs);

        // if every value seen so far is 0 there is nothing to scale; dividing
        // by 0 would store an infinite scale, so use a scale of 1 instead
        const float running_max = max_abs_vals->at(ins_index);
        param_pair.first        = running_max > 0.0f ? 127.0f / running_max : 1.0f;
        int8_quant_params->at(ins_index) = param_pair;
    };

    auto num_params = capture_arguments(prog, ins_names, calc_quant_params);

    // the tables are sized only now, once the number of captures is known
    int8_quant_params->resize(num_params, std::pair<float, float>(64.0f, 0.0f));
    max_abs_vals->resize(num_params, 0.0f);

    return int8_quant_params;
}
// Convenience overload: captures the inputs of every "dot" and "convolution"
// instruction in `prog`.
std::shared_ptr<std::vector<std::pair<float, float>>> capture_arguments(program& prog)
{
    const std::vector<std::string> default_ops = {"dot", "convolution"};
    return capture_arguments(prog, default_ops);
}
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
src/rewrite_rnn.cpp
View file @
98fd5e1d
...
...
@@ -674,7 +674,6 @@ void rewrite_rnn::apply_lstm(program& prog, instruction_ref ins) const
std
::
vector
<
float
>
ihc_data
(
ihc_shape
.
elements
(),
0.0
);
migraphx
::
shape
pph_shape
{
type
,
{
1
,
3
*
hidden_size
}};
std
::
vector
<
float
>
pph_data
(
pph_shape
.
elements
(),
0.0
);
auto
actv_funcs
=
lstm_actv_funcs
(
ins
);
auto
lstm_op
=
any_cast
<
op
::
lstm
>
(
ins
->
get_operator
());
...
...
src/simplify_algebra.cpp
View file @
98fd5e1d
...
...
@@ -52,6 +52,7 @@ struct find_mul_conv
}
};
// a * (x + b) => a * x + a * b
struct
find_mul_add
{
auto
matcher
()
const
...
...
@@ -60,7 +61,7 @@ struct find_mul_add
match
::
name
(
"add"
)(
match
::
either_arg
(
0
,
1
)(
match
::
any
().
bind
(
"x"
),
match
::
any_of
(
conv_const_weights
(),
match
::
is_constant
()).
bind
(
"
y
"
)),
match
::
any_of
(
conv_const_weights
(),
match
::
is_constant
()).
bind
(
"
b
"
)),
match
::
none_of
(
match
::
args
(
match
::
is_constant
(),
match
::
is_constant
())),
match
::
used_once
()),
match
::
is_constant
().
bind
(
"a"
)));
...
...
@@ -70,12 +71,13 @@ struct find_mul_add
{
auto
ins
=
r
.
result
;
auto
a_ins
=
r
.
instructions
[
"a"
];
auto
b_ins
=
r
.
instructions
[
"b"
];
auto
x_ins
=
r
.
instructions
[
"x"
];
a
uto
y
_ins
=
r
.
instructions
[
"y"
]
;
a
ssert
(
x
_ins
!
=
b_ins
)
;
auto
x
a_ins
=
p
.
insert_instruction
(
ins
,
op
::
mul
{},
x
_ins
,
a
_ins
);
auto
y
a_ins
=
p
.
insert_instruction
(
ins
,
op
::
mul
{},
y
_ins
,
a
_ins
);
p
.
replace_instruction
(
ins
,
op
::
add
{},
x
a_ins
,
y
a_ins
);
auto
a
x
_ins
=
p
.
insert_instruction
(
ins
,
op
::
mul
{},
a
_ins
,
x
_ins
);
auto
a
b
_ins
=
p
.
insert_instruction
(
ins
,
op
::
mul
{},
a
_ins
,
b
_ins
);
p
.
replace_instruction
(
ins
,
op
::
add
{},
a
x
_ins
,
a
b
_ins
);
}
};
...
...
src/targets/cpu/gemm.cpp
View file @
98fd5e1d
...
...
@@ -44,13 +44,9 @@ struct is_fast_gemm_type<float> : std::true_type
{
};
template
<
class
T
>
void
migemm_impl
(
tensor_view
<
T
>
cmat
,
tensor_view
<
T
>
amat
,
tensor_view
<
T
>
bmat
,
float
alpha
,
float
beta
,
std
::
true_type
)
template
<
class
T
,
class
F
>
void
migemm_impl
(
tensor_view
<
T
>
cmat
,
tensor_view
<
T
>
amat
,
tensor_view
<
T
>
bmat
,
F
alpha
,
F
beta
,
std
::
true_type
)
{
visit_mat
(
amat
,
[
&
](
const
auto
&
a
)
{
visit_mat
(
bmat
,
[
&
](
const
auto
&
b
)
{
...
...
@@ -66,13 +62,9 @@ void migemm_impl(tensor_view<T> cmat,
});
}
template
<
class
T
>
void
migemm_impl
(
tensor_view
<
T
>
cmat
,
tensor_view
<
T
>
amat
,
tensor_view
<
T
>
bmat
,
float
alpha
,
float
beta
,
std
::
false_type
)
template
<
class
T
,
class
F
>
void
migemm_impl
(
tensor_view
<
T
>
cmat
,
tensor_view
<
T
>
amat
,
tensor_view
<
T
>
bmat
,
F
alpha
,
F
beta
,
std
::
false_type
)
{
std
::
size_t
n_dims
=
cmat
.
get_shape
().
lens
().
size
();
std
::
size_t
dim_0
=
n_dims
-
2
;
...
...
@@ -95,9 +87,8 @@ void migemm_impl(tensor_view<T> cmat,
});
}
template
<
class
T
>
void
migemm_impl
(
tensor_view
<
T
>
cmat
,
tensor_view
<
T
>
amat
,
tensor_view
<
T
>
bmat
,
float
alpha
,
float
beta
)
template
<
class
T
,
class
F
>
void
migemm_impl
(
tensor_view
<
T
>
cmat
,
tensor_view
<
T
>
amat
,
tensor_view
<
T
>
bmat
,
F
alpha
,
F
beta
)
{
auto
lens
=
amat
.
get_shape
().
lens
();
bool
batch_mul
=
...
...
@@ -113,13 +104,29 @@ void migemm_impl(
}
}
void
migemm
(
const
argument
&
c_arg
,
const
argument
&
a_arg
,
const
argument
&
b_arg
,
float
alpha
,
float
beta
)
// Visits the three arguments together and forwards the resulting views, along
// with `alpha` and `beta`, to migemm_impl. F is the type of the two scalars.
template <class F>
void migemm_tpl(
    const argument& c_arg, const argument& a_arg, const argument& b_arg, F alpha, F beta)
{
    auto run_gemm = [&](auto c_view, auto a_view, auto b_view) {
        migemm_impl(c_view, a_view, b_view, alpha, beta);
    };
    visit_all(c_arg, a_arg, b_arg)(run_gemm);
}
void
migemm
(
const
argument
&
c_arg
,
const
argument
&
a_arg
,
const
argument
&
b_arg
,
float
alpha
,
float
beta
)
{
migemm_tpl
(
c_arg
,
a_arg
,
b_arg
,
alpha
,
beta
);
}
void
migemm
(
const
argument
&
c_arg
,
const
argument
&
a_arg
,
const
argument
&
b_arg
,
int32_t
alpha
,
int32_t
beta
)
{
migemm_tpl
(
c_arg
,
a_arg
,
b_arg
,
alpha
,
beta
);
}
}
// namespace cpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
src/targets/cpu/include/migraphx/cpu/gemm.hpp
View file @
98fd5e1d
...
...
@@ -10,6 +10,11 @@ namespace cpu {
void
migemm
(
const
argument
&
c_arg
,
const
argument
&
a_arg
,
const
argument
&
b_arg
,
float
alpha
,
float
beta
);
void
migemm
(
const
argument
&
c_arg
,
const
argument
&
a_arg
,
const
argument
&
b_arg
,
int32_t
alpha
,
int32_t
beta
);
}
// namespace cpu
}
// namespace MIGRAPHX_INLINE_NS
...
...
src/targets/cpu/lowering.cpp
View file @
98fd5e1d
...
...
@@ -4,7 +4,9 @@
#include <migraphx/dfor.hpp>
#include <migraphx/op/batch_norm.hpp>
#include <migraphx/op/convolution.hpp>
#include <migraphx/op/quant_convolution.hpp>
#include <migraphx/op/dot.hpp>
#include <migraphx/op/quant_dot.hpp>
#include <migraphx/op/elu.hpp>
#include <migraphx/op/im2col.hpp>
#include <migraphx/op/leaky_relu.hpp>
...
...
@@ -216,6 +218,61 @@ struct cpu_convolution
}
};
// CPU implementation of op::quant_convolution: a direct (nested-loop) 2D
// convolution that accumulates into int32_t and writes an int32_t result.
struct cpu_quant_convolution
{
    // wrapped operator; supplies stride, padding, group and the shape computation
    op::quant_convolution op;

    // reflection is forwarded to the wrapped operator
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }

    std::string name() const { return "cpu::quant_convolution"; }

    // the output shape is whatever the wrapped operator computes
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }

    // Computes the convolution of args[0] (input) with args[1] (weights) into a
    // newly created argument of shape `output_shape`, which is returned.
    argument compute(context&, shape output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        // the result is always accessed as int32_t
        auto output = result.get<int32_t>();
        visit_all(args[0], args[1])([&](auto input, auto weights) {
            // presumably the input is laid out as (batch, channel, height, width),
            // going by the in_h/in_w names -- TODO confirm
            auto in   = input.get_shape().lens();
            auto in_h = in[2];
            auto in_w = in[3];

            auto wei   = weights.get_shape().lens();
            auto wei_n = wei[0];
            auto wei_c = wei[1];
            auto wei_h = wei[2];
            auto wei_w = wei[3];

            // one call per output element, indexed by the four output dimensions
            par_dfor(output_shape.lens()[0],
                     output_shape.lens()[1],
                     output_shape.lens()[2],
                     output_shape.lens()[3])(
                [&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
                    // first input position covered by the kernel for this output element
                    // NOTE(review): if stride/padding are unsigned, these subtractions wrap
                    // instead of going negative, the `>= 0` checks below are always true,
                    // and out-of-range positions are rejected only by the `< in_h` /
                    // `< in_w` checks -- confirm the element type of stride/padding
                    const auto start_x  = i * op.stride[0] - op.padding[0];
                    const auto start_y  = j * op.stride[1] - op.padding[1];
                    // group that output channel `w` belongs to
                    const auto group_id = w / (wei_n / op.group);

                    int32_t acc = 0;
                    // walk over the kernel: k = kernel channel, (x, y) = kernel position
                    dfor(wei_c, wei_h, wei_w)([&](std::size_t k, std::size_t x, std::size_t y) {
                        const auto in_x  = start_x + x;
                        const auto in_y  = start_y + y;
                        // input channel matching kernel channel k within this group
                        const auto in_ch = group_id * wei_c + k;
                        // positions outside the input are skipped, i.e. contribute 0
                        if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
                        {
                            // convert the input element to int32_t before multiplying
                            acc += static_cast<int32_t>(input(o, in_ch, in_x, in_y)) *
                                   weights(w, k, x, y);
                        }
                    });
                    output(o, w, i, j) = acc;
                });
        });
        return result;
    }
};
struct
cpu_im2col
{
op
::
im2col
op
;
...
...
@@ -245,17 +302,17 @@ struct cpu_im2col
const
std
::
size_t
&
stride_h
=
op
.
stride
[
0
];
const
std
::
size_t
&
stride_w
=
op
.
stride
[
1
];
auto
kdiv2_h
=
kernel_h
/
2
;
auto
kdiv2_w
=
kernel_w
/
2
;
long
kdiv2_h
=
long
(
kernel_h
)
/
2
;
long
kdiv2_w
=
long
(
kernel_w
)
/
2
;
// calculate output sizes
const
std
::
size_t
col_height
=
(
height
-
kernel_h
+
2
*
pad_h
)
/
stride_h
+
1
;
const
std
::
size_t
col_width
=
(
width
-
kernel_w
+
2
*
pad_w
)
/
stride_w
+
1
;
// account for padding for the starting position of the input pixels
std
::
size_t
iinput
=
kdiv2_h
-
pad_h
;
long
iinput
=
kdiv2_h
-
long
(
pad_h
)
;
// loop over output pixels (ioutput, joutput)
for
(
std
::
size_t
ioutput
=
0
;
ioutput
<
col_height
;
ioutput
++
,
iinput
+=
stride_h
)
{
std
::
size_t
jinput
=
kdiv2_w
-
pad_w
;
long
jinput
=
kdiv2_w
-
long
(
pad_w
)
;
for
(
std
::
size_t
joutput
=
0
;
joutput
<
col_width
;
joutput
++
,
jinput
+=
stride_w
)
{
// compute linear index for output
...
...
@@ -264,8 +321,8 @@ struct cpu_im2col
dfor
(
channels
,
kernel_h
,
kernel_w
)([
&
](
std
::
size_t
c
,
std
::
size_t
koffset
,
std
::
size_t
loffset
)
{
auto
idx
=
iinput
+
koffset
-
kdiv2_h
;
auto
jdx
=
jinput
+
loffset
-
kdiv2_w
;
auto
idx
=
iinput
+
long
(
koffset
)
-
kdiv2_h
;
auto
jdx
=
jinput
+
long
(
loffset
)
-
kdiv2_w
;
col
(
ldx
,
p
)
=
((
idx
>=
0
)
&&
(
idx
<
height
)
&&
(
jdx
>=
0
)
&&
(
jdx
<
width
))
?
input
(
0
,
c
,
idx
,
jdx
)
:
0
;
...
...
@@ -433,7 +490,7 @@ struct cpu_gemm
{
argument
result
{
output_shape
};
// 3 inputs, it is alpha * A * B + beta * C, then
// A and B are matrics, and C is
broadcastable to
A * B
// A and B are matric
e
s, and C is
of the same shape as
A * B
if
(
args
.
size
()
==
3
)
{
// no need to consider the value of args[2]
...
...
@@ -460,13 +517,79 @@ struct cpu_gemm
}
};
// CPU implementation of op::quant_dot. Both matrix inputs are first copied into
// int32 arguments and the product is then computed by migemm.
struct cpu_quant_gemm
{
    op::quant_dot op;

    // reflection is forwarded to the wrapped operator
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }

    std::string name() const { return "cpu::quant_dot"; }

    // With three inputs, the third one must not be broadcasted; the output shape
    // itself comes from the wrapped operator.
    shape compute_shape(const std::vector<shape>& inputs) const
    {
        if(inputs.size() == 3)
        {
            auto third_shape = inputs.at(2);
            check_shapes{{third_shape}}.not_broadcasted();
        }
        return op.compute_shape(inputs);
    }

    // Computes alpha * A * B for two inputs, or alpha * A * B + beta * C for
    // three inputs, where A and B are matrices and C has the same shape as A * B.
    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        // copies `src` element by element into a new int32 argument with the same lens
        auto to_int32 = [](argument& src) {
            argument widened{{shape::int32_type, {src.get_shape().lens()}}};
            widened.visit([&](auto output) {
                src.visit(
                    [&](auto input) { std::copy(input.begin(), input.end(), output.begin()); });
            });
            return widened;
        };

        argument result{output_shape};

        // first, convert args[0] and args[1] from int8_t to int32_t
        argument arg_0 = to_int32(args.at(0));
        argument arg_1 = to_int32(args.at(1));

        // 2 input arguments: there is no C term
        if(args.size() != 3)
        {
            migemm(result, arg_0, arg_1, op.alpha, int32_t{0});

            return result;
        }

        // 3 input arguments: seed the result with C, unless beta is 0, in which
        // case the value of args[2] does not matter and the result starts at 0
        if(op.beta != 0)
        {
            visit_all(result, args[2])([&](auto output, auto input) {
                std::copy(input.begin(), input.end(), output.begin());
            });
        }
        else
        {
            result.visit([&](auto output) { std::fill(output.begin(), output.end(), 0); });
        }

        migemm(result, arg_0, arg_1, op.alpha, op.beta);

        return result;
    }
};
struct
leaky_relu_op
{
op
::
leaky_relu
op
;
std
::
string
name
()
const
{
return
"cpu::leaky_relu"
;
}
auto
fcn
()
const
{
auto
&
a
=
op
.
alpha
;
auto
a
=
op
.
alpha
;
return
[
a
](
auto
x
)
{
return
x
>
0
?
x
:
x
*
a
;
};
}
};
...
...
@@ -477,7 +600,7 @@ struct elu_op
std
::
string
name
()
const
{
return
"cpu::elu"
;
}
auto
fcn
()
const
{
auto
&
a
=
op
.
alpha
;
auto
a
=
op
.
alpha
;
return
[
a
](
auto
x
)
{
return
x
>
0
?
x
:
a
*
std
::
expm1
(
x
);
};
}
};
...
...
@@ -671,15 +794,17 @@ struct cpu_apply
{
apply_map
[
"batch_norm_inference"
]
=
extend_op
<
cpu_batch_norm_inference
,
op
::
batch_norm_inference
>
();
apply_map
[
"convolution"
]
=
extend_op
<
cpu_convolution
,
op
::
convolution
>
();
apply_map
[
"dot"
]
=
extend_op
<
cpu_gemm
,
op
::
dot
>
();
apply_map
[
"elu"
]
=
extend_op
<
cpu_unary
<
elu_op
>
,
op
::
elu
>
();
apply_map
[
"im2col"
]
=
extend_op
<
cpu_im2col
,
op
::
im2col
>
();
apply_map
[
"leaky_relu"
]
=
extend_op
<
cpu_unary
<
leaky_relu_op
>
,
op
::
leaky_relu
>
();
apply_map
[
"logsoftmax"
]
=
extend_op
<
cpu_logsoftmax
,
op
::
logsoftmax
>
();
apply_map
[
"lrn"
]
=
extend_op
<
cpu_lrn
,
op
::
lrn
>
();
apply_map
[
"pad"
]
=
extend_op
<
cpu_pad
,
op
::
pad
>
();
apply_map
[
"softmax"
]
=
extend_op
<
cpu_softmax
,
op
::
softmax
>
();
apply_map
[
"convolution"
]
=
extend_op
<
cpu_convolution
,
op
::
convolution
>
();
apply_map
[
"dot"
]
=
extend_op
<
cpu_gemm
,
op
::
dot
>
();
apply_map
[
"quant_dot"
]
=
extend_op
<
cpu_quant_gemm
,
op
::
quant_dot
>
();
apply_map
[
"quant_convolution"
]
=
extend_op
<
cpu_quant_convolution
,
op
::
quant_convolution
>
();
apply_map
[
"elu"
]
=
extend_op
<
cpu_unary
<
elu_op
>
,
op
::
elu
>
();
apply_map
[
"im2col"
]
=
extend_op
<
cpu_im2col
,
op
::
im2col
>
();
apply_map
[
"leaky_relu"
]
=
extend_op
<
cpu_unary
<
leaky_relu_op
>
,
op
::
leaky_relu
>
();
apply_map
[
"logsoftmax"
]
=
extend_op
<
cpu_logsoftmax
,
op
::
logsoftmax
>
();
apply_map
[
"lrn"
]
=
extend_op
<
cpu_lrn
,
op
::
lrn
>
();
apply_map
[
"pad"
]
=
extend_op
<
cpu_pad
,
op
::
pad
>
();
apply_map
[
"softmax"
]
=
extend_op
<
cpu_softmax
,
op
::
softmax
>
();
}
void
apply
()
...
...
src/targets/gpu/CMakeLists.txt
View file @
98fd5e1d
...
...
@@ -34,16 +34,19 @@ add_library(migraphx_device
device/contiguous.cpp
device/logsoftmax.cpp
device/softmax.cpp
device/sigmoid.cpp
device/convert.cpp
device/mul.cpp
device/concat.cpp
device/pad.cpp
device/gather.cpp
device/sub.cpp
device/int8_gemm_pack.cpp
device/div.cpp
device/clip.cpp
device/reduce_sum.cpp
device/rsqrt.cpp
device/round.cpp
device/sqrt.cpp
device/reduce_mean.cpp
device/pow.cpp
...
...
@@ -65,8 +68,10 @@ add_library(migraphx_gpu
target.cpp
lowering.cpp
gemm.cpp
quant_gemm.cpp
pooling.cpp
convolution.cpp
quant_convolution.cpp
softmax.cpp
logsoftmax.cpp
contiguous.cpp
...
...
@@ -75,17 +80,20 @@ add_library(migraphx_gpu
batchnorm.cpp
write_literals.cpp
rocblas.cpp
sigmoid.cpp
abs.cpp
elu.cpp
pad.cpp
gather.cpp
convert.cpp
lrn.cpp
schedule_model.cpp
adjust_allocation.cpp
pack_int8_args.cpp
clip.cpp
reduce_sum.cpp
reduce_mean.cpp
int8_gemm_pack.cpp
int8_conv_pack.cpp
)
set_target_properties
(
migraphx_gpu PROPERTIES EXPORT_NAME gpu
)
rocm_clang_tidy_check
(
migraphx_gpu
)
...
...
src/targets/gpu/convert.cpp
0 → 100644
View file @
98fd5e1d
#include <migraphx/gpu/convert.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/convert.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
// Computes the output shape of the GPU convert operator.
// `inputs` is taken by value so that its last entry can be dropped before the
// remaining shapes are validated and handed to the wrapped operator.
shape hip_convert::compute_shape(std::vector<shape> inputs) const
{
    // presumably the last entry is the shape of the preallocated output buffer,
    // which the wrapped operator does not expect -- TODO confirm
    inputs.pop_back();
    // the remaining shapes must be packed
    check_shapes{inputs}.packed();
    return op.compute_shape(inputs);
}
// Runs the device convert kernel on the context's stream and returns args[1].
// NOTE(review): presumably args[0] is the source and args[1] the preallocated
// destination, going by the argument order passed to device::convert -- confirm
// against its declaration.
argument hip_convert::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
    const argument& first  = args[0];
    const argument& second = args[1];
    device::convert(ctx.get_stream().get(), second, first);
    return second;
}
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp
View file @
98fd5e1d
...
...
@@ -155,8 +155,8 @@ __device__ void dpp_reduce(T& in, Op op)
__device__
inline
void
dpp_reduce
(
float
&
x
,
sum
)
{
#ifdef
MIGRAPHX_USE_CLANG_TIDY
(
void
)
x
;
#if
def
ined(
MIGRAPHX_USE_CLANG_TIDY
) || defined(CPPCHECK)
x
=
1
;
#else
__asm__
volatile
(
"s_nop 4
\n
"
"v_add_f32 %0 %0 %0 row_shr:1
\n
"
...
...
@@ -245,8 +245,7 @@ void reduce_standard_impl(hipStream_t stream,
T
init
,
Input
read_input
,
Output
read_output
,
std
::
size_t
relements
,
std
::
size_t
stride
)
std
::
size_t
relements
)
{
hip_visit_all
(
result
,
arg
)([
&
](
auto
output
,
auto
input
)
{
auto
nelements
=
result
.
get_shape
().
elements
();
...
...
@@ -255,7 +254,7 @@ void reduce_standard_impl(hipStream_t stream,
const
std
::
size_t
block_size
=
compute_block_size
(
relements
,
max_block_size
);
gs_launch
(
stream
,
nelements
*
block_size
,
block_size
)([
=
](
auto
i
,
auto
idx
)
__device__
{
const
auto
out_idx
=
i
/
block_size
;
const
auto
base_idx
=
out_idx
*
stride
;
const
auto
base_idx
=
out_idx
*
relements
;
auto
r
=
block_reduce
<
max_block_size
>
(
idx
,
op
,
init
,
relements
,
[
&
](
auto
j
)
__device__
{
return
read_input
(
input
.
data
()[
base_idx
+
j
]);
});
...
...
@@ -276,25 +275,15 @@ void reduce(hipStream_t stream,
{
auto
&&
output_shape
=
result
.
get_shape
();
auto
&&
input_shape
=
arg
.
get_shape
();
assert
(
output_shape
.
lens
().
size
()
==
input_shape
.
lens
().
size
());
if
(
input_shape
.
standard
()
and
output_shape
.
standard
()
and
output_shape
.
lens
().
back
()
!=
input_shape
.
lens
().
back
()
and
std
::
equal
(
output_shape
.
lens
().
begin
(),
std
::
prev
(
output_shape
.
lens
().
end
()),
input_shape
.
lens
().
begin
()))
{
std
::
size_t
stride
=
std
::
accumulate
(
input_shape
.
strides
().
begin
(),
input_shape
.
strides
().
end
(),
1
,
std
::
multiplies
<
size_t
>
());
reduce_standard_impl
(
stream
,
result
,
arg
,
op
,
init
,
read_input
,
read_output
,
input_shape
.
lens
().
back
(),
stride
);
reduce_standard_impl
(
stream
,
result
,
arg
,
op
,
init
,
read_input
,
read_output
,
input_shape
.
lens
().
back
());
}
else
{
...
...
src/targets/gpu/device/include/migraphx/gpu/device/tensor.hpp
View file @
98fd5e1d
...
...
@@ -31,6 +31,7 @@ struct hip_tensor_descriptor
result
[
is
]
=
tidx
/
strides
[
is
];
tidx
=
tidx
%
strides
[
is
];
}
return
result
;
}
__device__
__host__
std
::
size_t
linear
(
hip_tensor_index
<
NDim
>
s
)
const
...
...
src/targets/gpu/device/int8_gemm_pack.cpp
0 → 100644
View file @
98fd5e1d
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/int8_gemm_pack.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/hip.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
namespace
device
{
// Rearranges the elements of `arg` into `result` on the given stream.
//
// For every linear index `ii` of arg's shape, the multi-index over the last
// two dimensions gives i_k (second-to-last dimension) and i_m (last
// dimension). With `lda` being arg's stride of the second-to-last dimension
// and `offset` the start of the (last-two-dims) matrix that `ii` falls in:
//
//   result[i_k % 4 + (i_m + (i_k / 4) * lda) * 4 + offset]
//       = arg[i_m + i_k * lda + offset]
//
// i.e. four consecutive i_k entries are stored next to each other.
// NOTE(review): presumably this is the packed layout expected by the int8
// GEMM backend - confirm. The shape needs at least two dimensions, otherwise
// the dimension indices below wrap around.
void int8_gemm_pack_a(hipStream_t stream, const argument& result, const argument& arg)
{
    auto in_shape = arg.get_shape();
    auto lens     = in_shape.lens();

    const std::size_t k_axis = lens.size() - 2;
    const std::size_t m_axis = lens.size() - 1;

    const std::size_t lda         = in_shape.strides()[k_axis];
    const std::size_t matrix_size = lens[k_axis] * lens[m_axis];
    const std::size_t total       = in_shape.elements();

    visit_all(result, arg)([&](auto output, auto input) {
        auto* dst = device_cast(output.data());
        auto* src = device_cast(input.data());

        visit_tensor_size(lens.size(), [&](auto ndim) {
            hip_tensor_descriptor<ndim> desc(in_shape);

            gs_launch(stream, total, 256)([=](auto ii) {
                // Number of consecutive i_k entries stored together.
                const std::size_t nb = 4;

                auto multi_idx        = desc.multi(ii);
                const std::size_t i_m = multi_idx[m_axis];
                const std::size_t i_k = multi_idx[k_axis];

                // First element of the matrix that contains `ii`.
                const std::size_t offset = ii / matrix_size * matrix_size;

                const std::size_t k_inner = i_k % nb;
                const std::size_t k_outer = i_k / nb;

                const std::size_t dst_idx = k_inner + (i_m + k_outer * lda) * nb + offset;
                const std::size_t src_idx = i_m + i_k * lda + offset;

                dst[dst_idx] = src[src_idx];
            });
        });
    });
}
// Rearranges the elements of `arg` into `result` on the given stream.
//
// Indices are enumerated over a helper shape that has arg's type and arg's
// lens with the last two dimensions swapped. For every linear index `ii` of
// that helper shape, i_k is the multi-index entry of the second-to-last
// dimension and i_n the entry of the last dimension. With `ldb` being arg's
// stride of the last dimension and `offset` the start of the (last-two-dims)
// matrix that `ii` falls in:
//
//   result[i_k % 4 + (i_n + (i_k / 4) * ldb) * 4 + offset]
//       = arg[i_n + i_k * ldb + offset]
//
// NOTE(review): arg is presumably a transposed view (hence the stride of the
// last dimension serving as leading dimension), and the helper shape
// presumably gets default packed strides - confirm both. The shape needs at
// least two dimensions, otherwise the dimension indices below wrap around.
void int8_gemm_pack_b(hipStream_t stream, const argument& result, const argument& arg)
{
    auto in_shape = arg.get_shape();
    auto lens     = in_shape.lens();

    const std::size_t k_axis = lens.size() - 2;
    const std::size_t n_axis = lens.size() - 1;

    const std::size_t ldb = in_shape.strides()[n_axis];

    // Helper shape: same type, last two lens exchanged.
    auto swapped_lens = lens;
    std::swap(swapped_lens[k_axis], swapped_lens[n_axis]);
    shape index_shape{in_shape.type(), swapped_lens};

    const std::size_t matrix_size = lens[k_axis] * lens[n_axis];
    const std::size_t total       = index_shape.elements();

    visit_all(result, arg)([&](auto output, auto input) {
        auto* dst = device_cast(output.data());
        auto* src = device_cast(input.data());

        visit_tensor_size(lens.size(), [&](auto ndim) {
            hip_tensor_descriptor<ndim> desc(index_shape);

            gs_launch(stream, total, 256)([=](auto ii) {
                // Number of consecutive i_k entries stored together.
                const std::size_t nb = 4;

                auto multi_idx        = desc.multi(ii);
                const std::size_t i_n = multi_idx[n_axis];
                const std::size_t i_k = multi_idx[k_axis];

                // First element of the matrix that contains `ii`.
                const std::size_t offset = ii / matrix_size * matrix_size;

                const std::size_t k_inner = i_k % nb;
                const std::size_t k_outer = i_k / nb;

                const std::size_t dst_idx = k_inner + (i_n + k_outer * ldb) * nb + offset;
                const std::size_t src_idx = i_n + i_k * ldb + offset;

                dst[dst_idx] = src[src_idx];
            });
        });
    });
}
// Calls hipStreamSynchronize on `stream`, which waits for the work queued
// on that stream to finish. The status code returned by HIP is discarded,
// so a failed synchronization is not reported to the caller.
void sync_stream(hipStream_t stream)
{
    static_cast<void>(hipStreamSynchronize(stream));
}
}
// namespace device
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
src/targets/gpu/device/round.cpp
0 → 100644
View file @
98fd5e1d
#include <migraphx/gpu/device/round.hpp>
#include <migraphx/gpu/device/nary.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
namespace
device
{
// GPU rounding: hands a functor that evaluates ::round on the HIP
// representation of its argument to `nary`, together with the stream, the
// result argument and the input argument.
// NOTE(review): `nary` presumably applies the functor to every element of
// `arg` and stores the value in `result` - confirm in nary.hpp.
void round(hipStream_t stream, const argument& result, const argument& arg)
{
    auto round_value = [](auto x) {
        auto hip_value = to_hip_type(x);
        return ::round(hip_value);
    };

    nary(stream, result, arg)(round_value);
}
}
// namespace device
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
src/targets/gpu/device/sigmoid.cpp
0 → 100644
View file @
98fd5e1d
#include <migraphx/gpu/device/sigmoid.hpp>
#include <migraphx/gpu/device/nary.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
namespace
device
{
// GPU logistic sigmoid: hands a functor computing 1 / (1 + exp(-x)) to
// `nary`, together with the stream, the result argument and the input
// argument. The negation is applied before the value is converted with
// to_hip_type.
// NOTE(review): `nary` presumably applies the functor to every element of
// `arg` and stores the value in `result` - confirm in nary.hpp.
void sigmoid(hipStream_t stream, const argument& result, const argument& arg)
{
    auto logistic = [](auto x) {
        auto denominator = 1.f + ::exp(to_hip_type(-x));
        return 1.f / denominator;
    };

    nary(stream, result, arg)(logistic);
}
}
// namespace device
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
src/targets/gpu/gemm.cpp
View file @
98fd5e1d
...
...
@@ -233,6 +233,10 @@ argument miopen_gemm::compute(context& ctx,
auto
to_pointer
=
[
&
](
auto
&&
arg
)
{
return
to_rocblas_type
(
as
.
from
(
arg
.
data
()));
};
if
(
num_matrices
==
1
)
{
// The rocblas_gemm API treats its input and output matrices as
// column-major. To compute C = A * B, we actually compute
// C^T = (B^T) * (A^T). That is why args[1] is passed as A and
// args[0] as B when calling rocblas_gemm.
generic_rocblas_gemm
(
as
,
ctx
.
get_stream
().
get_rocblas
(),
transb
?
rocblas_operation_transpose
:
rocblas_operation_none
,
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment