Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
MIGraphX
Commits
0662a9a3
Unverified
Commit
0662a9a3
authored
Nov 09, 2023
by
Brian Pickrell
Committed by
GitHub
Nov 09, 2023
Browse files
Merge branch 'develop' into dyn_resize_gather
parents
b74d3a8f
35e5298e
Changes
130
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
804 additions
and
439 deletions
+804
-439
src/onnx/parse_multinomial.cpp
src/onnx/parse_multinomial.cpp
+74
-16
src/onnx/parse_resize.cpp
src/onnx/parse_resize.cpp
+71
-3
src/onnx/parse_slice.cpp
src/onnx/parse_slice.cpp
+7
-5
src/onnx/parse_split.cpp
src/onnx/parse_split.cpp
+26
-5
src/program.cpp
src/program.cpp
+1
-1
src/py/migraphx_py.cpp
src/py/migraphx_py.cpp
+5
-2
src/quantization.cpp
src/quantization.cpp
+1
-1
src/rewrite_quantization.cpp
src/rewrite_quantization.cpp
+1
-1
src/simplify_dyn_ops.cpp
src/simplify_dyn_ops.cpp
+46
-2
src/simplify_reshapes.cpp
src/simplify_reshapes.cpp
+2
-2
src/targets/gpu/CMakeLists.txt
src/targets/gpu/CMakeLists.txt
+24
-22
src/targets/gpu/compile_hip.cpp
src/targets/gpu/compile_hip.cpp
+11
-8
src/targets/gpu/compile_miopen.cpp
src/targets/gpu/compile_miopen.cpp
+4
-15
src/targets/gpu/compile_ops.cpp
src/targets/gpu/compile_ops.cpp
+10
-7
src/targets/gpu/device/int8_gemm_pack.cpp
src/targets/gpu/device/int8_gemm_pack.cpp
+0
-97
src/targets/gpu/gemm_impl.cpp
src/targets/gpu/gemm_impl.cpp
+474
-157
src/targets/gpu/include/migraphx/gpu/compile_miopen.hpp
src/targets/gpu/include/migraphx/gpu/compile_miopen.hpp
+1
-1
src/targets/gpu/include/migraphx/gpu/convolution.hpp
src/targets/gpu/include/migraphx/gpu/convolution.hpp
+9
-31
src/targets/gpu/include/migraphx/gpu/device/int8_gemm_pack.hpp
...argets/gpu/include/migraphx/gpu/device/int8_gemm_pack.hpp
+0
-49
src/targets/gpu/include/migraphx/gpu/gemm.hpp
src/targets/gpu/include/migraphx/gpu/gemm.hpp
+37
-14
No files found.
src/onnx/parse_multinomial.cpp
View file @
0662a9a3
/*
/*
* The MIT License (MIT)
* The MIT License (MIT)
*
*
* Copyright (c) 2015-202
2
Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-202
3
Advanced Micro Devices, Inc. All rights reserved.
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* of this software and associated documentation files (the "Software"), to deal
...
@@ -41,6 +41,9 @@ struct parse_multinomial : op_parser<parse_multinomial>
...
@@ -41,6 +41,9 @@ struct parse_multinomial : op_parser<parse_multinomial>
const
onnx_parser
::
node_info
&
info
,
const
onnx_parser
::
node_info
&
info
,
std
::
vector
<
instruction_ref
>
args
)
const
std
::
vector
<
instruction_ref
>
args
)
const
{
{
if
(
args
.
empty
())
MIGRAPHX_THROW
(
"PARSE_MULTINOMIAL: no arguments given"
);
int
dtype
=
6
;
int
dtype
=
6
;
if
(
contains
(
info
.
attributes
,
"dtype"
))
if
(
contains
(
info
.
attributes
,
"dtype"
))
dtype
=
info
.
attributes
.
at
(
"dtype"
).
i
();
dtype
=
info
.
attributes
.
at
(
"dtype"
).
i
();
...
@@ -49,35 +52,90 @@ struct parse_multinomial : op_parser<parse_multinomial>
...
@@ -49,35 +52,90 @@ struct parse_multinomial : op_parser<parse_multinomial>
size_t
sample_size
=
1
;
size_t
sample_size
=
1
;
if
(
contains
(
info
.
attributes
,
"sample_size"
))
if
(
contains
(
info
.
attributes
,
"sample_size"
))
sample_size
=
info
.
attributes
.
at
(
"sample_size"
).
i
();
sample_size
=
info
.
attributes
.
at
(
"sample_size"
).
i
();
else
MIGRAPHX_THROW
(
"PARSE_MULTINOMIAL: sample_size not given"
);
// Use logarithmic math to scale probabilities while avoiding division by very
// small numbers. Scaling by the maximum makes very tiny ranges more
// tractable; any constant factor gives equivalent distr. since the Multinomial op.
// normalizes at runtime.
// Subtract the per-batch maximum log-probability, making the per-batch max 0
// Subtract the per-batch maximum log-probability, making the per-batch max 0
auto
maxes
=
auto
maxes
=
info
.
add_instruction
(
migraphx
::
make_op
(
"reduce_max"
,
{{
"axes"
,
{
1
}}}),
args
[
0
]);
info
.
add_instruction
(
migraphx
::
make_op
(
"reduce_max"
,
{{
"axes"
,
{
1
}}}),
args
[
0
]);
auto
mb_maxes
=
info
.
add_instruction
(
auto
cdf
=
info
.
add_common_op
(
"sub"
,
args
[
0
],
maxes
);
migraphx
::
make_op
(
"multibroadcast"
,
{{
"out_lens"
,
args
[
0
]
->
get_shape
().
lens
()}}),
maxes
);
auto
cdf
=
info
.
add_instruction
(
migraphx
::
make_op
(
"sub"
),
args
[
0
],
mb_maxes
);
// Take the element-wise exponent to get probabilities in the range (0, 1]
// Take the element-wise exponent to get probabilities in the range (0, 1]
cdf
=
info
.
add_instruction
(
migraphx
::
make_op
(
"exp"
),
cdf
);
cdf
=
info
.
add_instruction
(
migraphx
::
make_op
(
"exp"
),
cdf
);
// Compute the cumulative d
ensity
function
// Compute the cumulative d
istribution
function
cdf
=
info
.
add_instruction
(
cdf
=
info
.
add_instruction
(
migraphx
::
make_op
(
"prefix_scan_sum"
,
{{
"axis"
,
1
},
{
"exclusive"
,
false
}}),
cdf
);
migraphx
::
make_op
(
"prefix_scan_sum"
,
{{
"axis"
,
1
},
{
"exclusive"
,
false
}}),
cdf
);
// Pre-compute random distribution
instruction_ref
seed_input
;
std
::
mt19937
gen
(
std
::
chrono
::
high_resolution_clock
::
now
().
time_since_epoch
().
count
());
if
(
contains
(
info
.
attributes
,
"seed"
))
if
(
contains
(
info
.
attributes
,
"seed"
))
gen
.
seed
(
info
.
attributes
.
at
(
"seed"
).
f
());
{
float
seed
=
info
.
attributes
.
at
(
"seed"
).
f
();
migraphx
::
shape
s
{
migraphx
::
shape
::
float_type
,
{
1
}};
std
::
vector
<
float
>
data
=
{
seed
};
seed_input
=
info
.
add_literal
(
migraphx
::
literal
(
s
,
data
));
}
else
{
seed_input
=
info
.
add_instruction
(
migraphx
::
make_op
(
"random_seed"
));
}
instruction_ref
randoms
;
shape
s0
=
args
[
0
]
->
get_shape
();
if
(
s0
.
dynamic
())
{
// Dynamic batch_size will be taken from args[0]. The input argument to this should
// have a second dimension of sample_size.
std
::
vector
<
shape
::
dynamic_dimension
>
dyn_dim_set
;
dyn_dim_set
.
emplace_back
(
s0
.
dyn_dims
().
front
());
dyn_dim_set
.
emplace_back
(
shape
::
dynamic_dimension
{
sample_size
,
sample_size
});
// read the input dimensions
auto
dim_of
=
info
.
add_instruction
(
migraphx
::
make_op
(
"dimensions_of"
,
{{
"end"
,
2
}}),
args
[
0
]);
// The next two operations insert the value sample_size into the second array position
// make an argument of (1, 0)
shape
s
(
shape
::
int64_type
,
{
2
});
std
::
vector
<
int64_t
>
data1
{
1
,
0
};
auto
l1
=
info
.
add_literal
(
s
,
data1
);
auto
batch_arg
=
info
.
add_instruction
(
migraphx
::
make_op
(
"mul"
),
dim_of
,
l1
);
std
::
vector
<
int64_t
>
data2
(
2
,
0
);
// make an argument of (0, sample_size)
data2
[
1
]
=
sample_size
;
auto
l2
=
info
.
add_literal
(
s
,
data2
);
auto
alloc_shape
=
info
.
add_instruction
(
migraphx
::
make_op
(
"add"
),
batch_arg
,
l2
);
// alloc_shape should contain the input-based shape dimensions as its values at runtime,
// and its own shape is {2}
// compile_shape is the shape used when compiling the Allocate op, and may be dynamic
migraphx
::
shape
compile_shape
=
migraphx
::
shape
(
s0
.
type
(),
{
s0
.
dyn_dims
().
front
(),
{
sample_size
,
sample_size
}});
std
::
uniform_real_distribution
<>
dis
(
0.0
,
1.0
);
// Allocate on-device storage for the random values
size_t
batch_size
=
args
[
0
]
->
get_shape
().
lens
().
front
();
auto
alloc
=
info
.
add_instruction
(
migraphx
::
shape
dist_shape
{
migraphx
::
shape
::
float_type
,
{
batch_size
,
sample_size
}};
migraphx
::
make_op
(
"allocate"
,
{{
"shape"
,
to_value
(
compile_shape
)}}),
alloc_shape
);
randoms
=
info
.
add_instruction
(
migraphx
::
make_op
(
"random_uniform"
),
seed_input
,
alloc
);
}
else
{
// use literal. The array populated by random_uniform may have any shape, as long its
// number of elements is batch_size * sample_size .
size_t
batch_size
=
s0
.
lens
().
front
();
auto
rand_dummy
=
info
.
add_literal
(
migraphx
::
literal
{
migraphx
::
shape
::
float_type
,
{
batch_size
*
sample_size
}});
std
::
vector
<
float
>
random_dist
(
batch_size
*
sample_size
);
randoms
=
std
::
generate
(
random_dist
.
begin
(),
random_dist
.
end
(),
[
&
]()
{
return
dis
(
gen
);
}
);
info
.
add_instruction
(
migraphx
::
make_op
(
"random_uniform"
),
seed_input
,
rand_dummy
);
auto
dist_lit
=
info
.
add_literal
(
migraphx
::
literal
{
dist_shape
,
random_dist
});
}
return
info
.
add_instruction
(
return
info
.
add_instruction
(
migraphx
::
make_op
(
"multinomial"
,
{{
"dtype"
,
output_type
}}),
cdf
,
dist_lit
);
migraphx
::
make_op
(
"multinomial"
,
{{
"dtype"
,
output_type
}}),
cdf
,
randoms
);
}
}
};
};
...
...
src/onnx/parse_resize.cpp
View file @
0662a9a3
...
@@ -181,6 +181,76 @@ static std::string get_nearest_mode(const onnx_parser::attribute_map& attr)
...
@@ -181,6 +181,76 @@ static std::string get_nearest_mode(const onnx_parser::attribute_map& attr)
return
nearest_mode
;
return
nearest_mode
;
}
}
static
std
::
vector
<
double
>
get_scales
(
const
onnx_parser
::
attribute_map
&
attr
)
{
std
::
vector
<
double
>
scales
;
if
(
contains
(
attr
,
"scales"
))
{
copy
(
attr
.
at
(
"scales"
).
floats
(),
std
::
back_inserter
(
scales
));
}
return
scales
;
}
static
void
parse_args
(
const
std
::
vector
<
instruction_ref
>&
args
,
const
std
::
vector
<
size_t
>&
in_lens
,
const
std
::
string
&
op_name
,
std
::
vector
<
double
>&
vec_scale
,
std
::
vector
<
std
::
size_t
>&
out_lens
)
{
for
(
const
auto
&
arg
:
args
)
{
if
(
arg
->
name
()
==
"undefined"
or
arg
==
args
.
front
())
{
continue
;
}
// skipped empty input
auto
lens
=
arg
->
get_shape
().
lens
();
if
(
lens
.
empty
())
{
continue
;
}
auto
type
=
arg
->
get_shape
().
type
();
// output size
if
(
type
==
shape
::
int64_type
)
{
auto
arg_out_s
=
arg
->
eval
();
check_arg_empty
(
arg_out_s
,
"PARSE_"
+
op_name
+
": dynamic output size is not supported!"
);
arg_out_s
.
visit
([
&
](
const
auto
&
ol
)
{
out_lens
.
assign
(
ol
.
begin
(),
ol
.
end
());
});
if
(
out_lens
.
size
()
!=
in_lens
.
size
())
{
MIGRAPHX_THROW
(
"PARSE_"
+
op_name
+
": specified output size does not match input size"
);
}
// compute the scale
vec_scale
.
resize
(
in_lens
.
size
());
std
::
transform
(
in_lens
.
begin
(),
in_lens
.
end
(),
out_lens
.
begin
(),
vec_scale
.
begin
(),
[](
auto
iss
,
auto
oss
)
{
return
1.0
*
oss
/
iss
;
});
}
else
{
// scale input
if
(
lens
[
0
]
==
in_lens
.
size
())
{
auto
arg_scale
=
arg
->
eval
();
check_arg_empty
(
arg_scale
,
"PARSE_"
+
op_name
+
": dynamic input scale is not supported!"
);
arg_scale
.
visit
([
&
](
const
auto
&
v
)
{
vec_scale
.
assign
(
v
.
begin
(),
v
.
end
());
});
}
}
}
}
struct
parse_resize
:
op_parser
<
parse_resize
>
struct
parse_resize
:
op_parser
<
parse_resize
>
{
{
std
::
vector
<
op_desc
>
operators
()
const
{
return
{{
"Resize"
},
{
"Upsample"
}};
}
std
::
vector
<
op_desc
>
operators
()
const
{
return
{{
"Resize"
},
{
"Upsample"
}};
}
...
@@ -309,7 +379,7 @@ struct parse_resize : op_parser<parse_resize>
...
@@ -309,7 +379,7 @@ struct parse_resize : op_parser<parse_resize>
std
::
vector
<
size_t
>
out_lens
(
in_s
.
ndim
());
std
::
vector
<
size_t
>
out_lens
(
in_s
.
ndim
());
// scale
// scale
std
::
vector
<
double
>
vec_scale
;
std
::
vector
<
double
>
vec_scale
=
get_scales
(
info
.
attributes
)
;
// Look at inputs and infer either output size or scale, depending on input type
// Look at inputs and infer either output size or scale, depending on input type
for
(
const
auto
&
arg
:
args
)
for
(
const
auto
&
arg
:
args
)
...
@@ -381,8 +451,6 @@ struct parse_resize : op_parser<parse_resize>
...
@@ -381,8 +451,6 @@ struct parse_resize : op_parser<parse_resize>
}
}
}
}
// Dynamic batch: Only args[0] can have a dynamic shape, only the 0'th
// dimension--batch size--can be non-fixed, and the only resize mode allowed is "nearest"
if
(
args
[
0
]
->
get_shape
().
dynamic
())
if
(
args
[
0
]
->
get_shape
().
dynamic
())
{
{
return
dynamic_nearest_parse
(
out_lens
,
vec_scale
,
opd
,
info
,
args
);
return
dynamic_nearest_parse
(
out_lens
,
vec_scale
,
opd
,
info
,
args
);
...
...
src/onnx/parse_slice.cpp
View file @
0662a9a3
...
@@ -46,6 +46,9 @@ struct parse_slice : op_parser<parse_slice>
...
@@ -46,6 +46,9 @@ struct parse_slice : op_parser<parse_slice>
void
always_insert
(
instruction_ref
arg
)
{
op_args
.
insert
(
op_args
.
begin
(),
arg
);
}
void
always_insert
(
instruction_ref
arg
)
{
op_args
.
insert
(
op_args
.
begin
(),
arg
);
}
/**
* Either insert argument into `this->op_args` or return the constant value of the argument
*/
std
::
vector
<
int64_t
>
insert
(
instruction_ref
arg
)
std
::
vector
<
int64_t
>
insert
(
instruction_ref
arg
)
{
{
std
::
vector
<
int64_t
>
result
;
std
::
vector
<
int64_t
>
result
;
...
@@ -137,23 +140,22 @@ struct parse_slice : op_parser<parse_slice>
...
@@ -137,23 +140,22 @@ struct parse_slice : op_parser<parse_slice>
sd
.
always_insert
(
args
.
at
(
0
));
sd
.
always_insert
(
args
.
at
(
0
));
// If axes arg is not given, the default is all of them.
// If axes arg is not given, the default is all of them.
if
(
sd
.
op
.
axes
.
empty
()
and
sd
.
op_args
.
size
()
<
3
)
if
(
sd
.
op
.
axes
.
empty
()
and
sd
.
op_args
.
size
()
<
=
3
)
{
{
std
::
vector
<
int64_t
>
axes
(
args
[
0
]
->
get_shape
().
ndim
());
std
::
vector
<
int64_t
>
axes
(
args
[
0
]
->
get_shape
().
ndim
());
std
::
iota
(
axes
.
begin
(),
axes
.
end
(),
int64_t
{
0
});
std
::
iota
(
axes
.
begin
(),
axes
.
end
(),
int64_t
{
0
});
sd
.
op
.
axes
=
axes
;
sd
.
op
.
axes
=
axes
;
}
}
if
(
not
sd
.
steps
.
empty
(
))
if
(
std
::
any_of
(
sd
.
steps
.
begin
(),
sd
.
steps
.
end
(),
[](
auto
s
)
{
return
s
!=
1
;
}
))
{
{
if
(
sd
.
op
.
starts
.
empty
()
or
sd
.
op
.
ends
.
empty
())
if
(
sd
.
op
.
starts
.
empty
()
or
sd
.
op
.
ends
.
empty
())
MIGRAPHX_THROW
(
"PARSE_SLICE: steps and variable starts and ends is not supported"
);
MIGRAPHX_THROW
(
"PARSE_SLICE: steps and variable starts and/or ends is not supported"
);
if
(
sd
.
op
.
axes
.
empty
())
if
(
sd
.
op
.
axes
.
empty
())
MIGRAPHX_THROW
(
"PARSE_SLICE: steps and variable axes is not supported"
);
MIGRAPHX_THROW
(
"PARSE_SLICE: steps and variable axes is not supported"
);
}
}
assert
(
sd
.
steps
.
empty
()
or
sd
.
steps
.
size
()
==
sd
.
op
.
axes
.
size
());
// If any axes have negative step, prepare to add a "reverse" op
// If any axes have negative step, prepare to add a "reverse" op
for
(
auto
i
:
range
(
sd
.
steps
.
size
()))
for
(
auto
i
:
range
(
sd
.
steps
.
size
()))
{
{
...
...
src/onnx/parse_split.cpp
View file @
0662a9a3
...
@@ -68,13 +68,34 @@ struct parse_split : op_parser<parse_split>
...
@@ -68,13 +68,34 @@ struct parse_split : op_parser<parse_split>
// no split attribute, input is equally divided
// no split attribute, input is equally divided
else
else
{
{
if
((
lens
[
tuned_axis
]
%
info
.
num_outputs
)
!=
0
)
std
::
size_t
num_outputs
=
info
.
num_outputs
;
// the num_outputs attribute seems to be redundant since we already have
// node_info::num_outputs, but we can still perform an error check
if
(
contains
(
info
.
attributes
,
"num_outputs"
))
{
{
MIGRAPHX_THROW
(
"PARSE_SPLIT: input cannot be equally divided into "
+
num_outputs
=
std
::
to_string
(
info
.
num_outputs
)
+
" splits!"
);
parser
.
parse_value
(
info
.
attributes
.
at
(
"num_outputs"
)).
at
<
std
::
size_t
>
();
if
(
num_outputs
!=
info
.
num_outputs
)
{
MIGRAPHX_THROW
(
"PARSE_SPLIT: num_outputs attribute "
+
std
::
to_string
(
num_outputs
)
+
" doesn't match actual number of outputs "
+
std
::
to_string
(
info
.
num_outputs
)
+
"!"
);
}
}
if
(
lens
[
tuned_axis
]
%
num_outputs
==
0
)
{
std
::
size_t
chunk_size
=
lens
[
tuned_axis
]
/
num_outputs
;
vec_splits
.
resize
(
num_outputs
,
chunk_size
);
}
else
{
std
::
size_t
chunk_size
=
lens
[
tuned_axis
]
/
num_outputs
+
1
;
std
::
size_t
last_chunk_size
=
lens
[
tuned_axis
]
-
chunk_size
*
(
num_outputs
-
1
);
vec_splits
.
resize
(
num_outputs
-
1
,
chunk_size
);
vec_splits
.
push_back
(
last_chunk_size
);
}
}
auto
dl
=
lens
[
tuned_axis
]
/
info
.
num_outputs
;
vec_splits
.
resize
(
info
.
num_outputs
,
dl
);
}
}
if
(
std
::
accumulate
(
vec_splits
.
begin
(),
vec_splits
.
end
(),
int64_t
(
0
))
!=
if
(
std
::
accumulate
(
vec_splits
.
begin
(),
vec_splits
.
end
(),
int64_t
(
0
))
!=
...
...
src/program.cpp
View file @
0662a9a3
...
@@ -936,7 +936,7 @@ void program::perf_report(std::ostream& os,
...
@@ -936,7 +936,7 @@ void program::perf_report(std::ostream& os,
os
<<
std
::
endl
;
os
<<
std
::
endl
;
os
<<
"Batch size: "
<<
batch
<<
std
::
endl
;
os
<<
"Batch size: "
<<
batch
<<
std
::
endl
;
os
<<
"Rate: "
<<
rate
*
batch
<<
"inferences/sec"
<<
std
::
endl
;
os
<<
"Rate: "
<<
rate
*
batch
<<
"
inferences/sec"
<<
std
::
endl
;
os
<<
"Total time: "
<<
total_time
<<
"ms"
<<
std
::
endl
;
os
<<
"Total time: "
<<
total_time
<<
"ms"
<<
std
::
endl
;
os
<<
"Total instructions time: "
<<
total_instruction_time
<<
"ms"
<<
std
::
endl
;
os
<<
"Total instructions time: "
<<
total_instruction_time
<<
"ms"
<<
std
::
endl
;
os
<<
"Overhead time: "
<<
overhead_time
<<
"ms"
os
<<
"Overhead time: "
<<
overhead_time
<<
"ms"
...
...
src/py/migraphx_py.cpp
View file @
0662a9a3
...
@@ -472,7 +472,8 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
...
@@ -472,7 +472,8 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
map_dyn_input_dims
,
map_dyn_input_dims
,
bool
skip_unknown_operators
,
bool
skip_unknown_operators
,
bool
print_program_on_error
,
bool
print_program_on_error
,
int64_t
max_loop_iterations
)
{
int64_t
max_loop_iterations
,
int64_t
limit_max_iterations
)
{
migraphx
::
onnx_options
options
;
migraphx
::
onnx_options
options
;
options
.
default_dim_value
=
default_dim_value
;
options
.
default_dim_value
=
default_dim_value
;
options
.
default_dyn_dim_value
=
default_dyn_dim_value
;
options
.
default_dyn_dim_value
=
default_dyn_dim_value
;
...
@@ -481,6 +482,7 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
...
@@ -481,6 +482,7 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
options
.
skip_unknown_operators
=
skip_unknown_operators
;
options
.
skip_unknown_operators
=
skip_unknown_operators
;
options
.
print_program_on_error
=
print_program_on_error
;
options
.
print_program_on_error
=
print_program_on_error
;
options
.
max_loop_iterations
=
max_loop_iterations
;
options
.
max_loop_iterations
=
max_loop_iterations
;
options
.
limit_max_iterations
=
limit_max_iterations
;
return
migraphx
::
parse_onnx
(
filename
,
options
);
return
migraphx
::
parse_onnx
(
filename
,
options
);
},
},
"Parse onnx file"
,
"Parse onnx file"
,
...
@@ -492,7 +494,8 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
...
@@ -492,7 +494,8 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
migraphx
::
shape
::
dynamic_dimension
>>
(),
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
migraphx
::
shape
::
dynamic_dimension
>>
(),
py
::
arg
(
"skip_unknown_operators"
)
=
false
,
py
::
arg
(
"skip_unknown_operators"
)
=
false
,
py
::
arg
(
"print_program_on_error"
)
=
false
,
py
::
arg
(
"print_program_on_error"
)
=
false
,
py
::
arg
(
"max_loop_iterations"
)
=
10
);
py
::
arg
(
"max_loop_iterations"
)
=
10
,
py
::
arg
(
"limit_max_iterations"
)
=
std
::
numeric_limits
<
uint16_t
>::
max
());
m
.
def
(
m
.
def
(
"parse_onnx_buffer"
,
"parse_onnx_buffer"
,
...
...
src/quantization.cpp
View file @
0662a9a3
...
@@ -147,8 +147,8 @@ void quantize_int8(program& prog,
...
@@ -147,8 +147,8 @@ void quantize_int8(program& prog,
run_passes
(
prog
,
run_passes
(
prog
,
{
quantize_int8_pass
{
ins_names
,
*
int8_quant_params
},
{
quantize_int8_pass
{
ins_names
,
*
int8_quant_params
},
optimize_module
{},
simplify_qdq
{},
simplify_qdq
{},
optimize_module
{},
dead_code_elimination
{}});
dead_code_elimination
{}});
}
}
...
...
src/rewrite_quantization.cpp
View file @
0662a9a3
...
@@ -47,7 +47,7 @@ void apply_quantizelinear(module& m, instruction_ref ins)
...
@@ -47,7 +47,7 @@ void apply_quantizelinear(module& m, instruction_ref ins)
ins
,
make_op
(
"convert"
,
{{
"target_type"
,
y_scale
->
get_shape
().
type
()}}),
x
);
ins
,
make_op
(
"convert"
,
{{
"target_type"
,
y_scale
->
get_shape
().
type
()}}),
x
);
}
}
auto
div
=
m
.
insert_instruction
(
ins
,
make_op
(
"div"
),
x
,
y_scale
);
auto
div
=
m
.
insert_instruction
(
ins
,
make_op
(
"div"
),
x
,
y_scale
);
auto
add_zero_point
=
m
.
insert_instruction
(
ins
,
make_op
(
"
round
"
),
div
);
auto
add_zero_point
=
m
.
insert_instruction
(
ins
,
make_op
(
"
nearbyint
"
),
div
);
if
(
ins
->
inputs
().
size
()
==
3
)
if
(
ins
->
inputs
().
size
()
==
3
)
{
{
...
...
src/simplify_dyn_ops.cpp
View file @
0662a9a3
...
@@ -24,6 +24,7 @@
...
@@ -24,6 +24,7 @@
#include <migraphx/simplify_dyn_ops.hpp>
#include <migraphx/simplify_dyn_ops.hpp>
#include <migraphx/matcher.hpp>
#include <migraphx/matcher.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/literal.hpp>
namespace
migraphx
{
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
inline
namespace
MIGRAPHX_INLINE_NS
{
...
@@ -131,10 +132,53 @@ struct find_const_4in_slice
...
@@ -131,10 +132,53 @@ struct find_const_4in_slice
}
}
};
};
/**
* Simplify dimensions_of to a literal when the input arugment has a static shape
* or the dynamic dimensions from `start` to `end` are fixed.
*/
struct
find_static_dimensions_of
{
auto
matcher
()
const
{
return
match
::
name
(
"dimensions_of"
)();
}
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
mr
)
const
{
auto
ins
=
mr
.
result
;
auto
input
=
ins
->
inputs
().
at
(
0
);
auto
dimensions_of_value
=
ins
->
get_operator
().
to_value
();
auto
start
=
dimensions_of_value
.
at
(
"start"
).
to
<
std
::
size_t
>
();
auto
end
=
dimensions_of_value
.
at
(
"end"
).
to
<
std
::
size_t
>
();
if
(
input
->
get_shape
().
dynamic
())
{
// check if dynamic dimensions from start to end are fixed
auto
dds
=
input
->
get_shape
().
dyn_dims
();
if
(
std
::
any_of
(
dds
.
begin
()
+
start
,
dds
.
begin
()
+
end
,
[](
auto
dd
)
{
return
not
dd
.
is_fixed
();
}))
{
return
;
}
}
std
::
size_t
output_ndim
=
end
-
start
;
std
::
vector
<
int64_t
>
vec_shape
(
output_ndim
);
migraphx
::
shape
s
(
migraphx
::
shape
::
int64_type
,
{
output_ndim
});
std
::
vector
<
std
::
size_t
>
input_lens
=
input
->
get_shape
().
to_static
(
1
).
lens
();
std
::
transform
(
input_lens
.
begin
()
+
start
,
input_lens
.
begin
()
+
end
,
vec_shape
.
begin
(),
[](
auto
i
)
{
return
int64_t
(
i
);
});
migraphx
::
shape
output_shape
{
migraphx
::
shape
::
int64_type
,
{
end
-
start
}};
auto
lit_ins
=
m
.
add_literal
(
migraphx
::
literal
{
output_shape
,
vec_shape
});
m
.
replace_instruction
(
ins
,
lit_ins
);
}
};
void
simplify_dyn_ops
::
apply
(
module
&
m
)
const
void
simplify_dyn_ops
::
apply
(
module
&
m
)
const
{
{
match
::
find_matches
(
match
::
find_matches
(
m
,
m
,
find_static_2in_broadcasts
{},
find_const_3in_slice
{},
find_const_4in_slice
{});
find_static_2in_broadcasts
{},
find_static_dimensions_of
{},
find_const_3in_slice
{},
find_const_4in_slice
{});
}
}
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace MIGRAPHX_INLINE_NS
...
...
src/simplify_reshapes.cpp
View file @
0662a9a3
...
@@ -647,8 +647,8 @@ struct find_broadcast_transpose
...
@@ -647,8 +647,8 @@ struct find_broadcast_transpose
{
{
auto
transpose
=
r
.
result
;
auto
transpose
=
r
.
result
;
auto
transpose_lens
=
transpose
->
get_shape
().
lens
();
auto
transpose_lens
=
transpose
->
get_shape
().
lens
();
auto
bcast_ins
=
r
.
instructions
[
"bcast_ins"
];
auto
bcast_ins
=
r
.
instructions
[
"bcast_ins"
];
auto
input
=
bcast_ins
->
inputs
().
front
();
auto
input
=
bcast_ins
->
inputs
().
front
();
// scalar transformation does not need extra transpose
// scalar transformation does not need extra transpose
if
(
not
input
->
get_shape
().
scalar
())
if
(
not
input
->
get_shape
().
scalar
())
{
{
...
...
src/targets/gpu/CMakeLists.txt
View file @
0662a9a3
# ####################################################################################
# ####################################################################################
# The MIT License (MIT)
# The MIT License (MIT)
#
#
# Copyright (c) 2015-202
2
Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2015-202
3
Advanced Micro Devices, Inc. All rights reserved.
#
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# of this software and associated documentation files (the "Software"), to deal
...
@@ -37,8 +37,7 @@ if(NOT TARGET MIOpen)
...
@@ -37,8 +37,7 @@ if(NOT TARGET MIOpen)
message
(
SEND_ERROR
"Cant find miopen"
)
message
(
SEND_ERROR
"Cant find miopen"
)
endif
()
endif
()
if
(
NOT WIN32
)
if
(
MIGRAPHX_USE_COMPOSABLEKERNEL
)
# TODO: re-enable when CK is ported to Windows
find_package
(
composable_kernel 1.0.0 REQUIRED COMPONENTS jit_library
)
find_package
(
composable_kernel 1.0.0 REQUIRED COMPONENTS jit_library
)
endif
()
endif
()
...
@@ -52,10 +51,10 @@ file(GLOB KERNEL_FILES CONFIGURE_DEPENDS
...
@@ -52,10 +51,10 @@ file(GLOB KERNEL_FILES CONFIGURE_DEPENDS
${
CMAKE_CURRENT_SOURCE_DIR
}
/kernels/include/migraphx/kernels/*.hpp
)
${
CMAKE_CURRENT_SOURCE_DIR
}
/kernels/include/migraphx/kernels/*.hpp
)
message
(
STATUS
"KERNEL_FILES:
${
KERNEL_FILES
}
"
)
message
(
STATUS
"KERNEL_FILES:
${
KERNEL_FILES
}
"
)
if
(
WIN32
)
if
(
NOT MIGRAPHX_USE_COMPOSABLEKERNEL
)
# TODO: re-enable when CK is ported to Windows
list
(
REMOVE_ITEM KERNEL_FILES
list
(
REMOVE_ITEM KERNEL_FILES
${
CMAKE_CURRENT_SOURCE_DIR
}
/kernels/include/migraphx/kernels/ck_gemm.hpp
${
CMAKE_CURRENT_SOURCE_DIR
}
/kernels/include/migraphx/kernels/ck_gemm.hpp
${
CMAKE_CURRENT_SOURCE_DIR
}
/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp
${
CMAKE_CURRENT_SOURCE_DIR
}
/kernels/include/migraphx/kernels/ck.hpp
)
${
CMAKE_CURRENT_SOURCE_DIR
}
/kernels/include/migraphx/kernels/ck.hpp
)
endif
()
endif
()
...
@@ -103,9 +102,10 @@ rocm_clang_tidy_check(kernel_file_check)
...
@@ -103,9 +102,10 @@ rocm_clang_tidy_check(kernel_file_check)
file
(
GLOB JIT_GPU_SRCS CONFIGURE_DEPENDS
${
CMAKE_CURRENT_SOURCE_DIR
}
/jit/*.cpp
)
file
(
GLOB JIT_GPU_SRCS CONFIGURE_DEPENDS
${
CMAKE_CURRENT_SOURCE_DIR
}
/jit/*.cpp
)
if
(
WIN32
)
if
(
NOT MIGRAPHX_USE_COMPOSABLEKERNEL
)
# TODO: re-enable when CK is ported to Windows
list
(
REMOVE_ITEM JIT_GPU_SRCS
list
(
REMOVE_ITEM JIT_GPU_SRCS
${
CMAKE_CURRENT_SOURCE_DIR
}
/jit/ck_gemm.cpp
)
${
CMAKE_CURRENT_SOURCE_DIR
}
/jit/ck_gemm.cpp
${
CMAKE_CURRENT_SOURCE_DIR
}
/jit/ck_gemm_softmax_gemm.cpp
)
endif
()
endif
()
add_library
(
migraphx_gpu
add_library
(
migraphx_gpu
...
@@ -128,8 +128,6 @@ add_library(migraphx_gpu
...
@@ -128,8 +128,6 @@ add_library(migraphx_gpu
gather.cpp
gather.cpp
gemm_impl.cpp
gemm_impl.cpp
hip.cpp
hip.cpp
int8_conv_pack.cpp
int8_gemm_pack.cpp
kernel.cpp
kernel.cpp
lowering.cpp
lowering.cpp
logsoftmax.cpp
logsoftmax.cpp
...
@@ -140,7 +138,6 @@ add_library(migraphx_gpu
...
@@ -140,7 +138,6 @@ add_library(migraphx_gpu
no_device.cpp
no_device.cpp
nonzero.cpp
nonzero.cpp
pack_args.cpp
pack_args.cpp
pack_int8_args.cpp
prefuse_ops.cpp
prefuse_ops.cpp
pad.cpp
pad.cpp
perfdb.cpp
perfdb.cpp
...
@@ -184,7 +181,6 @@ register_migraphx_gpu_ops(hip_
...
@@ -184,7 +181,6 @@ register_migraphx_gpu_ops(hip_
register_migraphx_gpu_ops
(
miopen_
register_migraphx_gpu_ops
(
miopen_
abs
abs
contiguous
contiguous
int8_conv_pack
lrn
lrn
pooling
pooling
)
)
...
@@ -192,10 +188,6 @@ register_op(migraphx_gpu
...
@@ -192,10 +188,6 @@ register_op(migraphx_gpu
HEADER migraphx/gpu/rnn_variable_seq_lens.hpp
HEADER migraphx/gpu/rnn_variable_seq_lens.hpp
OPERATORS gpu::hip_rnn_var_sl_shift_sequence gpu::hip_rnn_var_sl_shift_output gpu::hip_rnn_var_sl_last_output
OPERATORS gpu::hip_rnn_var_sl_shift_sequence gpu::hip_rnn_var_sl_shift_output gpu::hip_rnn_var_sl_last_output
INCLUDES migraphx/gpu/context.hpp
)
INCLUDES migraphx/gpu/context.hpp
)
register_op
(
migraphx_gpu
HEADER migraphx/gpu/int8_gemm_pack.hpp
OPERATORS gpu::hip_int8_gemm_pack_a gpu::hip_int8_gemm_pack_b
INCLUDES migraphx/gpu/context.hpp
)
register_op
(
migraphx_gpu
register_op
(
migraphx_gpu
HEADER migraphx/gpu/gemm.hpp
HEADER migraphx/gpu/gemm.hpp
OPERATORS gpu::rocblas_gemm<op::dot> gpu::rocblas_gemm<op::quant_dot>
OPERATORS gpu::rocblas_gemm<op::dot> gpu::rocblas_gemm<op::quant_dot>
...
@@ -239,24 +231,28 @@ else()
...
@@ -239,24 +231,28 @@ else()
string
(
REGEX REPLACE
" /[^ ]+
\\
.(a|so) "
" "
HIP_COMPILER_FLAGS
"
${
HIP_COMPILER_FLAGS
}
"
)
string
(
REGEX REPLACE
" /[^ ]+
\\
.(a|so) "
" "
HIP_COMPILER_FLAGS
"
${
HIP_COMPILER_FLAGS
}
"
)
endforeach
()
endforeach
()
message
(
STATUS
"Hip compiler flags:
${
HIP_COMPILER_FLAGS
}
"
)
message
(
STATUS
"Hip compiler flags:
\"
${
HIP_COMPILER_FLAGS
}
\"
"
)
target_compile_definitions
(
migraphx_gpu PRIVATE
target_compile_definitions
(
migraphx_gpu PRIVATE
"
-DMIGRAPHX_HIP_COMPILER=
${
CMAKE_CXX_COMPILER
}
"
-DMIGRAPHX_HIP_COMPILER=
"
${
CMAKE_CXX_COMPILER
}
"
"
-DMIGRAPHX_HIP_COMPILER_FLAGS=
${
HIP_COMPILER_FLAGS
}
"
-DMIGRAPHX_HIP_COMPILER_FLAGS=
"
${
HIP_COMPILER_FLAGS
}
"
)
)
if
(
DEFINED CMAKE_CXX_COMPILER_LAUNCHER
)
if
(
DEFINED CMAKE_CXX_COMPILER_LAUNCHER
)
execute_process
(
COMMAND which
${
CMAKE_CXX_COMPILER_LAUNCHER
}
OUTPUT_VARIABLE MIGRAPHX_HIP_COMPILER_LAUNCHER
)
execute_process
(
COMMAND which
${
CMAKE_CXX_COMPILER_LAUNCHER
}
OUTPUT_VARIABLE MIGRAPHX_HIP_COMPILER_LAUNCHER
)
string
(
STRIP
"
${
MIGRAPHX_HIP_COMPILER_LAUNCHER
}
"
MIGRAPHX_HIP_COMPILER_LAUNCHER
)
string
(
STRIP
"
${
MIGRAPHX_HIP_COMPILER_LAUNCHER
}
"
MIGRAPHX_HIP_COMPILER_LAUNCHER
)
target_compile_definitions
(
migraphx_gpu PRIVATE
"
-DMIGRAPHX_HIP_COMPILER_LAUNCHER=
${
MIGRAPHX_HIP_COMPILER_LAUNCHER
}
"
)
target_compile_definitions
(
migraphx_gpu PRIVATE -DMIGRAPHX_HIP_COMPILER_LAUNCHER=
"
${
MIGRAPHX_HIP_COMPILER_LAUNCHER
}
"
)
endif
()
endif
()
endif
()
endif
()
# Check miopen find mode api
# Check miopen find mode api
include
(
CheckLibraryExists
)
include
(
CheckLibraryExists
)
get_target_property
(
MIOPEN_LOCATION MIOpen LOCATION
)
get_target_property
(
MIOPEN_LOCATION MIOpen LOCATION
)
get_target_property
(
ROCBLAS_LOCATION roc::rocblas LOCATION
)
check_library_exists
(
MIOpen
"miopenHiddenSetConvolutionFindMode"
"
${
MIOPEN_LOCATION
}
"
HAS_FIND_MODE_API
)
check_library_exists
(
MIOpen
"miopenHiddenSetConvolutionFindMode"
"
${
MIOPEN_LOCATION
}
"
HAS_FIND_MODE_API
)
check_library_exists
(
MIOpen
"miopenFindSolutions"
"
${
MIOPEN_LOCATION
}
"
HAS_FIND_2_API
)
check_library_exists
(
MIOpen
"miopenFindSolutions"
"
${
MIOPEN_LOCATION
}
"
HAS_FIND_2_API
)
# Beta API for automated GEMM tuning
check_library_exists
(
roc::rocblas
"rocblas_gemm_ex_get_solutions"
"
${
ROCBLAS_LOCATION
}
"
HAS_ROCBLAS_TUNING_BETA_FEATURE_API
)
set
(
MIGRAPHX_USE_FIND_2_API
"
${
HAS_FIND_2_API
}
"
CACHE BOOL
""
)
set
(
MIGRAPHX_USE_FIND_2_API
"
${
HAS_FIND_2_API
}
"
CACHE BOOL
""
)
...
@@ -279,10 +275,16 @@ else()
...
@@ -279,10 +275,16 @@ else()
message
(
STATUS
"MIOpen does not have find mode api"
)
message
(
STATUS
"MIOpen does not have find mode api"
)
endif
()
endif
()
if
(
HAS_ROCBLAS_TUNING_BETA_FEATURE_API
)
target_compile_definitions
(
migraphx_gpu PUBLIC -DMIGRAPHX_USE_ROCBLAS_TUNING_API -DROCBLAS_BETA_FEATURES_API -DROCBLAS_NO_DEPRECATED_WARNINGS
)
message
(
STATUS
"MIGraphx is using Beta API of rocBLAS"
)
else
()
message
(
STATUS
"rocBLAS does not have User Tuning Beta API"
)
endif
()
target_link_libraries
(
migraphx_gpu PUBLIC migraphx MIOpen roc::rocblas
)
target_link_libraries
(
migraphx_gpu PUBLIC migraphx MIOpen roc::rocblas
)
target_link_libraries
(
migraphx_gpu PRIVATE migraphx_device migraphx_kernels
)
target_link_libraries
(
migraphx_gpu PRIVATE migraphx_device migraphx_kernels
)
if
(
NOT WIN32
)
if
(
MIGRAPHX_USE_COMPOSABLEKERNEL
)
# TODO: re-enable when CK is ported to Windows
target_link_libraries
(
migraphx_gpu PRIVATE composable_kernel::jit_library
)
target_link_libraries
(
migraphx_gpu PRIVATE composable_kernel::jit_library
)
endif
()
endif
()
...
...
src/targets/gpu/compile_hip.cpp
View file @
0662a9a3
...
@@ -284,16 +284,20 @@ std::vector<std::vector<char>> compile_hip_src_with_hiprtc(std::vector<hiprtc_sr
...
@@ -284,16 +284,20 @@ std::vector<std::vector<char>> compile_hip_src_with_hiprtc(std::vector<hiprtc_sr
bool
is_hip_clang_compiler
()
bool
is_hip_clang_compiler
()
{
{
static
const
auto
result
=
ends_with
(
MIGRAPHX_STRINGIZE
(
MIGRAPHX_HIP_COMPILER
),
"clang++"
)
;
static
const
auto
result
=
fs
::
path
{
MIGRAPHX_HIP_COMPILER
}.
stem
()
==
"clang++"
;
return
result
;
return
result
;
}
}
#ifdef MIGRAPHX_HIP_COMPILER_LAUNCHER
bool
has_compiler_launcher
()
bool
has_compiler_launcher
()
{
{
static
const
auto
result
=
fs
::
exists
(
MIGRAPHX_
STRINGIZE
(
MIGRAPHX_
HIP_COMPILER_LAUNCHER
)
)
;
static
const
auto
result
=
fs
::
exists
(
MIGRAPHX_HIP_COMPILER_LAUNCHER
);
return
result
;
return
result
;
}
}
#endif
src_compiler
assemble
(
src_compiler
compiler
)
src_compiler
assemble
(
src_compiler
compiler
)
{
{
compiler
.
out_ext
=
".S"
;
compiler
.
out_ext
=
".S"
;
...
@@ -306,8 +310,7 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
...
@@ -306,8 +310,7 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
{
{
assert
(
not
srcs
.
empty
());
assert
(
not
srcs
.
empty
());
if
(
not
is_hip_clang_compiler
())
if
(
not
is_hip_clang_compiler
())
MIGRAPHX_THROW
(
"Unknown hip compiler: "
+
MIGRAPHX_THROW
(
"Unknown hip compiler: "
MIGRAPHX_HIP_COMPILER
);
std
::
string
(
MIGRAPHX_STRINGIZE
(
MIGRAPHX_HIP_COMPILER
)));
if
(
params
.
find
(
"-std="
)
==
std
::
string
::
npos
)
if
(
params
.
find
(
"-std="
)
==
std
::
string
::
npos
)
params
+=
" --std=c++17"
;
params
+=
" --std=c++17"
;
...
@@ -323,14 +326,14 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
...
@@ -323,14 +326,14 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
params
+=
" -DMIGRAPHX_DEBUG"
;
params
+=
" -DMIGRAPHX_DEBUG"
;
params
+=
" -Wno-unused-command-line-argument -Wno-cuda-compat "
;
params
+=
" -Wno-unused-command-line-argument -Wno-cuda-compat "
;
params
+=
MIGRAPHX_STRINGIZE
(
MIGRAPHX_HIP_COMPILER_FLAGS
)
;
params
+=
MIGRAPHX_HIP_COMPILER_FLAGS
;
src_compiler
compiler
;
src_compiler
compiler
;
compiler
.
flags
=
params
;
compiler
.
flags
=
params
;
compiler
.
compiler
=
MIGRAPHX_STRINGIZE
(
MIGRAPHX_HIP_COMPILER
)
;
compiler
.
compiler
=
MIGRAPHX_HIP_COMPILER
;
#ifdef MIGRAPHX_HIP_COMPILER_LAUNCHER
#ifdef MIGRAPHX_HIP_COMPILER_LAUNCHER
if
(
has_compiler_launcher
())
if
(
has_compiler_launcher
())
compiler
.
launcher
=
MIGRAPHX_STRINGIZE
(
MIGRAPHX_HIP_COMPILER_LAUNCHER
)
;
compiler
.
launcher
=
MIGRAPHX_HIP_COMPILER_LAUNCHER
;
#endif
#endif
if
(
enabled
(
MIGRAPHX_GPU_DUMP_SRC
{}))
if
(
enabled
(
MIGRAPHX_GPU_DUMP_SRC
{}))
{
{
...
@@ -354,7 +357,7 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
...
@@ -354,7 +357,7 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
bool
hip_has_flags
(
const
std
::
vector
<
std
::
string
>&
flags
)
bool
hip_has_flags
(
const
std
::
vector
<
std
::
string
>&
flags
)
{
{
src_compiler
compiler
;
src_compiler
compiler
;
compiler
.
compiler
=
MIGRAPHX_STRINGIZE
(
MIGRAPHX_HIP_COMPILER
)
;
compiler
.
compiler
=
MIGRAPHX_HIP_COMPILER
;
compiler
.
flags
=
compiler
.
flags
=
join_strings
(
flags
,
" "
)
+
" -x hip -c --offload-arch=gfx900 --cuda-device-only"
;
join_strings
(
flags
,
" "
)
+
" -x hip -c --offload-arch=gfx900 --cuda-device-only"
;
...
...
src/targets/gpu/compile_miopen.cpp
View file @
0662a9a3
...
@@ -60,9 +60,8 @@ struct miopen_op
...
@@ -60,9 +60,8 @@ struct miopen_op
};
};
MIGRAPHX_REGISTER_OP
(
miopen_op
);
MIGRAPHX_REGISTER_OP
(
miopen_op
);
std
::
size_t
compile_miopen
::
compile
(
operation
&
op
,
instruction_ref
ins
,
bool
format
)
const
std
::
size_t
compile_miopen
::
compile
(
operation
&
op
,
instruction_ref
ins
)
const
{
{
op
.
from_value
({{
"int8_x4_format"
,
format
}});
auto
v
=
op
.
compile
(
*
ctx
,
ins
->
get_shape
(),
to_shapes
(
ins
->
inputs
()));
auto
v
=
op
.
compile
(
*
ctx
,
ins
->
get_shape
(),
to_shapes
(
ins
->
inputs
()));
return
v
.
get
<
std
::
size_t
>
(
"workspace"
,
0
);
return
v
.
get
<
std
::
size_t
>
(
"workspace"
,
0
);
}
}
...
@@ -70,25 +69,15 @@ std::size_t compile_miopen::compile(operation& op, instruction_ref ins, bool for
...
@@ -70,25 +69,15 @@ std::size_t compile_miopen::compile(operation& op, instruction_ref ins, bool for
void
compile_miopen
::
apply
(
module
&
m
)
const
void
compile_miopen
::
apply
(
module
&
m
)
const
{
{
assert
(
ctx
);
assert
(
ctx
);
const
bool
int8_x4_format
=
get_int8_x4_format
(
any_cast
<
migraphx
::
gpu
::
context
>
(
*
ctx
));
for
(
auto
ins
:
iterator_for
(
m
))
for
(
auto
ins
:
iterator_for
(
m
))
{
{
if
(
ins
->
name
()
!=
"gpu::miopen_op"
)
if
(
ins
->
name
()
!=
"gpu::miopen_op"
)
continue
;
continue
;
auto
op
=
any_cast
<
miopen_op
>
(
ins
->
get_operator
()).
op
;
auto
op
=
any_cast
<
miopen_op
>
(
ins
->
get_operator
()).
op
;
std
::
size_t
ws
=
0
;
std
::
size_t
ws
=
0
;
try
ws
=
compile
(
op
,
ins
);
{
auto
inputs
=
ins
->
inputs
();
// for the regular convolution and convolution_backwards, this try would always succeed
auto
alloc
=
m
.
insert_instruction
(
ws
=
compile
(
op
,
ins
,
int8_x4_format
);
}
catch
(
migraphx
::
exception
&
)
{
// In case no solver supports the default format, retry using the other format.
ws
=
compile
(
op
,
ins
,
not
int8_x4_format
);
}
auto
inputs
=
ins
->
inputs
();
auto
alloc
=
m
.
insert_instruction
(
ins
,
make_op
(
"allocate"
,
{{
"shape"
,
to_value
(
shape
{
shape
::
int8_type
,
{
ws
}})}}));
ins
,
make_op
(
"allocate"
,
{{
"shape"
,
to_value
(
shape
{
shape
::
int8_type
,
{
ws
}})}}));
inputs
.
insert
(
std
::
prev
(
inputs
.
end
()),
alloc
);
inputs
.
insert
(
std
::
prev
(
inputs
.
end
()),
alloc
);
...
...
src/targets/gpu/compile_ops.cpp
View file @
0662a9a3
...
@@ -168,6 +168,7 @@ struct compile_plan
...
@@ -168,6 +168,7 @@ struct compile_plan
}
}
const
compiled_result
&
benchmark
(
problem_cache
&
pc
)
const
const
compiled_result
&
benchmark
(
problem_cache
&
pc
)
const
{
{
const
auto
trace_level
=
value_of
(
MIGRAPHX_TRACE_BENCHMARKING
{});
if
(
results
.
empty
())
if
(
results
.
empty
())
MIGRAPHX_THROW
(
"No configs to tune"
);
MIGRAPHX_THROW
(
"No configs to tune"
);
if
(
results
.
size
()
==
1
)
if
(
results
.
size
()
==
1
)
...
@@ -178,9 +179,10 @@ struct compile_plan
...
@@ -178,9 +179,10 @@ struct compile_plan
}
}
if
(
not
config
)
if
(
not
config
)
MIGRAPHX_THROW
(
"Multiple kernels without config"
);
MIGRAPHX_THROW
(
"Multiple kernels without config"
);
std
::
cout
<<
"Benchmarking "
<<
preop
.
name
()
<<
": "
<<
results
.
size
()
<<
" configs"
if
(
trace_level
>
0
)
<<
std
::
endl
;
std
::
cout
<<
"Benchmarking "
<<
preop
.
name
()
<<
": "
<<
results
.
size
()
<<
" configs"
if
(
enabled
(
MIGRAPHX_TRACE_BENCHMARKING
{}))
<<
std
::
endl
;
if
(
trace_level
>
1
)
std
::
cout
<<
"Problem: "
<<
config
->
problem
<<
std
::
endl
;
std
::
cout
<<
"Problem: "
<<
config
->
problem
<<
std
::
endl
;
std
::
vector
<
double
>
times
;
std
::
vector
<
double
>
times
;
times
.
reserve
(
results
.
size
());
times
.
reserve
(
results
.
size
());
...
@@ -189,22 +191,23 @@ struct compile_plan
...
@@ -189,22 +191,23 @@ struct compile_plan
config
->
solutions
.
begin
(),
config
->
solutions
.
begin
(),
std
::
back_inserter
(
times
),
std
::
back_inserter
(
times
),
[
&
](
const
auto
&
cr
,
const
auto
&
solution
)
{
[
&
](
const
auto
&
cr
,
const
auto
&
solution
)
{
if
(
enabled
(
MIGRAPHX_TRACE_BENCHMARKING
{})
)
if
(
trace_level
>
1
)
std
::
cout
<<
"Benchmarking solution: "
<<
solution
<<
std
::
endl
;
std
::
cout
<<
"Benchmarking solution: "
<<
solution
<<
std
::
endl
;
if
(
not
cr
.
has_value
())
if
(
not
cr
.
has_value
())
{
{
if
(
enabled
(
MIGRAPHX_TRACE_BENCHMARKING
{})
)
if
(
trace_level
>
1
)
std
::
cout
<<
"No binary"
<<
std
::
endl
;
std
::
cout
<<
"No binary"
<<
std
::
endl
;
return
std
::
numeric_limits
<
double
>::
max
();
return
std
::
numeric_limits
<
double
>::
max
();
}
}
auto
t
=
time_op
(
auto
t
=
time_op
(
*
ctx
,
cr
->
replace
.
code_object
,
to_shapes
(
cr
->
ins
->
inputs
()),
20
);
*
ctx
,
cr
->
replace
.
code_object
,
to_shapes
(
cr
->
ins
->
inputs
()),
20
);
if
(
enabled
(
MIGRAPHX_TRACE_BENCHMARKING
{})
)
if
(
trace_level
>
1
)
std
::
cout
<<
t
<<
"ms"
<<
std
::
endl
;
std
::
cout
<<
t
<<
"ms"
<<
std
::
endl
;
return
t
;
return
t
;
});
});
auto
i
=
std
::
distance
(
times
.
begin
(),
std
::
min_element
(
times
.
begin
(),
times
.
end
()));
auto
i
=
std
::
distance
(
times
.
begin
(),
std
::
min_element
(
times
.
begin
(),
times
.
end
()));
std
::
cout
<<
"Fastest solution: "
<<
config
->
solutions
.
at
(
i
)
<<
std
::
endl
;
if
(
trace_level
>
0
)
std
::
cout
<<
"Fastest solution: "
<<
config
->
solutions
.
at
(
i
)
<<
std
::
endl
;
pc
.
insert
(
preop
.
name
(),
config
->
problem
,
config
->
solutions
.
at
(
i
));
pc
.
insert
(
preop
.
name
(),
config
->
problem
,
config
->
solutions
.
at
(
i
));
if
(
not
results
[
i
].
has_value
())
if
(
not
results
[
i
].
has_value
())
MIGRAPHX_THROW
(
"No valid tuned compilation."
);
MIGRAPHX_THROW
(
"No valid tuned compilation."
);
...
...
src/targets/gpu/device/int8_gemm_pack.cpp
deleted
100644 → 0
View file @
b74d3a8f
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/int8_gemm_pack.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/tensor.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
namespace
device
{
void
int8_gemm_pack_a
(
hipStream_t
stream
,
const
argument
&
result
,
const
argument
&
arg
)
{
auto
comp_shape
=
arg
.
get_shape
();
auto
out_lens
=
comp_shape
.
lens
();
auto
dim_0
=
out_lens
.
size
()
-
2
;
auto
dim_1
=
out_lens
.
size
()
-
1
;
std
::
size_t
lda
=
comp_shape
.
strides
()[
dim_0
];
std
::
size_t
m_size
=
out_lens
[
dim_0
]
*
out_lens
[
dim_1
];
visit_all
(
result
,
arg
)([
&
](
auto
output
,
auto
input
)
{
std
::
size_t
nelements
=
comp_shape
.
elements
();
auto
*
out_ptr
=
device_cast
(
output
.
data
());
auto
*
in_ptr
=
device_cast
(
input
.
data
());
visit_tensor_size
(
out_lens
.
size
(),
[
&
](
auto
out_dim
)
{
hip_tensor_descriptor
<
out_dim
>
desc
(
comp_shape
);
gs_launch
(
stream
,
nelements
,
256
)([
=
](
auto
ii
)
__device__
{
const
size_t
nb
=
4
;
auto
idx
=
desc
.
multi
(
ii
);
std
::
size_t
i_m
=
idx
[
dim_1
];
std
::
size_t
i_k
=
idx
[
dim_0
];
std
::
size_t
offset
=
ii
/
m_size
*
m_size
;
out_ptr
[
i_k
%
nb
+
(
i_m
+
(
i_k
/
nb
)
*
lda
)
*
nb
+
offset
]
=
in_ptr
[
i_m
+
i_k
*
lda
+
offset
];
});
});
});
}
void
int8_gemm_pack_b
(
hipStream_t
stream
,
const
argument
&
result
,
const
argument
&
arg
)
{
auto
trans_shape
=
arg
.
get_shape
();
auto
out_lens
=
trans_shape
.
lens
();
auto
dim_0
=
trans_shape
.
lens
().
size
()
-
2
;
auto
dim_1
=
trans_shape
.
lens
().
size
()
-
1
;
std
::
size_t
ldb
=
trans_shape
.
strides
()[
dim_1
];
auto
wrap_lens
=
out_lens
;
std
::
swap
(
wrap_lens
[
dim_0
],
wrap_lens
[
dim_1
]);
shape
comp_shape
{
trans_shape
.
type
(),
wrap_lens
};
std
::
size_t
m_size
=
out_lens
[
dim_0
]
*
out_lens
[
dim_1
];
visit_all
(
result
,
arg
)([
&
](
auto
output
,
auto
input
)
{
std
::
size_t
nelements
=
comp_shape
.
elements
();
auto
*
out_ptr
=
device_cast
(
output
.
data
());
auto
*
in_ptr
=
device_cast
(
input
.
data
());
visit_tensor_size
(
out_lens
.
size
(),
[
&
](
auto
out_dim
)
{
hip_tensor_descriptor
<
out_dim
>
desc
(
comp_shape
);
gs_launch
(
stream
,
nelements
,
256
)([
=
](
auto
ii
)
__device__
{
const
size_t
nb
=
4
;
auto
idx
=
desc
.
multi
(
ii
);
std
::
size_t
i_n
=
idx
[
dim_1
];
std
::
size_t
i_k
=
idx
[
dim_0
];
std
::
size_t
offset
=
ii
/
m_size
*
m_size
;
out_ptr
[
i_k
%
nb
+
(
i_n
+
(
i_k
/
nb
)
*
ldb
)
*
nb
+
offset
]
=
in_ptr
[
i_n
+
i_k
*
ldb
+
offset
];
});
});
});
}
}
// namespace device
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
src/targets/gpu/gemm_impl.cpp
View file @
0662a9a3
/*
/*
* The MIT License (MIT)
* The MIT License (MIT)
*
*
* Copyright (c) 2015-202
2
Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-202
3
Advanced Micro Devices, Inc. All rights reserved.
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* of this software and associated documentation files (the "Software"), to deal
...
@@ -21,15 +21,20 @@
...
@@ -21,15 +21,20 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* THE SOFTWARE.
*/
*/
#include <rocblas/rocblas.h>
#include <rocblas/rocblas.h>
#include <migraphx/gpu/gemm_impl.hpp>
#include <migraphx/gpu/gemm_impl.hpp>
#include <migraphx/reduce_dims.hpp>
#include <migraphx/reduce_dims.hpp>
#include <migraphx/permutation.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/time.hpp>
using
microseconds
=
std
::
chrono
::
duration
<
double
,
std
::
micro
>
;
namespace
migraphx
{
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
namespace
gpu
{
// Convert rocBLAS datatypes to equivalent Migraphx data types
rocblas_datatype
get_type
(
shape
::
type_t
type
)
rocblas_datatype
get_type
(
shape
::
type_t
type
)
{
{
switch
(
type
)
switch
(
type
)
...
@@ -81,196 +86,508 @@ shape transpose_batch(const shape& s, unsigned trans_batch)
...
@@ -81,196 +86,508 @@ shape transpose_batch(const shape& s, unsigned trans_batch)
return
shape
::
from_permutation
(
s
.
type
(),
s
.
lens
(),
perm
);
return
shape
::
from_permutation
(
s
.
type
(),
s
.
lens
(),
perm
);
}
}
template
<
class
R
,
class
...
Ts
,
class
...
Us
>
/**
R
rocblas_invoke
(
R
(
*
f
)(
Ts
...),
Us
...
xs
)
* Returns results of rocblas_status_success, rocblas_status_perf_degraded,
* or rocblas_status_invalid_value. Caller
* is expected to check for invalid index. Any other result causes an exception.
*
*/
template
<
class
F
,
class
Pack
,
class
...
Ts
>
auto
rocblas_invoke
(
F
f
,
Pack
p
,
Ts
...
xs
)
{
{
if
constexpr
(
sizeof
...(
Ts
)
==
sizeof
...(
Us
))
return
p
([
=
](
auto
...
ws
)
{
return
f
(
xs
...);
auto
status
=
f
(
ws
...,
xs
...);
else
if
(
status
!=
rocblas_status_success
and
status
!=
rocblas_status_invalid_value
)
return
f
(
xs
...,
nullptr
,
nullptr
);
{
if
(
status
==
rocblas_status_perf_degraded
)
{
std
::
cerr
<<
"WARNING: degraded perf. in rocBLAS call"
<<
std
::
endl
;
}
else
MIGRAPHX_THROW
(
"rocblas_invoke: rocBLAS call failed with status "
+
std
::
to_string
(
status
));
}
return
status
;
});
}
}
static
bool
is_transposed
(
const
shape
&
s
)
static
bool
is_transposed
(
const
shape
&
s
)
{
return
s
.
transposed
()
and
s
.
strides
().
back
()
!=
1
;
}
{
if
(
not
s
.
transposed
())
return
false
;
return
s
.
strides
().
back
()
!=
1
;
}
static
rocblas_int
get_batch_stride
(
const
argument
&
a
)
static
rocblas_int
get_batch_stride
(
const
shape
&
s
)
{
{
return
a
.
get_shape
().
strides
()[
a
.
get_shape
().
strides
().
size
()
-
3
];
// This value is not needed for non-strided inputs
if
(
s
.
strides
().
size
()
<
3
)
return
0
;
else
return
s
.
strides
()[
s
.
strides
().
size
()
-
3
];
}
}
template
<
class
T
>
/**
void
gemm_impl
(
context
&
ctx
,
* Wrapper for multiple rocBLAS calls. The constructor creates parameters for
const
shape
&
output_shape
,
* these calls based on data shapes and other values contained in the associated
const
std
::
vector
<
argument
>&
args
,
* instruction and operation.
T
alpha
,
*
T
beta
,
* The template parameter T is not the type of the matrix data but of the weighting
bool
int8_x4_format
,
* coefficients alpha and beta (these are float in rocBLAS internals)
bool
compute_fp32
)
*/
template
<
typename
T
>
struct
gemm_impl
{
{
const
bool
is_3inputs
=
(
args
.
size
()
==
4
);
gemm_impl
(
const
shape
&
output_shape
,
if
(
not
is_3inputs
)
const
std
::
vector
<
shape
>&
input_shapes
,
T
alpha_param
,
T
beta_param
,
bool
compute_fp32_flag
)
:
alpha
(
alpha_param
),
beta
(
beta_param
),
is_3inputs
(
input_shapes
.
size
()
==
4
),
compute_fp32
(
compute_fp32_flag
)
{
{
beta
=
0
;
if
(
not
is_3inputs
)
}
{
beta
=
0
;
bool
transa
=
is_transposed
(
args
[
0
].
get_shape
());
}
bool
transb
=
is_transposed
(
args
[
1
].
get_shape
());
auto
n_dim
=
output_shape
.
lens
().
size
();
auto
dim_1
=
n_dim
-
1
;
auto
dim_0
=
n_dim
-
2
;
rocblas_int
lda
=
args
[
0
].
get_shape
().
strides
()[
transa
?
dim_1
:
dim_0
];
rocblas_int
ldb
=
args
[
1
].
get_shape
().
strides
()[
transb
?
dim_1
:
dim_0
];
rocblas_int
ldc
=
args
[
2
].
get_shape
().
strides
()[
dim_0
];
rocblas_int
ldd
=
is_3inputs
?
args
[
3
].
get_shape
().
strides
()[
dim_0
]
:
ldc
;
rocblas_datatype
arg_type
=
get_type
(
args
[
0
].
get_shape
().
type
());
auto
output_type
=
arg_type
;
if
(
output_type
==
rocblas_datatype_i8_r
)
{
output_type
=
rocblas_datatype_i32_r
;
}
auto
compute_type
=
output_type
;
if
(
compute_fp32
)
{
if
(
arg_type
==
rocblas_datatype_f16_r
)
compute_type
=
rocblas_datatype_f32_r
;
}
rocblas_gemm_flags
flag
=
rocblas_gemm_flags_none
;
#if ROCBLAS_VERSION_MAJOR < 3
if
(
int8_x4_format
)
flag
=
rocblas_gemm_flags_pack_int8x4
;
#endif
auto
a_lens
=
args
[
0
].
get_shape
().
lens
();
// Create lambdas that will cast alpha, beta to the output shape's type
auto
b_lens
=
args
[
1
].
get_shape
().
lens
();
// and retain the values being pointed to
output_shape
.
visit_type
([
&
](
auto
as
)
{
output_shape
.
visit_type
([
&
](
auto
as
)
{
auto
alpha_r
=
as
(
alpha
);
auto
alpha_r
=
as
(
alpha
);
auto
beta_r
=
as
(
beta
);
auto
beta_r
=
as
(
beta
);
if
(
compute_fp32
)
{
get_alpha
=
[
=
]
{
return
&
alpha
;
};
get_beta
=
[
=
]
{
return
&
beta
;
};
}
else
{
get_alpha
=
[
=
]
{
return
&
alpha_r
;
};
get_beta
=
[
=
]
{
return
&
beta_r
;
};
}
});
// use void pointer to select different data type if using fp32 mode
transa
=
is_transposed
(
input_shapes
[
0
]);
void
*
alpha_v
=
&
alpha_r
;
transb
=
is_transposed
(
input_shapes
[
1
]);
void
*
beta_v
=
&
beta_r
;
auto
n_dim
=
output_shape
.
lens
().
size
();
auto
dim_0
=
n_dim
-
2
;
auto
dim_1
=
n_dim
-
1
;
// Leading dimensions of matrices
lda
=
input_shapes
[
0
].
strides
()[
transa
?
dim_1
:
dim_0
];
ldb
=
input_shapes
[
1
].
strides
()[
transb
?
dim_1
:
dim_0
];
ldc
=
input_shapes
[
2
].
strides
()[
dim_0
];
ldd
=
is_3inputs
?
input_shapes
[
3
].
strides
()[
dim_0
]
:
ldc
;
if
(
compute_fp32
)
arg_type
=
get_type
(
input_shapes
[
0
].
type
());
output_type
=
arg_type
;
if
(
output_type
==
rocblas_datatype_i8_r
)
{
{
alpha_v
=
&
alpha
;
output_type
=
rocblas_datatype_i32_r
;
beta_v
=
&
beta
;
}
}
compute_type
=
output_type
;
auto
out_lens
=
output_shape
.
lens
();
if
(
compute_fp32
)
rocblas_int
m
=
out_lens
[
dim_0
];
rocblas_int
n
=
out_lens
[
dim_1
];
rocblas_int
k
=
args
[
0
].
get_shape
().
lens
()[
dim_1
];
auto
to_pointer
=
[
&
](
auto
&&
arg
)
{
return
as
.
from
(
arg
.
data
());
};
if
(
args
[
0
].
get_shape
().
type
()
==
shape
::
int8_type
and
(
k
%
4
)
!=
0
and
int8_x4_format
)
{
{
MIGRAPHX_THROW
(
"ROCBLAS_GEMM: k size of int8 type input must be mutlple of 4!"
);
if
(
arg_type
==
rocblas_datatype_f16_r
)
compute_type
=
rocblas_datatype_f32_r
;
}
}
auto
num_matrices
=
std
::
accumulate
(
auto
a_lens
=
input_shapes
[
0
].
lens
();
auto
b_lens
=
input_shapes
[
1
].
lens
();
auto
out_lens
=
output_shape
.
lens
();
m
=
out_lens
[
dim_0
];
n
=
out_lens
[
dim_1
];
k
=
input_shapes
[
0
].
lens
()[
dim_1
];
a_stride
=
get_batch_stride
(
input_shapes
[
0
]);
b_stride
=
get_batch_stride
(
input_shapes
[
1
]);
c_stride
=
get_batch_stride
(
input_shapes
[
2
]);
d_stride
=
is_3inputs
?
get_batch_stride
(
input_shapes
[
3
])
:
c_stride
;
num_matrices
=
std
::
accumulate
(
out_lens
.
rbegin
()
+
2
,
out_lens
.
rend
(),
std
::
size_t
{
1
},
std
::
multiplies
<
std
::
size_t
>
());
out_lens
.
rbegin
()
+
2
,
out_lens
.
rend
(),
std
::
size_t
{
1
},
std
::
multiplies
<
std
::
size_t
>
());
if
(
num_matrices
==
1
or
(
num_matrices
>
1
and
get_batch_stride
(
args
[
1
])
==
0
))
strided_batched
=
num_matrices
>
1
;
if
(
strided_batched
and
b_stride
==
0
and
input_shapes
[
0
].
standard
())
{
{
// If the batch dimension of B is broadcasted, then we can
// If the batch dimension of B is broadcasted, then we can
// multiply m by the batch_size and use rocblas_gemm_ex
// multiply m by the batch_size and use rocblas_gemm_ex
// instead of rocblas_gemm_strided_batched_ex.
// instead of rocblas_gemm_strided_batched_ex.
m
*=
num_matrices
;
m
*=
num_matrices
;
strided_batched
=
false
;
}
}
// the rocblas_gemm API handles inputs and output matrices as
void
run
(
context
&
ctx
,
const
std
::
vector
<
argument
>&
input_args
,
int32_t
solution_idx
=
0
)
const
// column-major format. When doing a C = A * B, we actually do
{
// C^T = (B^T) * (A^T). That is the reason we input args[1] as
if
(
strided_batched
)
// A and args[0] as B in calling the rocblas_gemm.
{
auto
common_args
=
create_strided_batched_args_common
(
ctx
,
input_args
);
rocblas_invoke
(
&
rocblas_gemm_strided_batched_ex
,
common_args
,
rocblas_gemm_algo_solution_index
,
solution_idx
,
gemm_flags
);
}
else
{
auto
common_args
=
create_gemm_ex_args_common
(
ctx
,
input_args
);
rocblas_invoke
(
&
rocblas_gemm_ex
,
rocblas_invoke
(
&
rocblas_gemm_ex
,
ctx
.
get_stream
().
get_rocblas
(),
common_args
,
transb
?
rocblas_operation_transpose
:
rocblas_operation_none
,
rocblas_gemm_algo_solution_index
,
transa
?
rocblas_operation_transpose
:
rocblas_operation_none
,
solution_idx
,
n
,
gemm_flags
);
m
,
}
k
,
}
alpha_v
,
to_pointer
(
args
.
at
(
1
)),
#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
arg_type
,
auto
validate
(
context
&
ctx
,
const
std
::
vector
<
shape
>&
input_shapes
,
int32_t
solution_idx
)
const
ldb
,
{
to_pointer
(
args
.
at
(
0
)),
// Create dummy arguments for the shapes, and call the overloaded method
arg_type
,
std
::
vector
<
argument
>
input_args
;
lda
,
std
::
transform
(
input_shapes
.
begin
(),
beta_v
,
input_shapes
.
end
(),
to_pointer
(
args
[
2
]),
std
::
back_inserter
(
input_args
),
output_type
,
[](
const
shape
&
x
)
{
return
to_gpu
(
generate_argument
(
x
));
});
ldc
,
is_3inputs
?
to_pointer
(
args
[
3
])
:
to_pointer
(
args
[
2
]),
return
validate
(
ctx
,
input_args
,
solution_idx
);
output_type
,
}
ldd
,
compute_type
,
/**
rocblas_gemm_algo_standard
,
* Checks a particular solution for validity by running it with the flag
0
,
* rocblas_gemm_flags_check_solution_index (could be invalid if this model was
flag
);
* tuned with a different rocBLAS version)
*
* @return Returns either solution_idx if valid, or else the default value 0
* if not. The default does not mean list index 0, but tells the picker
* to choose a solution.
*/
int32_t
validate
(
context
&
ctx
,
const
std
::
vector
<
argument
>&
input_args
,
int32_t
solution_idx
)
const
{
rocblas_status_
check_valid
(
rocblas_status_success
);
if
(
strided_batched
)
{
auto
common_args
=
create_strided_batched_args_common
(
ctx
,
input_args
);
check_valid
=
rocblas_invoke
(
&
rocblas_gemm_strided_batched_ex
,
common_args
,
rocblas_gemm_algo_solution_index
,
solution_idx
,
rocblas_gemm_flags_check_solution_index
);
}
}
else
else
{
{
auto
a_stride
=
get_batch_stride
(
args
[
0
]);
auto
common_args
=
create_gemm_ex_args_common
(
ctx
,
input_args
);
auto
b_stride
=
get_batch_stride
(
args
[
1
]);
check_valid
=
rocblas_invoke
(
&
rocblas_gemm_ex
,
auto
c_stride
=
get_batch_stride
(
args
[
2
]);
common_args
,
auto
d_stride
=
is_3inputs
?
get_batch_stride
(
args
[
3
])
:
c_stride
;
rocblas_gemm_algo_solution_index
,
rocblas_invoke
(
&
rocblas_gemm_strided_batched_ex
,
solution_idx
,
ctx
.
get_stream
().
get_rocblas
(),
rocblas_gemm_flags_check_solution_index
);
transb
?
rocblas_operation_transpose
:
rocblas_operation_none
,
transa
?
rocblas_operation_transpose
:
rocblas_operation_none
,
n
,
m
,
k
,
alpha_v
,
to_pointer
(
args
.
at
(
1
)),
arg_type
,
ldb
,
b_stride
,
to_pointer
(
args
.
at
(
0
)),
arg_type
,
lda
,
a_stride
,
beta_v
,
to_pointer
(
args
[
2
]),
output_type
,
ldc
,
c_stride
,
is_3inputs
?
to_pointer
(
args
[
3
])
:
to_pointer
(
args
[
2
]),
output_type
,
ldd
,
d_stride
,
num_matrices
,
compute_type
,
rocblas_gemm_algo_standard
,
0
,
flag
);
}
}
});
if
(
check_valid
==
rocblas_status_invalid_value
)
{
std
::
cerr
<<
"WARNING: tuned solution is invalid; reverting to default"
<<
std
::
endl
;
return
0
;
}
return
solution_idx
;
}
#endif
/**
* Helper method to create that subset of a long rocBLAS argument list that is common
* to multiple "...strided_batched..." calls.
*
* The rocblas_gemm API handles inputs and output matrices as
* column-major format. When doing a C = A * B, we actually do
* C^T = (B^T) * (A^T). That is the reason we input args[1] as
* A and args[0] as B in calling the rocblas_gemm.
*
*/
auto
create_strided_batched_args_common
(
context
&
ctx
,
const
std
::
vector
<
argument
>&
args
)
const
{
return
pack
(
ctx
.
get_stream
().
get_rocblas
(),
transb
?
rocblas_operation_transpose
:
rocblas_operation_none
,
transa
?
rocblas_operation_transpose
:
rocblas_operation_none
,
n
,
m
,
k
,
get_alpha
(),
args
[
1
].
data
(),
arg_type
,
ldb
,
b_stride
,
args
[
0
].
data
(),
arg_type
,
lda
,
a_stride
,
get_beta
(),
args
[
2
].
data
(),
output_type
,
ldc
,
c_stride
,
is_3inputs
?
args
[
3
].
data
()
:
args
[
2
].
data
(),
output_type
,
ldd
,
d_stride
,
num_matrices
,
compute_type
);
}
/**
* Helper method to create that subset of a long rocBLAS argument list that is common
* to multiple "gemm_ex..." calls.
*
* The rocblas_gemm API handles inputs and output matrices as
* column-major format. When doing a C = A * B, we actually do
* C^T = (B^T) * (A^T). That is the reason we input args[1] as
* A and args[0] as B in calling the rocblas_gemm.
*
* */
auto
create_gemm_ex_args_common
(
context
&
ctx
,
const
std
::
vector
<
argument
>&
args
)
const
{
return
pack
(
ctx
.
get_stream
().
get_rocblas
(),
transb
?
rocblas_operation_transpose
:
rocblas_operation_none
,
transa
?
rocblas_operation_transpose
:
rocblas_operation_none
,
n
,
m
,
k
,
get_alpha
(),
args
[
1
].
data
(),
arg_type
,
ldb
,
args
[
0
].
data
(),
arg_type
,
lda
,
get_beta
(),
args
[
2
].
data
(),
output_type
,
ldc
,
is_3inputs
?
args
[
3
].
data
()
:
args
[
2
].
data
(),
output_type
,
ldd
,
compute_type
);
}
#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
/**
* Find best rocBLAS solution: Get list of solutions and try them all, returning the index
* of the fastest one.
*/
int
tune
(
context
&
ctx
,
const
std
::
vector
<
shape
>&
input_shapes
)
const
{
// tuning meta parameters
const
int
hot_calls
=
40
;
std
::
vector
<
argument
>
input_args
;
std
::
transform
(
input_shapes
.
begin
(),
input_shapes
.
end
(),
std
::
back_inserter
(
input_args
),
[](
const
shape
&
x
)
{
return
to_gpu
(
generate_argument
(
x
));
});
// Get the solutions list in 2 rocBLAS steps:
// 1. Find out how many solutions there are and allocate the array
// 2. Get the solutions
//
rocblas_int
list_size
=
0
;
std
::
vector
<
rocblas_int
>
solution_indices
;
if
(
strided_batched
)
{
auto
common_args
=
create_strided_batched_args_common
(
ctx
,
input_args
);
rocblas_invoke
(
&
rocblas_gemm_strided_batched_ex_get_solutions
,
common_args
,
rocblas_gemm_algo_solution_index
,
gemm_flags
,
nullptr
,
&
list_size
);
solution_indices
.
resize
(
list_size
);
auto
common_sol_args
=
create_strided_batched_args_common
(
ctx
,
input_args
);
rocblas_invoke
(
&
rocblas_gemm_strided_batched_ex_get_solutions
,
common_sol_args
,
rocblas_gemm_algo_solution_index
,
gemm_flags
,
solution_indices
.
data
(),
&
list_size
);
}
else
{
auto
common_args
=
create_gemm_ex_args_common
(
ctx
,
input_args
);
rocblas_invoke
(
&
rocblas_gemm_ex_get_solutions
,
common_args
,
rocblas_gemm_algo_solution_index
,
gemm_flags
,
nullptr
,
&
list_size
);
solution_indices
.
resize
(
list_size
);
auto
common_sol_args
=
create_gemm_ex_args_common
(
ctx
,
input_args
);
rocblas_invoke
(
&
rocblas_gemm_ex_get_solutions
,
common_sol_args
,
rocblas_gemm_algo_solution_index
,
gemm_flags
,
solution_indices
.
data
(),
&
list_size
);
}
double
best_time
=
std
::
numeric_limits
<
double
>::
max
();
double
first_time
=
-
1
;
// Initialize to default solution index
rocblas_int
best_sol
=
0
;
for
(
auto
sol
:
solution_indices
)
{
// Warmup: the first call to an op. may not be representative since there is
// more time taken initializing caches, etc. so we won't time it.
run
(
ctx
,
input_args
,
sol
);
double
host_time
=
time
<
milliseconds
>
([
&
]
{
for
([[
maybe_unused
]]
int
hc
:
range
(
hot_calls
))
run
(
ctx
,
input_args
,
sol
);
ctx
.
finish
();
});
host_time
/=
hot_calls
;
// dev/evaluation only: track time for first solution.
if
(
first_time
<
0
)
first_time
=
host_time
;
// track current best
if
(
host_time
<
best_time
)
{
best_sol
=
sol
;
best_time
=
host_time
;
}
}
std
::
cout
<<
"Winning GEMM solution: "
<<
best_sol
<<
" in "
<<
best_time
<<
" ms, beats "
<<
first_time
<<
"ms"
<<
std
::
endl
;
return
best_sol
;
}
#endif
private:
size_t
num_matrices
=
0
;
rocblas_int
m
=
0
;
rocblas_int
n
=
0
;
rocblas_int
k
=
0
;
bool
transa
=
false
;
bool
transb
=
false
;
T
alpha
=
0
;
T
beta
=
0
;
std
::
function
<
const
void
*
()
>
get_alpha
{};
std
::
function
<
const
void
*
()
>
get_beta
{};
rocblas_gemm_flags
gemm_flags
=
rocblas_gemm_flags_none
;
rocblas_int
lda
=
0
;
rocblas_int
ldb
=
0
;
rocblas_int
ldc
=
0
;
rocblas_int
ldd
=
0
;
rocblas_int
a_stride
=
0
;
rocblas_int
b_stride
=
0
;
rocblas_int
c_stride
=
0
;
rocblas_int
d_stride
=
0
;
rocblas_datatype
compute_type
=
rocblas_datatype_f32_r
;
rocblas_datatype
arg_type
=
rocblas_datatype_f32_r
;
rocblas_datatype
output_type
=
rocblas_datatype_f32_r
;
bool
strided_batched
=
true
;
bool
is_3inputs
=
true
;
bool
compute_fp32
=
true
;
};
// gemm_impl
void
gemm_compute
(
context
&
ctx
,
const
shape
&
output_shape
,
const
std
::
vector
<
argument
>&
args
,
float
alpha
,
float
beta
,
bool
compute_fp32
,
int32_t
solution_idx
)
{
std
::
vector
<
shape
>
input_shapes
;
std
::
transform
(
args
.
begin
(),
args
.
end
(),
std
::
back_inserter
(
input_shapes
),
[](
const
argument
&
x
)
{
return
x
.
get_shape
();
});
auto
gemm_item
=
gemm_impl
<
float
>
(
output_shape
,
input_shapes
,
alpha
,
beta
,
compute_fp32
);
gemm_item
.
run
(
ctx
,
args
,
solution_idx
);
}
void
gemm_compute
(
context
&
ctx
,
const
shape
&
output_shape
,
const
std
::
vector
<
argument
>&
args
,
int32_t
alpha
,
int32_t
beta
,
bool
compute_fp32
,
int32_t
solution_idx
)
{
std
::
vector
<
shape
>
input_shapes
;
std
::
transform
(
args
.
begin
(),
args
.
end
(),
std
::
back_inserter
(
input_shapes
),
[](
const
argument
&
x
)
{
return
x
.
get_shape
();
});
auto
gemm_item
=
gemm_impl
<
int32_t
>
(
output_shape
,
input_shapes
,
alpha
,
beta
,
compute_fp32
);
gemm_item
.
run
(
ctx
,
args
,
solution_idx
);
}
}
void
gemm
(
context
&
ctx
,
/**
const
shape
&
output_shape
,
* Decides if the tune() or validate() method is appropriate and calls it.
const
std
::
vector
<
argument
>&
args
,
* Return value is the chosen solution index, or 0 to let picker choose it.
float
alpha
,
*/
float
beta
,
int32_t
gemm_finalize
(
context
&
ctx
,
bool
int8_x4_format
,
const
shape
&
output_shape
,
bool
compute_fp32
)
const
std
::
vector
<
shape
>&
input_shapes
,
float
alpha
,
float
beta
,
bool
compute_fp32
,
int32_t
solution_idx
)
{
{
gemm_impl
(
ctx
,
output_shape
,
args
,
alpha
,
beta
,
int8_x4_format
,
compute_fp32
);
#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
// This code should be called only if either the environment var.
// MIGRAPHX_ENABLE_GEMM_TUNING, or option --exhaustive-tune, is set
if
(
solution_idx
==
0
)
{
auto
gemm_item
=
gemm_impl
<
float
>
(
output_shape
,
input_shapes
,
alpha
,
beta
,
compute_fp32
);
solution_idx
=
gemm_item
.
tune
(
ctx
,
input_shapes
);
}
else
{
// If a tuned solution index is already given, don't tune again but validate
// in case the data was tuned with a different rocBLAS version
auto
gemm_item
=
gemm_impl
<
float
>
(
output_shape
,
input_shapes
,
alpha
,
beta
,
compute_fp32
);
solution_idx
=
gemm_item
.
validate
(
ctx
,
input_shapes
,
solution_idx
);
}
#else
(
void
)
ctx
,
(
void
)
output_shape
,
(
void
)
input_shapes
;
(
void
)
alpha
,
(
void
)
beta
,
(
void
)
compute_fp32
;
#endif
return
solution_idx
;
}
}
void
gemm
(
context
&
ctx
,
/**
const
shape
&
output_shape
,
* Decides if the tune() or validate() method is appropriate and calls it.
const
std
::
vector
<
argument
>&
args
,
* Return value is the chosen solution index, or 0 to let picker choose it.
int32_t
alpha
,
*/
int32_t
beta
,
int32_t
gemm_finalize
(
context
&
ctx
,
bool
int8_x4_format
,
const
shape
&
output_shape
,
bool
compute_fp32
)
const
std
::
vector
<
shape
>&
input_shapes
,
int32_t
alpha
,
int32_t
beta
,
bool
compute_fp32
,
int32_t
solution_idx
)
{
{
gemm_impl
(
ctx
,
output_shape
,
args
,
alpha
,
beta
,
int8_x4_format
,
compute_fp32
);
#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
if
(
solution_idx
==
0
)
{
auto
gemm_item
=
gemm_impl
<
int32_t
>
(
output_shape
,
input_shapes
,
alpha
,
beta
,
compute_fp32
);
solution_idx
=
gemm_item
.
tune
(
ctx
,
input_shapes
);
}
else
{
// If a tuned solution index is already given, don't tune again but validate
// in case the data was tuned with a different rocBLAS version
auto
gemm_item
=
gemm_impl
<
int32_t
>
(
output_shape
,
input_shapes
,
alpha
,
beta
,
compute_fp32
);
solution_idx
=
gemm_item
.
validate
(
ctx
,
input_shapes
,
solution_idx
);
}
#else
(
void
)
ctx
,
(
void
)
output_shape
,
(
void
)
input_shapes
;
(
void
)
alpha
,
(
void
)
beta
,
(
void
)
compute_fp32
;
#endif
return
solution_idx
;
}
}
}
// namespace gpu
}
// namespace gpu
...
...
src/targets/gpu/include/migraphx/gpu/compile_miopen.hpp
View file @
0662a9a3
...
@@ -42,7 +42,7 @@ struct compile_miopen
...
@@ -42,7 +42,7 @@ struct compile_miopen
context
*
ctx
=
nullptr
;
context
*
ctx
=
nullptr
;
std
::
string
name
()
const
{
return
"gpu::compile_miopen"
;
}
std
::
string
name
()
const
{
return
"gpu::compile_miopen"
;
}
void
apply
(
module
&
m
)
const
;
void
apply
(
module
&
m
)
const
;
std
::
size_t
compile
(
operation
&
op
,
instruction_ref
ins
,
bool
format
)
const
;
std
::
size_t
compile
(
operation
&
op
,
instruction_ref
ins
)
const
;
};
};
}
// namespace gpu
}
// namespace gpu
...
...
src/targets/gpu/include/migraphx/gpu/convolution.hpp
View file @
0662a9a3
...
@@ -57,7 +57,6 @@ template <class Op>
...
@@ -57,7 +57,6 @@ template <class Op>
struct
miopen_convolution
struct
miopen_convolution
{
{
Op
op
;
Op
op
;
bool
int8_x4_format
=
false
;
shared
<
convolution_descriptor
>
cd
=
nullptr
;
shared
<
convolution_descriptor
>
cd
=
nullptr
;
miopenConvFwdAlgorithm_t
algo
{};
miopenConvFwdAlgorithm_t
algo
{};
#ifdef MIGRAPHX_HAS_FIND_2_API
#ifdef MIGRAPHX_HAS_FIND_2_API
...
@@ -74,7 +73,6 @@ struct miopen_convolution
...
@@ -74,7 +73,6 @@ struct miopen_convolution
f
(
self
.
solution_object
,
"solution_object"
),
f
(
self
.
solution_object
,
"solution_object"
),
#endif
#endif
f
(
self
.
algo
,
"algo"
),
f
(
self
.
algo
,
"algo"
),
f
(
self
.
int8_x4_format
,
"int8_x4_format"
),
f
(
self
.
solution_id
,
"solution_id"
));
f
(
self
.
solution_id
,
"solution_id"
));
}
}
...
@@ -94,9 +92,9 @@ struct miopen_convolution
...
@@ -94,9 +92,9 @@ struct miopen_convolution
argument
argument
compute
(
context
&
ctx
,
const
shape
&
output_shape
,
const
std
::
vector
<
argument
>&
args
)
const
compute
(
context
&
ctx
,
const
shape
&
output_shape
,
const
std
::
vector
<
argument
>&
args
)
const
{
{
auto
x_desc
=
make_tensor
(
reshape_if_1d
(
args
[
0
].
get_shape
())
,
int8_x4_format
);
auto
x_desc
=
make_tensor
(
reshape_if_1d
(
args
[
0
].
get_shape
()));
auto
w_desc
=
make_tensor
(
reshape_if_1d
(
args
[
1
].
get_shape
())
,
int8_x4_format
);
auto
w_desc
=
make_tensor
(
reshape_if_1d
(
args
[
1
].
get_shape
()));
auto
y_desc
=
make_tensor
(
reshape_if_1d
(
output_shape
));
auto
y_desc
=
make_tensor
(
reshape_if_1d
(
output_shape
));
auto
*
miopen_stream_handle
=
ctx
.
get_stream
().
get_miopen
();
auto
*
miopen_stream_handle
=
ctx
.
get_stream
().
get_miopen
();
auto
workspace_size
=
args
[
2
].
get_shape
().
bytes
();
auto
workspace_size
=
args
[
2
].
get_shape
().
bytes
();
...
@@ -162,8 +160,8 @@ struct miopen_convolution
...
@@ -162,8 +160,8 @@ struct miopen_convolution
shape
find
(
context
&
ctx
,
const
shape
&
output_shape
,
const
std
::
vector
<
shape
>&
inputs
)
shape
find
(
context
&
ctx
,
const
shape
&
output_shape
,
const
std
::
vector
<
shape
>&
inputs
)
{
{
shape
workspace_shape
{};
shape
workspace_shape
{};
auto
x_desc
=
make_tensor
(
reshape_if_1d
(
inputs
[
0
])
,
int8_x4_format
);
auto
x_desc
=
make_tensor
(
reshape_if_1d
(
inputs
[
0
]));
auto
w_desc
=
make_tensor
(
reshape_if_1d
(
inputs
[
1
])
,
int8_x4_format
);
auto
w_desc
=
make_tensor
(
reshape_if_1d
(
inputs
[
1
]));
auto
y_desc
=
make_tensor
(
reshape_if_1d
(
output_shape
));
auto
y_desc
=
make_tensor
(
reshape_if_1d
(
output_shape
));
auto
*
miopen_stream_handle
=
ctx
.
get_stream
().
get_miopen
();
auto
*
miopen_stream_handle
=
ctx
.
get_stream
().
get_miopen
();
...
@@ -179,13 +177,8 @@ struct miopen_convolution
...
@@ -179,13 +177,8 @@ struct miopen_convolution
workspace_shape
=
shape
{
shape
::
int8_type
,
{
workspace_size
}};
workspace_shape
=
shape
{
shape
::
int8_type
,
{
workspace_size
}};
auto
x_shape
=
inputs
[
0
];
const
auto
&
x_shape
=
inputs
[
0
];
auto
w_shape
=
inputs
[
1
];
const
auto
&
w_shape
=
inputs
[
1
];
if
(
int8_x4_format
)
{
x_shape
=
pack_int8_shape
(
x_shape
);
w_shape
=
pack_int8_shape
(
w_shape
);
}
#ifdef MIGRAPHX_HAS_FIND_2_API
#ifdef MIGRAPHX_HAS_FIND_2_API
{
{
...
@@ -327,8 +320,8 @@ struct miopen_convolution
...
@@ -327,8 +320,8 @@ struct miopen_convolution
": workspace has changed during finalization."
);
": workspace has changed during finalization."
);
}
}
auto
x_desc
=
make_tensor
(
reshape_if_1d
(
inputs
[
0
])
,
int8_x4_format
);
auto
x_desc
=
make_tensor
(
reshape_if_1d
(
inputs
[
0
]));
auto
w_desc
=
make_tensor
(
reshape_if_1d
(
inputs
[
1
])
,
int8_x4_format
);
auto
w_desc
=
make_tensor
(
reshape_if_1d
(
inputs
[
1
]));
auto
y_desc
=
make_tensor
(
reshape_if_1d
(
output_shape
));
auto
y_desc
=
make_tensor
(
reshape_if_1d
(
output_shape
));
auto
status
=
miopenConvolutionForwardCompileSolution
(
ctx
.
get_stream
().
get_miopen
(),
auto
status
=
miopenConvolutionForwardCompileSolution
(
ctx
.
get_stream
().
get_miopen
(),
...
@@ -347,21 +340,6 @@ struct miopen_convolution
...
@@ -347,21 +340,6 @@ struct miopen_convolution
{
{
return
shapes
.
size
()
-
1
;
return
shapes
.
size
()
-
1
;
}
}
inline
shape
pack_int8_shape
(
const
shape
&
s
)
const
{
if
(
s
.
type
()
!=
shape
::
int8_type
)
{
return
s
;
}
auto
lens
=
s
.
lens
();
auto
strides
=
s
.
strides
();
lens
[
1
]
=
(
lens
[
1
]
+
3
)
/
4
*
4
;
strides
[
0
]
=
strides
[
1
]
*
lens
[
1
];
return
{
s
.
type
(),
lens
,
strides
};
}
};
};
}
// namespace gpu
}
// namespace gpu
...
...
src/targets/gpu/include/migraphx/gpu/device/int8_gemm_pack.hpp
deleted
100644 → 0
View file @
b74d3a8f
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_INT8_GEMM_PACK_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_INT8_GEMM_PACK_HPP
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/config.hpp>
#include <hip/hip_runtime_api.h>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
namespace
device
{
void
MIGRAPHX_DEVICE_EXPORT
int8_gemm_pack_a
(
hipStream_t
stream
,
const
argument
&
result
,
const
argument
&
arg
);
void
MIGRAPHX_DEVICE_EXPORT
int8_gemm_pack_b
(
hipStream_t
stream
,
const
argument
&
result
,
const
argument
&
arg
);
}
// namespace device
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
#endif
src/targets/gpu/include/migraphx/gpu/gemm.hpp
View file @
0662a9a3
/*
/*
* The MIT License (MIT)
* The MIT License (MIT)
*
*
* Copyright (c) 2015-202
2
Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-202
3
Advanced Micro Devices, Inc. All rights reserved.
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* of this software and associated documentation files (the "Software"), to deal
...
@@ -40,9 +40,8 @@ inline namespace MIGRAPHX_INLINE_NS {
...
@@ -40,9 +40,8 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace
gpu
{
namespace
gpu
{
struct
context
;
struct
context
;
void
blas_shape
(
const
shape
&
s
);
shape
transpose_batch
(
const
shape
&
s
,
unsigned
trans_batch
);
shape
transpose_batch
(
const
shape
&
s
,
unsigned
trans_batch
);
void
blas_shape
(
const
shape
&
s
);
template
<
class
Op
>
template
<
class
Op
>
struct
rocblas_gemm
struct
rocblas_gemm
...
@@ -50,9 +49,9 @@ struct rocblas_gemm
...
@@ -50,9 +49,9 @@ struct rocblas_gemm
Op
op
;
Op
op
;
float
alpha
=
1
;
float
alpha
=
1
;
float
beta
=
0
;
float
beta
=
0
;
bool
int8_x4_format
=
true
;
bool
compute_fp32
=
false
;
bool
compute_fp32
=
false
;
unsigned
trans_batch
=
0
;
unsigned
trans_batch
=
0
;
int32_t
solution_idx
=
0
;
template
<
class
Self
,
class
F
>
template
<
class
Self
,
class
F
>
static
auto
reflect
(
Self
&
self
,
F
f
)
static
auto
reflect
(
Self
&
self
,
F
f
)
...
@@ -60,9 +59,9 @@ struct rocblas_gemm
...
@@ -60,9 +59,9 @@ struct rocblas_gemm
return
pack_join
(
migraphx
::
reflect
(
self
.
op
,
f
),
return
pack_join
(
migraphx
::
reflect
(
self
.
op
,
f
),
pack
(
f
(
self
.
alpha
,
"alpha"
),
pack
(
f
(
self
.
alpha
,
"alpha"
),
f
(
self
.
beta
,
"beta"
),
f
(
self
.
beta
,
"beta"
),
f
(
self
.
int8_x4_format
,
"int8_x4_format"
),
f
(
self
.
compute_fp32
,
"compute_fp32"
),
f
(
self
.
compute_fp32
,
"compute_fp32"
),
f
(
self
.
trans_batch
,
"trans_batch"
)));
f
(
self
.
trans_batch
,
"trans_batch"
),
f
(
self
.
solution_idx
,
"solution_idx"
)));
}
}
std
::
string
name
()
const
std
::
string
name
()
const
...
@@ -78,6 +77,8 @@ struct rocblas_gemm
...
@@ -78,6 +77,8 @@ struct rocblas_gemm
{
{
std
::
vector
<
shape
>
in_shapes
(
inputs
);
std
::
vector
<
shape
>
in_shapes
(
inputs
);
in_shapes
.
pop_back
();
in_shapes
.
pop_back
();
// When input shapes are A, B, C the GEMM equation is C = α AB+ β C where α, β are
// scalars
check_shapes
{
in_shapes
,
*
this
}.
has
(
2
,
3
);
check_shapes
{
in_shapes
,
*
this
}.
has
(
2
,
3
);
blas_shape
(
inputs
[
0
]);
blas_shape
(
inputs
[
0
]);
blas_shape
(
inputs
[
1
]);
blas_shape
(
inputs
[
1
]);
...
@@ -113,17 +114,12 @@ struct rocblas_gemm
...
@@ -113,17 +114,12 @@ struct rocblas_gemm
{
{
if
(
this
->
name
()
==
"gpu::gemm"
)
if
(
this
->
name
()
==
"gpu::gemm"
)
{
{
gemm
(
ctx
,
output_shape
,
args
,
alpha
,
beta
,
int8_x4_format
,
compute_fp32
);
gemm
_compute
(
ctx
,
output_shape
,
args
,
alpha
,
beta
,
compute_fp32
,
solution_idx
);
}
}
else
else
{
{
gemm
(
ctx
,
gemm_compute
(
output_shape
,
ctx
,
output_shape
,
args
,
int32_t
(
alpha
),
int32_t
(
beta
),
compute_fp32
,
solution_idx
);
args
,
int32_t
(
alpha
),
int32_t
(
beta
),
int8_x4_format
,
compute_fp32
);
}
}
return
args
.
back
();
return
args
.
back
();
}
}
...
@@ -132,6 +128,33 @@ struct rocblas_gemm
...
@@ -132,6 +128,33 @@ struct rocblas_gemm
{
{
return
shapes
.
size
()
-
1
;
return
shapes
.
size
()
-
1
;
}
}
void
finalize
(
context
&
ctx
,
const
shape
&
output_shape
,
const
std
::
vector
<
shape
>&
input_shapes
)
{
#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
if
(
enabled
(
MIGRAPHX_ENABLE_GEMM_TUNING
{})
or
ctx
.
get_exhaustive_tune_flag
())
{
if
(
this
->
name
()
==
"gpu::gemm"
)
{
solution_idx
=
gemm_finalize
(
ctx
,
output_shape
,
input_shapes
,
alpha
,
beta
,
compute_fp32
,
solution_idx
);
}
else
{
solution_idx
=
gemm_finalize
(
ctx
,
output_shape
,
input_shapes
,
int32_t
(
alpha
),
int32_t
(
beta
),
compute_fp32
,
solution_idx
);
}
}
#else
// suppress compiler warnings
(
void
)
ctx
,
(
void
)
output_shape
,
(
void
)
input_shapes
;
#endif
}
};
};
}
// namespace gpu
}
// namespace gpu
...
...
Prev
1
2
3
4
5
6
7
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment