gaoqiong / MIGraphX · Commit 2f268bc2

Merge branch 'develop' into mlir-c

Authored Jun 12, 2022 by Paul
Parents: f75c5a38, aa7ff911

Changes: 205 changed files in total; this page (1 of 11) shows 20 changed files with 363 additions and 119 deletions (+363 −119).
Files on this page:

src/targets/gpu/device/include/migraphx/gpu/device/visit.hpp        +7 −1
src/targets/gpu/driver/run_op.cpp                                   +4 −2
src/targets/gpu/eliminate_workspace.cpp                             +5 −5
src/targets/gpu/fuse_ops.cpp                                        +111 −66
src/targets/gpu/gemm_impl.cpp                                       +38 −6
src/targets/gpu/hip.cpp                                             +16 −2
src/targets/gpu/include/migraphx/gpu/analyze_streams.hpp            +1 −1
src/targets/gpu/include/migraphx/gpu/code_object_op.hpp             +4 −0
src/targets/gpu/include/migraphx/gpu/compile_gen.hpp                +46 −0
src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp    +9 −0
src/targets/gpu/include/migraphx/gpu/eliminate_workspace.hpp        +1 −1
src/targets/gpu/include/migraphx/gpu/fuse_ops.hpp                   +1 −1
src/targets/gpu/include/migraphx/gpu/gemm.hpp                       +7 −25
src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp                +24 −0
src/targets/gpu/include/migraphx/gpu/quant_convolution.hpp          +4 −1
src/targets/gpu/include/migraphx/gpu/scatter.hpp                    +5 −3
src/targets/gpu/include/migraphx/gpu/schedule_model.hpp             +3 −3
src/targets/gpu/include/migraphx/gpu/sync_device.hpp                +1 −1
src/targets/gpu/include/migraphx/gpu/write_literals.hpp             +1 −1
src/targets/gpu/jit/gathernd.cpp                                    +75 −0
src/targets/gpu/device/include/migraphx/gpu/device/visit.hpp

```diff
@@ -176,7 +176,13 @@ template <index_int N, class T, class... Ts>
 auto hip_vec_visit_all(T&& x, Ts&&... xs)
 {
     return [&](auto f) {
-        hip_visit_all_impl(get_shape(x),
+        auto sx   = get_shape(x);
+        auto lens = sx.lens();
+        assert(lens.back() % N == 0);
+        assert(sx.strides().back() == 1);
+        lens.back() /= N;
+        shape vec_sx{sx.type(), lens};
+        hip_visit_all_impl(vec_sx,
                            make_hip_convert([](auto* p) { return as_vec<N>(device_cast(p)); }),
                            f,
                            x,
```
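The new lines compute an explicitly vectorized shape before visiting: the innermost dimension must be contiguous (stride 1) and divide evenly by the vector width N, and is then shrunk by that factor. A minimal standalone sketch of the same shape arithmetic (illustrative helper, not part of this commit; MIGraphX's real shape class is not used here):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Divide the innermost dimension by the vector width n, mirroring the
// precondition checks added in hip_vec_visit_all above.
std::vector<std::size_t> vectorized_lens(std::vector<std::size_t> lens,
                                         std::size_t inner_stride,
                                         std::size_t n)
{
    assert(inner_stride == 1);    // data must be contiguous in the last axis
    assert(lens.back() % n == 0); // last axis must split evenly into vectors
    lens.back() /= n;             // e.g. {2, 8} with n = 4 becomes {2, 2}
    return lens;
}
```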
src/targets/gpu/driver/run_op.cpp

```diff
@@ -17,8 +17,10 @@ struct run_op : action<run_op>
         auto name = v.at("name").to<std::string>();
         if(not contains(name, "::"))
             name = "gpu::" + name;
         auto op = make_op(name);
-        double t = time_op(ctx, op, inputs);
+        if(v.contains("fields"))
+            op.from_value(v.at("fields"));
+        double t = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
         std::cout << op << ": " << t << "ms" << std::endl;
     }
 };
```
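The driver change applies optional op fields via from_value and reads an iteration count that defaults to 100. A hedged sketch of that lookup-with-default pattern (illustrative types only; the real driver parses a migraphx::value, not a std::map):

```cpp
#include <cstdint>
#include <map>
#include <string>

// Illustrative stand-in for the driver's value lookup: return the stored
// integer for `key` if present, otherwise fall back to `fallback`.
std::int64_t get_or_default(const std::map<std::string, std::int64_t>& v,
                            const std::string& key,
                            std::int64_t fallback)
{
    auto it = v.find(key);
    return it == v.end() ? fallback : it->second;
}
```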
src/targets/gpu/eliminate_workspace.cpp

```diff
@@ -11,11 +11,11 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

-void eliminate_workspace::apply(module& p) const
+void eliminate_workspace::apply(module& m) const
 {
     std::size_t n = 0;
     std::vector<instruction_ref> allocs;
-    for(auto ins : iterator_for(p))
+    for(auto ins : iterator_for(m))
     {
         if(ins->outputs().size() != 1)
             continue;
@@ -30,11 +30,11 @@ void eliminate_workspace::apply(module& p) const
     }
     if(n > 0)
     {
-        auto ws = p.add_parameter("workspace", shape{shape::int8_type, {n}});
+        auto ws = m.add_parameter("workspace", shape{shape::int8_type, {n}});
         for(auto&& a : allocs)
         {
-            p.replace_instruction(a, ws);
-            p.remove_instruction(a);
+            m.replace_instruction(a, ws);
+            m.remove_instruction(a);
         }
     }
 }
```
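The pass funnels every workspace allocation into one shared int8 "workspace" parameter. The hunk body elided between lines 14 and 29 is assumed here to size n as the largest single request; a sketch under that assumption (not part of this commit):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Assumed sizing rule for the elided loop body: one shared workspace must be
// at least as large as the largest single allocation it replaces.
std::size_t shared_workspace_bytes(const std::vector<std::size_t>& requests)
{
    std::size_t n = 0;
    for(auto bytes : requests)
        n = std::max(n, bytes);
    return n;
}
```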
src/targets/gpu/fuse_ops.cpp

```diff
@@ -317,7 +317,7 @@ struct find_layernorm
 {
     auto matcher() const { return match::layernorm(&gpu_name); }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto ins   = r.result;
         auto x_ins = r.instructions["x"];
@@ -332,7 +332,7 @@ struct find_layernorm
         if(relements > 1024 or (relements % 4 != 0 and relements > 256))
             return;

-        p.replace_instruction(ins, hip_layernorm{}, x_ins, args.back());
+        m.replace_instruction(ins, hip_layernorm{}, x_ins, args.back());
     }
 };
@@ -344,11 +344,11 @@ struct find_triadd_layernorm
             match::used_once(),
             match::all_of[match::inputs()](match::standard_shape()))));
     }

-    void apply(module& p, const match::matcher_result& r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto ins    = r.result;
         auto triadd = ins->inputs().front();
-        p.replace_instruction(ins, hip_triadd_layernorm{}, triadd->inputs());
+        m.replace_instruction(ins, hip_triadd_layernorm{}, triadd->inputs());
     }
 };
@@ -356,13 +356,13 @@ struct find_gelu
 {
     auto matcher() const { return match::gelu_erf(&gpu_name); }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto ins   = r.result;
         auto x_ins = r.instructions["x"];
         auto args  = ins->inputs();

-        p.replace_instruction(ins, hip_gelu{}, x_ins, args.back());
+        m.replace_instruction(ins, hip_gelu{}, x_ins, args.back());
     }
 };
@@ -373,7 +373,7 @@ struct find_add_gelu
         return match::name("gpu::gelu")(match::arg(0)(match::name("gpu::add").bind("add")));
     }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto add_ins = r.instructions["add"];
         auto ins     = r.result;
@@ -382,7 +382,7 @@ struct find_add_gelu
         move_broadcasted_back(args);
         args.back() = ins->inputs().back();

-        p.replace_instruction(ins, hip_add_gelu{}, args);
+        m.replace_instruction(ins, hip_add_gelu{}, args);
     }
 };
@@ -392,16 +392,16 @@ struct find_gelu_new
     auto matcher() const { return match::gelu_tanh(&gpu_name); }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto ins   = r.result;
         auto x_ins = r.instructions["x"];
         auto args  = ins->inputs();

         if(fast_math)
-            p.replace_instruction(ins, hip_gelu{}, x_ins, args.back());
+            m.replace_instruction(ins, hip_gelu{}, x_ins, args.back());
         else
-            p.replace_instruction(ins, hip_gelu_new{}, x_ins, args.back());
+            m.replace_instruction(ins, hip_gelu_new{}, x_ins, args.back());
     }
 };
@@ -412,7 +412,7 @@ struct find_add_gelu_new
         return match::name("gpu::gelu_new")(match::arg(0)(match::name("gpu::add").bind("add")));
     }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto add_ins = r.instructions["add"];
         auto ins     = r.result;
@@ -421,7 +421,7 @@ struct find_add_gelu_new
         move_broadcasted_back(args);
         args.back() = ins->inputs().back();

-        p.replace_instruction(ins, hip_add_gelu_new{}, args);
+        m.replace_instruction(ins, hip_add_gelu_new{}, args);
     }
 };
@@ -436,7 +436,7 @@ struct find_add_clip
                        .bind("add")));
     }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto add_ins = r.instructions["add"];
         auto ins     = r.result;
@@ -449,9 +449,9 @@ struct find_add_clip
         add_args.pop_back();
         add_args.insert(add_args.end(), std::next(ins_args.begin()), ins_args.end());
         if(add_ins->name() == "gpu::add")
-            p.replace_instruction(ins, hip_add_clip{}, add_args);
+            m.replace_instruction(ins, hip_add_clip{}, add_args);
         else if(add_ins->name() == "gpu::triadd")
-            p.replace_instruction(ins, hip_triadd_clip{}, add_args);
+            m.replace_instruction(ins, hip_triadd_clip{}, add_args);
     }
 };
@@ -471,7 +471,7 @@ struct find_add_unary
                        .bind("add")));
     }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto add_ins = r.instructions["add"];
         auto ins     = r.result;
@@ -482,9 +482,9 @@ struct find_add_unary
         // Use the allocation from the relu operator
         args.back() = ins->inputs().back();
         if(add_ins->name() == "gpu::add")
-            p.replace_instruction(ins, binary_add_op, args);
+            m.replace_instruction(ins, binary_add_op, args);
         else if(add_ins->name() == "gpu::triadd")
-            p.replace_instruction(ins, ternary_add_op, args);
+            m.replace_instruction(ins, ternary_add_op, args);
     }
 };
@@ -499,7 +499,7 @@ struct find_triadd
                        .bind("input")));
     }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto add_ins   = r.instructions["add"];
         auto input_ins = r.instructions["input"];
@@ -514,7 +514,7 @@ struct find_triadd
         move_broadcasted_back(args);
         args.back() = ins->inputs().back();
-        p.replace_instruction(ins, hip_triadd{}, args);
+        m.replace_instruction(ins, hip_triadd{}, args);
     }
 };
@@ -526,7 +526,7 @@ struct find_mul_add
             match::name("gpu::mul")(match::used_once()).bind("mul"), match::any().bind("b")));
     }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto mul_ins = r.instructions["mul"];
         auto b_ins   = r.instructions["b"];
@@ -539,7 +539,7 @@ struct find_mul_add
         args.insert(std::prev(args.end()), b_ins);
         args.back() = ins->inputs().back();
-        p.replace_instruction(ins, hip_mul_add{}, args);
+        m.replace_instruction(ins, hip_mul_add{}, args);
     }
 };
@@ -551,7 +551,7 @@ struct find_mul_add_relu
             match::arg(0)(match::name("gpu::mul_add")(match::used_once()).bind("mul_add")));
     }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto mul_add_ins = r.instructions["mul_add"];
         auto ins         = r.result;
@@ -559,7 +559,7 @@ struct find_mul_add_relu
         // Use the allocation from the relu operator
         args.back() = ins->inputs().back();
-        p.replace_instruction(ins, hip_mul_add_relu{}, args);
+        m.replace_instruction(ins, hip_mul_add_relu{}, args);
     }
 };
@@ -682,7 +682,7 @@ struct miopen_fusion
 struct miopen_conv_bias
 {
     op::convolution op;
-    fusion f          = {};
+    fusion fp         = {};
     fusion::op_t conv = {};
     fusion::op_t bias = {};
@@ -706,19 +706,19 @@ struct miopen_conv_bias
         float beta  = 0;
         miopenSetOpArgsConvForward(fargs.get(), conv, &alpha, &beta, args[1].implicit());
         miopenSetOpArgsBiasForward(fargs.get(), bias, &alpha, &beta, args[3].implicit());
-        return f.execute(ctx, fargs, args[0], args[4]);
+        return fp.execute(ctx, fargs, args[0], args[4]);
     }
     void finalize(context& ctx, const shape&, const std::vector<shape>& inputs)
     {
-        f    = fusion(inputs[0]);
-        conv = f.create_conv(op, inputs[1]);
-        bias = f.create_bias(inputs[3]);
-        if(not f.compile(ctx))
+        fp   = fusion(inputs[0]);
+        conv = fp.create_conv(op, inputs[1]);
+        bias = fp.create_bias(inputs[3]);
+        if(not fp.compile(ctx))
             MIGRAPHX_THROW("Failed to compile fusion plan");
     }
-    shape get_workspace(context& ctx) { return f.get_workspace(ctx); }
+    shape get_workspace(context& ctx) { return fp.get_workspace(ctx); }
     std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
     {
         return shapes.size() - 1;
@@ -729,7 +729,7 @@ MIGRAPHX_REGISTER_OP(miopen_conv_bias)
 struct miopen_conv_bias_relu
 {
     op::convolution op;
-    fusion f          = {};
+    fusion fp         = {};
     fusion::op_t conv = {};
     fusion::op_t bias = {};
     fusion::op_t relu = {};
@@ -755,18 +755,18 @@ struct miopen_conv_bias_relu
         miopenSetOpArgsConvForward(fargs.get(), conv, &alpha, &beta, args[1].implicit());
         miopenSetOpArgsBiasForward(fargs.get(), bias, &alpha, &beta, args[3].implicit());
         miopenSetOpArgsActivForward(fargs.get(), relu, &alpha, &beta, 0, 0, 0);
-        return f.execute(ctx, fargs, args[0], args[4]);
+        return fp.execute(ctx, fargs, args[0], args[4]);
     }
     void finalize(context& ctx, const shape&, const std::vector<shape>& inputs)
     {
-        f    = fusion(inputs[0]);
-        conv = f.create_conv(op, inputs[1]);
-        bias = f.create_bias(inputs[3]);
-        relu = f.create_relu();
-        f.compile(ctx);
+        fp   = fusion(inputs[0]);
+        conv = fp.create_conv(op, inputs[1]);
+        bias = fp.create_bias(inputs[3]);
+        relu = fp.create_relu();
+        fp.compile(ctx);
     }
-    shape get_workspace(context& ctx) { return f.get_workspace(ctx); }
+    shape get_workspace(context& ctx) { return fp.get_workspace(ctx); }
     std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
         return shapes.size() - 1;
@@ -784,7 +784,7 @@ auto conv_bias(Ms... ms)
 }

 template <class Op>
-void apply_conv_bias(context& ctx, module& p, match::matcher_result r)
+void apply_conv_bias(context& ctx, module& m, const match::matcher_result& r)
 {
     auto conv_ins = r.instructions["conv"];
     auto bias_ins = r.instructions["bias"];
@@ -799,7 +799,7 @@ void apply_conv_bias(context& ctx, module& p, match::matcher_result r)
     // TODO: Insert ws allocation
     auto ws = cb.get_workspace(ctx);
     (void)ws;
-    p.replace_instruction(ins, cb, input_ins, weights_ins, old_ws_ins, bias_ins, alloc_ins);
+    m.replace_instruction(ins, cb, input_ins, weights_ins, old_ws_ins, bias_ins, alloc_ins);
 }

 inline auto precompile_name(std::string s) // NOLINT
@@ -830,9 +830,9 @@ struct find_conv_bias
             match::output(match::name(std::unordered_set<std::string>{"gpu::relu"}))));
     }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
-        apply_conv_bias<miopen_conv_bias>(*ctx, p, std::move(r));
+        apply_conv_bias<miopen_conv_bias>(*ctx, m, r);
     }
 };
@@ -841,9 +841,9 @@ struct find_conv_bias_relu
     context* ctx = nullptr;
     auto matcher() const { return match::name("gpu::relu")(match::arg(0)(conv_bias())); }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
-        apply_conv_bias<miopen_conv_bias_relu>(*ctx, p, std::move(r));
+        apply_conv_bias<miopen_conv_bias_relu>(*ctx, m, r);
     }
 };
@@ -858,7 +858,7 @@ struct find_conv_pointwise
             fusable_conv(match::used_once()).bind("conv")));
     }

-    void apply(module& m, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto conv_ins = r.instructions["conv"];
         auto bias_ins = r.instructions["bias"];
@@ -876,7 +876,6 @@ struct find_conv_pointwise
         {
             if(i.name()[0] == '@')
                 continue;
-            auto inputs = to_shapes(i.inputs());
             op.ops.push_back({{i.get_operator()}});
         }
         std::vector<instruction_ref> inputs = {input_ins, weights_ins, bias_ins, alloc_ins};
@@ -897,7 +896,7 @@ struct find_gemm_add
             match::name("gpu::gemm")(match::nargs(3)).bind("gemm")));
     }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto ins      = r.result;
         auto gemm_ins = r.instructions["gemm"];
@@ -909,26 +908,68 @@ struct find_gemm_add
         if(not float_equal(gemm.beta, 0))
             return;

+        if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [](auto i) {
+               return not i->get_shape().standard();
+           }))
+            return;
+
         auto inputs = gemm_ins->inputs();
         inputs.pop_back();

         auto copy_ins = c_ins;

         // Insert copy
-        if(ins == p.end() or c_ins->outputs().size() > 1 or c_ins->inputs().empty())
+        if(ins == m.end() or c_ins->outputs().size() > 1 or c_ins->inputs().empty())
         {
-            copy_ins = p.insert_instruction(ins, hip_copy{}, c_ins, ins->inputs().back());
+            copy_ins = m.insert_instruction(ins, hip_copy{}, c_ins, ins->inputs().back());
         }

         inputs.push_back(copy_ins);
         inputs.push_back(copy_ins);

         gemm.beta = 1;
-        p.replace_instruction(ins, gemm, inputs);
+        m.replace_instruction(ins, gemm, inputs);
+    }
+};
+
+auto pointwise_name(const std::string& s)
+{
+    return precompile_name("pointwise")(match::make_basic_pred_matcher([=](auto ins) {
+        module_ref pm = ins->module_inputs().front();
+        auto n = std::count_if(pm->begin(), pm->end(), [&](auto& i) { return i.name() == s; });
+        if(n != 1)
+            return false;
+        return std::all_of(pm->begin(), pm->end(), [&](auto& i) {
+            return starts_with(i.name(), "@") or i.name() == s;
+        });
+    }));
+}
+
+struct find_gemm_pointwise
+{
+    auto matcher() const
+    {
+        return pointwise_name("add")(
+            match::nargs(3),
+            match::all_of[match::inputs()](match::standard_shape()),
+            match::either_arg(0, 1)(match::used_once().bind("c"),
+                                    match::name("gpu::gemm")(match::nargs(3)).bind("gemm")));
+    }
+
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto ins      = r.result;
+        auto gemm_ins = r.instructions["gemm"];
+        auto c_ins    = r.instructions["c"];
+
+        auto gemm = any_cast<rocblas_gemm<op::dot>>(gemm_ins->get_operator());
+
+        // Already fused gemm
+        if(not float_equal(gemm.beta, 0))
+            return;
+
+        auto inputs = gemm_ins->inputs();
+        inputs.pop_back();
+        inputs.push_back(c_ins);
+        inputs.push_back(ins->inputs().back());
+
+        gemm.beta = 1;
+        m.replace_instruction(ins, gemm, inputs);
     }
 };
@@ -939,23 +980,23 @@ struct find_commutative_broadcast
         return match::name("gpu::add", "gpu::mul")(match::arg(1)(match::broadcast_shape()));
     }

-    void apply(module& p, const match::matcher_result& r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto ins  = r.result;
         auto args = ins->inputs();
         move_broadcasted_back(args);

-        p.replace_instruction(ins, ins->get_operator(), args);
+        m.replace_instruction(ins, ins->get_operator(), args);
     }
 };

 } // namespace

-void fuse_ops::apply(module& p) const
+void fuse_ops::apply(module& m) const
 {
-    match::find_matches(p, find_gelu{}, find_gelu_new{fast_math});
-    run_passes(p, {dead_code_elimination{}});
-    match::find_matches(p, find_triadd{});
-    match::find_matches(p,
+    match::find_matches(m, find_gelu{}, find_gelu_new{fast_math});
+    run_passes(m, {dead_code_elimination{}});
+    match::find_matches(m, find_triadd{});
+    match::find_matches(m,
                         find_layernorm{},
                         find_conv_pointwise{ctx},
                         find_conv_bias_relu{ctx},
@@ -968,8 +1009,12 @@ void fuse_ops::apply(module& p) const
                         find_add_unary{"gpu::sigmoid", hip_add_sigmoid{}, hip_triadd_sigmoid{}},
                         find_add_unary{"gpu::tanh", hip_add_tanh{}, hip_triadd_tanh{}},
                         find_add_clip{});
-    run_passes(p, {dead_code_elimination{}});
-    match::find_matches(p, find_triadd_layernorm{}, find_gemm_add{}, find_commutative_broadcast{});
+    run_passes(m, {dead_code_elimination{}});
+    match::find_matches(
+        m, find_triadd_layernorm{}, find_gemm_add{}, find_gemm_pointwise{}, find_commutative_broadcast{});
 }

 } // namespace gpu
```
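The new pointwise_name(s) matcher accepts a precompiled pointwise instruction only when its sub-module computes exactly one instruction named s and otherwise contains nothing but builtins (names starting with '@'). The same predicate restated over plain instruction names (illustrative, not part of this commit):

```cpp
#include <algorithm>
#include <string>
#include <vector>

// Standalone restatement of the predicate inside pointwise_name(s): the
// pointwise sub-module must hold exactly one instruction named `s`, and
// every other instruction must be a builtin whose name starts with '@'.
bool is_single_op_pointwise(const std::vector<std::string>& ins_names, const std::string& s)
{
    auto n = std::count(ins_names.begin(), ins_names.end(), s);
    if(n != 1)
        return false;
    return std::all_of(ins_names.begin(), ins_names.end(), [&](const std::string& name) {
        return name.rfind("@", 0) == 0 or name == s;
    });
}
```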
src/targets/gpu/gemm_impl.cpp

```diff
 #include <rocblas.h>
 #include <migraphx/gpu/gemm_impl.hpp>
+#include <migraphx/reduce_dims.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -27,6 +28,22 @@ rocblas_datatype get_type(shape::type_t type)
     MIGRAPHX_THROW("ROCBLAS_GEMM: data type not supported!");
 }

+void blas_shape(const shape& s)
+{
+    if(s.lens().size() < 2)
+        return;
+    if(std::none_of(s.strides().end() - 2, s.strides().end(), [&](auto i) { return i == 1; }))
+        MIGRAPHX_THROW("GPU_GEMM: needs to have one matrix stride as 1");
+    if(s.lens().size() < 3)
+        return;
+    shape batch_shape{s.type(),
+                      {s.lens().begin(), s.lens().end() - 2},
+                      {s.strides().begin(), s.strides().end() - 2}};
+    auto batch_shapes = reduce_dims({batch_shape});
+    if(batch_shapes.front().lens().size() != 1)
+        MIGRAPHX_THROW("GPU_GEMM: Batch dimension is not collapsible");
+}
+
 template <class R, class... Ts, class... Us>
 R rocblas_invoke(R (*f)(Ts...), Us... xs)
 {
@@ -36,6 +53,18 @@ R rocblas_invoke(R (*f)(Ts...), Us... xs)
         return f(xs..., nullptr, nullptr);
 }

+static bool is_transposed(const shape& s)
+{
+    if(not s.transposed())
+        return false;
+    return s.strides().back() != 1;
+}
+
+static rocblas_int get_batch_stride(const argument& a)
+{
+    return a.get_shape().strides()[a.get_shape().strides().size() - 3];
+}
+
 template <class T>
 void gemm_impl(context& ctx,
                const shape& output_shape,
@@ -45,8 +74,8 @@ void gemm_impl(context& ctx,
                bool int8_x4_format,
                bool compute_fp32)
 {
-    bool transa = args[0].get_shape().transposed();
-    bool transb = args[1].get_shape().transposed();
+    bool transa = is_transposed(args[0].get_shape());
+    bool transb = is_transposed(args[1].get_shape());
     auto n_dim  = output_shape.lens().size();
     auto dim_1  = n_dim - 1;
     auto dim_0  = n_dim - 2;
@@ -142,6 +171,9 @@ void gemm_impl(context& ctx,
     }
     else
     {
+        auto a_stride = get_batch_stride(args[0]);
+        auto b_stride = get_batch_stride(args[1]);
+        auto c_stride = get_batch_stride(args[2]);
         rocblas_invoke(&rocblas_gemm_strided_batched_ex,
                        ctx.get_stream().get_rocblas(),
                        transb ? rocblas_operation_transpose : rocblas_operation_none,
@@ -153,20 +185,20 @@ void gemm_impl(context& ctx,
                        to_pointer(args.at(1)),
                        arg_type,
                        ldb,
-                       k * n,
+                       b_stride,
                        to_pointer(args.at(0)),
                        arg_type,
                        lda,
-                       m * k,
+                       a_stride,
                        beta_v,
                        to_pointer(args[2]),
                        output_type,
                        ldc,
-                       m * n,
+                       c_stride,
                        is_3inputs ? to_pointer(args[3]) : to_pointer(args[2]),
                        output_type,
                        ldc,
-                       m * n,
+                       c_stride,
                        num_matrices,
                        compute_type,
                        rocblas_gemm_algo_standard,
```
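get_batch_stride reads the batch stride straight from the shape instead of the packed values m*k, k*n, and m*n that were previously hard-coded, so non-packed batch layouts now pass the correct strides to rocblas_gemm_strided_batched_ex. The stride arithmetic in isolation (illustrative, not part of this commit):

```cpp
#include <cstddef>
#include <vector>

// The batch stride is the stride of the third-from-innermost axis. It equals
// m*n only for a packed layout; reading it from the strides also covers
// non-packed batches, which is exactly what get_batch_stride above does.
std::size_t batch_stride(const std::vector<std::size_t>& strides)
{
    return strides[strides.size() - 3]; // e.g. {m*n, n, 1} for packed {b, m, n}
}
```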
src/targets/gpu/hip.cpp

```diff
@@ -27,6 +27,15 @@ using hip_host_ptr = MIGRAPHX_MANAGE_PTR(void, hipHostUnregister);
 std::string hip_error(int error) { return hipGetErrorString(static_cast<hipError_t>(error)); }

+bool is_device_ptr(const void* ptr)
+{
+    hipPointerAttribute_t attr;
+    auto status = hipPointerGetAttributes(&attr, ptr);
+    if(status != hipSuccess)
+        return false;
+    return attr.memoryType == hipMemoryTypeDevice;
+}
+
 std::size_t get_available_gpu_memory()
 {
     size_t free;
@@ -50,8 +59,8 @@ hip_ptr allocate_gpu(std::size_t sz, bool host = false)
 {
     if(sz > get_available_gpu_memory())
         MIGRAPHX_THROW("Memory not available to allocate buffer: " + std::to_string(sz));
-    void* result;
+    void* result = nullptr;
     auto status = host ? hipHostMalloc(&result, sz) : hipMalloc(&result, sz);
     if(status != hipSuccess)
     {
         if(host)
@@ -59,6 +68,7 @@ hip_ptr allocate_gpu(std::size_t sz, bool host = false)
         else
             return allocate_gpu(sz, true);
     }
+    assert(result != nullptr);
     return hip_ptr{result};
 }
@@ -75,6 +85,8 @@ std::vector<T> read_from_gpu(const void* x, std::size_t sz)
 {
     gpu_sync();
     std::vector<T> result(sz);
+    assert(not is_device_ptr(result.data()));
+    assert(is_device_ptr(x));
     auto status = hipMemcpy(result.data(), x, sz * sizeof(T), hipMemcpyDeviceToHost);
     if(status != hipSuccess)
         MIGRAPHX_THROW("Copy from gpu failed: " + hip_error(status)); // NOLINT
@@ -85,6 +97,8 @@ hip_ptr write_to_gpu(const void* x, std::size_t sz, bool host = false)
 {
     gpu_sync();
     auto result = allocate_gpu(sz, host);
+    assert(is_device_ptr(result.get()));
+    assert(not is_device_ptr(x));
     auto status = hipMemcpy(result.get(), x, sz, hipMemcpyHostToDevice);
     if(status != hipSuccess)
         MIGRAPHX_THROW("Copy to gpu failed: " + hip_error(status));
```
src/targets/gpu/include/migraphx/gpu/analyze_streams.hpp

```diff
@@ -11,7 +11,7 @@ struct module;
 namespace gpu {

-std::vector<stream_race> analyze_streams(const module& p);
+std::vector<stream_race> analyze_streams(const module& m);

 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
```
src/targets/gpu/include/migraphx/gpu/code_object_op.hpp

```diff
@@ -35,6 +35,10 @@ struct code_object_op
                       f(self.output, "output"));
     }

+    value attributes() const { return {{"group", group()}}; }
+
+    std::string group() const { return "gpu::code_object::" + symbol_name; }
+
     std::string name() const { return "gpu::code_object"; }
     shape compute_shape(std::vector<shape> inputs) const;
     argument
```
src/targets/gpu/include/migraphx/gpu/compile_gen.hpp (new file, 0 → 100644)

```cpp
#ifndef MIGRAPHX_GUARD_GPU_COMPILE_GEN_HPP
#define MIGRAPHX_GUARD_GPU_COMPILE_GEN_HPP

#include <migraphx/config.hpp>
#include <string>
#include <unordered_map>
#include <vector>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {

struct shape;

namespace gpu {
namespace gen {

struct vectorize
{
    std::size_t size = 1;
    std::size_t axis = 0;
    static vectorize elements(std::size_t axis, const std::vector<shape>& inputs);
    std::string str() const;
};

struct preload
{
    std::vector<bool> args = {};
    static preload broadcasts(std::size_t axis, const std::vector<shape>& inputs);
    bool is_preloading() const;
    std::string str() const;
};

std::size_t find_fast_axis(const std::vector<shape>& inputs);

std::string make_transformer_args(std::vector<std::string> transformers);

template <class... Ts>
std::string make_transformer_args(Ts... xs)
{
    return make_transformer_args({xs.str()...});
}

} // namespace gen
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILE_GEN_HPP
```
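make_transformer_args lowers each transformer (vectorize, preload, ...) to its str() form and joins the results into one argument string for the generated kernel. A sketch of the same variadic-to-vector delegation pattern (the comma join is an illustrative stand-in; the real formatting lives in a .cpp that is not on this page):

```cpp
#include <numeric>
#include <string>
#include <vector>

// Vector overload: fold the pieces into one string. The ", " separator here
// is illustrative only; the actual join format is an internal detail.
std::string make_args(std::vector<std::string> transformers)
{
    return std::accumulate(transformers.begin(), transformers.end(), std::string{},
                           [](std::string acc, const std::string& t) {
                               return acc.empty() ? t : std::move(acc) + ", " + t;
                           });
}

// Variadic overload, shaped like the header's API: lower each transformer to
// its string form, then delegate to the vector overload.
template <class... Ts>
std::string make_args(Ts... xs)
{
    return make_args({xs.str()...});
}
```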
src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp

```diff
@@ -31,6 +31,13 @@ struct hip_compile_options
     void set_launch_params(const value& v,
                            const std::function<std::size_t(std::size_t local)>& compute_global,
                            std::size_t default_local = 1024);
+
+    void set_launch_params(const value& v,
+                           std::size_t default_global,
+                           std::size_t default_local = 1024)
+    {
+        set_launch_params(v, [=](auto) { return default_global; }, default_local);
+    }
 };

 /// Compute global for n elements, but max out on target-specific upper limit
@@ -39,6 +46,8 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over = 1);

 operation compile_hip_code_object(const std::string& content, hip_compile_options options);

+std::size_t compute_block_size(std::size_t n, std::size_t max_block_size = 1024);
+
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
```
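compute_block_size is only declared here; its definition is not part of this page. A hypothetical sketch of one plausible policy (largest power-of-two block up to the cap), purely to illustrate the signature and is not the committed implementation:

```cpp
#include <cstddef>

// Hypothetical policy sketch for compute_block_size: pick the largest
// power-of-two block that does not exceed n, capped at max_block_size.
std::size_t compute_block_size_sketch(std::size_t n, std::size_t max_block_size = 1024)
{
    std::size_t block = 1;
    while(block * 2 <= n and block * 2 <= max_block_size)
        block *= 2;
    return block;
}
```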
src/targets/gpu/include/migraphx/gpu/eliminate_workspace.hpp

```diff
@@ -14,7 +14,7 @@ namespace gpu {
 struct eliminate_workspace
 {
     std::string name() const { return "eliminate_workspace"; }
-    void apply(module& p) const;
+    void apply(module& m) const;
 };

 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
```
src/targets/gpu/include/migraphx/gpu/fuse_ops.hpp

```diff
@@ -16,7 +16,7 @@ struct fuse_ops
     context* ctx   = nullptr;
     bool fast_math = true;
     std::string name() const { return "gpu::fuse_ops"; }
-    void apply(module& p) const;
+    void apply(module& m) const;
 };

 } // namespace gpu
```
src/targets/gpu/include/migraphx/gpu/gemm.hpp

```diff
@@ -18,6 +18,8 @@ namespace gpu {
 struct context;

+void blas_shape(const shape& s);
+
 template <class Op>
 struct rocblas_gemm
 {
@@ -50,13 +52,14 @@ struct rocblas_gemm
         std::vector<shape> in_shapes(inputs);
         in_shapes.pop_back();
         check_shapes{in_shapes, *this}.not_broadcasted();
-        batch_not_transposed(inputs[0].strides());
-        batch_not_transposed(inputs[1].strides());
+        blas_shape(inputs[0]);
+        blas_shape(inputs[1]);

         // if gemm and add are fused
-        if(not float_equal(beta, 0))
+        if(in_shapes.size() > 2)
         {
             auto cmat_shape = in_shapes.back();
             in_shapes.pop_back();
+            blas_shape(cmat_shape);
             auto op_out_shape = op.compute_shape(in_shapes);
             if(cmat_shape.lens() != op_out_shape.lens())
             {
@@ -71,6 +74,7 @@ struct rocblas_gemm
                                to_string(cmat_shape.type()) +
                                ", it must be: " + to_string(op_out_shape.type()));
             }
+            return op_out_shape;
         }

         return op.compute_shape(in_shapes);
@@ -96,28 +100,6 @@ struct rocblas_gemm
         return args.back();
     }

-    void batch_not_transposed(const std::vector<std::size_t>& strides) const
-    {
-        if(strides.size() <= 2)
-            return;
-        auto dim_0       = strides.size() - 2;
-        auto matrix_size = std::max(strides[dim_0], strides[dim_0 + 1]);
-        std::vector<std::size_t> batch(strides.begin(), strides.begin() + dim_0);
-        if(std::all_of(batch.begin(), batch.end(), [&](auto i) { return (i < matrix_size); }))
-        {
-            MIGRAPHX_THROW("GPU_GEMM: matrix size and batch size {" +
-                           to_string_range(strides) + "} are transposed!");
-        }
-
-        if(std::adjacent_find(batch.begin(), batch.end(), [&](auto i, auto j) {
-               return (i < j or i < matrix_size or j < matrix_size);
-           }) != batch.end())
-        {
-            MIGRAPHX_THROW("GPU_GEMM: batch size {" + to_string_range(strides) +
-                           "} is transposed!");
-        }
-    }
-
     std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
     {
         return shapes.size() - 1;
```
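blas_shape (declared above, defined in gemm_impl.cpp) replaces the old batch_not_transposed check: rocblas_gemm_strided_batched_ex takes a single batch count and batch stride, so all leading batch dimensions must collapse into one. An illustrative version of the contiguity condition that makes a batch run collapsible (reduce_dims performs the general form of this check; this sketch is not part of the commit):

```cpp
#include <cstddef>
#include <vector>

// A run of batch dims behaves like one dimension only when each stride is
// the product of everything inner to it, i.e. strides[i] == strides[i+1] *
// lens[i+1] for every adjacent pair of batch dims.
bool batch_is_collapsible(const std::vector<std::size_t>& lens,
                          const std::vector<std::size_t>& strides)
{
    for(std::size_t i = 0; i + 1 < lens.size(); i++)
        if(strides[i] != strides[i + 1] * lens[i + 1])
            return false;
    return true;
}
```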
src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp (new file, 0 → 100644)

```cpp
#ifndef MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP
#define MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP

#include <migraphx/config.hpp>
#include <migraphx/gpu/context.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {

struct module;

namespace gpu {

struct prefuse_ops
{
    std::string name() const { return "gpu::prefuse_ops"; }
    void apply(module& m) const;
};

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP
```
src/targets/gpu/include/migraphx/gpu/quant_convolution.hpp

```diff
@@ -2,6 +2,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_QUANT_CONVOLUTION_HPP
 #include <migraphx/shape.hpp>
+#include <migraphx/reflect.hpp>
 #include <migraphx/op/quant_convolution.hpp>
 #include <migraphx/gpu/miopen.hpp>
@@ -14,6 +15,7 @@ struct context;
 struct miopen_quant_convolution
 {
     op::quant_convolution op;
+    bool int8_x4_format = false;
     shared<convolution_descriptor> cd;
     miopenConvFwdAlgorithm_t algo{};
     miopenHandle_t handle = nullptr;
@@ -22,7 +24,8 @@ struct miopen_quant_convolution
     static auto reflect(Self& self, F f)
     {
         // TODO: Add algo
-        return op::quant_convolution::reflect(self.op, f);
+        return pack_join(migraphx::reflect(self.op, f),
+                         pack(f(self.int8_x4_format, "int8_x4_format")));
     }
     std::string name() const { return "gpu::quant_convolution"; }
```
src/targets/gpu/include/migraphx/gpu/scatter.hpp

```diff
@@ -3,7 +3,7 @@
 #include <migraphx/argument.hpp>
 #include <migraphx/reflect.hpp>
-#include <migraphx/op/scatter.hpp>
+#include <migraphx/op/scatter_none.hpp>
 #include <migraphx/gpu/miopen.hpp>

 namespace migraphx {
@@ -14,7 +14,9 @@ struct context;
 struct hip_scatter
 {
-    op::scatter op;
+    // scatter_none is an exact replacement for previous op::scatter,
+    // renamed to match an Onnx option.  Don't use base class op::scatter
+    op::scatter_none op;

     template <class Self, class F>
     static auto reflect(Self& self, F f)
@@ -22,7 +24,7 @@ struct hip_scatter
         return migraphx::reflect(self.op, f);
     }

-    std::string name() const { return "gpu::scatter"; }
+    std::string name() const { return "gpu::scatter_none"; }
     shape compute_shape(std::vector<shape> inputs) const;
     argument
     compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
```
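The rename tracks ONNX, where scatter's reduction attribute can be none, add, or mul; scatter_none is the plain-assignment variant. Reference semantics for the simple 1-D case (illustrative sketch, not part of this commit):

```cpp
#include <cstddef>
#include <vector>

// Reference semantics of scatter with reduction "none" on a 1-D tensor:
// each index position is overwritten by its update, with no accumulation.
void scatter_none_1d(std::vector<float>& data,
                     const std::vector<std::size_t>& indices,
                     const std::vector<float>& updates)
{
    for(std::size_t i = 0; i < indices.size(); i++)
        data[indices[i]] = updates[i]; // "none": plain assignment
}
```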
src/targets/gpu/include/migraphx/gpu/schedule_model.hpp

```diff
@@ -17,9 +17,9 @@ struct schedule_model
 {
     std::size_t streams = 0;
     std::size_t concurrency() const;
-    void sched(module& p, instruction_ref ins, std::size_t n) const;
-    void wait(module& p, instruction_ref ins, std::size_t wait_id) const;
-    void record(module& p, instruction_ref ins, std::size_t wait_id) const;
+    void sched(module& m, instruction_ref ins, std::size_t n) const;
+    void wait(module& m, instruction_ref ins, std::size_t wait_id) const;
+    void record(module& m, instruction_ref ins, std::size_t wait_id) const;
     std::size_t weight(const operation& op) const;
 };
```
src/targets/gpu/include/migraphx/gpu/sync_device.hpp

```diff
@@ -15,7 +15,7 @@ namespace gpu {
 struct sync_device
 {
     std::string name() const { return "sync_device"; }
-    void apply(module& p) const;
+    void apply(module& m) const;
 };

 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
```
src/targets/gpu/include/migraphx/gpu/write_literals.hpp

```diff
@@ -14,7 +14,7 @@ struct write_literals
     context* ctx = nullptr;
     std::string name() const { return "gpu::write_literals"; }
-    void apply(module& p) const;
+    void apply(module& m) const;
 };

 } // namespace gpu
```
src/targets/gpu/jit/gathernd.cpp (new file, 0 → 100644)

```cpp
#include <migraphx/gpu/compiler.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/reduce_dims.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/eliminate_common_subexpression.hpp>
#include <migraphx/module.hpp>
#include <migraphx/pass_manager.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

// NOLINTNEXTLINE
static const char* const gathernd_kernel = R"__migraphx__(
#include <migraphx/kernels/gathernd.hpp>
#include <migraphx/kernels/ops.hpp>
#include <migraphx/kernels/integral_constant.hpp>
#include <migraphx/kernels/generic_constant.hpp>
#include <args.hpp>

namespace migraphx {

extern "C" {
__global__ void gathernd_kernel(void* in_data, void* in_indices, void* output)
{
    make_tensors()(in_data, in_indices, output)([](auto&&... xs) {
        auto settings = make_gathernd_settings(MIGRAPHX_MAKE_CONSTANT(int64_t{BATCH_DIMS}));
        gathernd(xs..., settings);
    });
}
}

} // namespace migraphx

)__migraphx__";

struct gathernd_compiler : compiler<gathernd_compiler>
{
    std::vector<std::string> names() const { return {"gathernd"}; }

    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
    {
        hip_compile_options options;
        auto out_s = inputs.back();
        options.set_launch_params(v, compute_global_for(ctx, out_s.elements()));
        options.inputs         = inputs;
        options.output         = out_s;
        options.kernel_name    = "gathernd_kernel";
        options.virtual_inputs = inputs;

        // batch_dims
        assert(v.contains("batch_dims"));
        auto batch_dims = v.at("batch_dims").to<int64_t>();
        options.params += " -DBATCH_DIMS=" + std::to_string(batch_dims);

        return compile_hip_code_object(gathernd_kernel, options);
    }

    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
    {
        return replace(compile_op(ctx, to_shapes(ins->inputs()), op.to_value()));
    }
};

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
```
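The compiler bakes batch_dims into the code object as a -DBATCH_DIMS preprocessor define, so each specialization is compiled for a fixed batch rank rather than branching at run time. For reference, GatherND with batch_dims = 0, 2-D data, and full-rank index tuples reduces to a per-tuple element pick (illustrative host-side sketch, not the kernel):

```cpp
#include <array>
#include <cstddef>
#include <vector>

// Reference semantics of GatherND with batch_dims = 0 for 2-D data: each
// length-2 index tuple addresses one scalar, so the output is one element
// per tuple.
std::vector<float> gathernd_2d(const std::vector<std::vector<float>>& data,
                               const std::vector<std::array<std::size_t, 2>>& indices)
{
    std::vector<float> out;
    out.reserve(indices.size());
    for(const auto& idx : indices)
        out.push_back(data[idx[0]][idx[1]]); // full-rank tuple -> scalar gather
    return out;
}
```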