Project: gaoqiong / MIGraphX · Commits

Commit 5af9aac0
Authored Feb 21, 2023 by charlie
Merge branch 'dyn_batch_pass' of github.com:ROCmSoftwarePlatform/AMDMIGraphX into dyn_test_runner
Parents: 7b2516e0, 05e81ed3
Changes: 147 files in the merge; this page shows 20 changed files with 383 additions and 199 deletions (+383, -199).
Changed files shown on this page:

src/py/migraphx_py.cpp (+11, -5)
src/replace_allocate.cpp (+7, -9)
src/shape.cpp (+11, -0)
src/simplify_algebra.cpp (+11, -4)
src/split_single_dyn_dim.cpp (+142, -0)
src/targets/gpu/CMakeLists.txt (+51, -68)
src/targets/gpu/compile_hip.cpp (+34, -50)
src/targets/gpu/compile_hip_code_object.cpp (+1, -1)
src/targets/gpu/device/include/migraphx/gpu/device/launch.hpp (+0, -6)
src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp (+3, -5)
src/targets/gpu/fuse_ops.cpp (+53, -28)
src/targets/gpu/include/migraphx/gpu/context.hpp (+9, -2)
src/targets/gpu/include/migraphx/gpu/convolution.hpp (+4, -3)
src/targets/gpu/include/migraphx/gpu/miopen.hpp (+11, -4)
src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp (+2, -2)
src/targets/gpu/jit/reduce.cpp (+19, -10)
src/targets/gpu/kernels/include/migraphx/kernels/array.hpp (+1, -1)
src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp (+4, -0)
src/targets/gpu/kernels/include/migraphx/kernels/dpp.hpp (+1, -1)
src/targets/gpu/kernels/include/migraphx/kernels/functional.hpp (+8, -0)
src/py/migraphx_py.cpp

@@ -345,15 +345,21 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
         .def("is_compiled", &migraphx::program::is_compiled)
         .def("compile",
-             [](migraphx::program& p, const migraphx::target& t, bool offload_copy, bool fast_math) {
+             [](migraphx::program& p,
+                const migraphx::target& t,
+                bool offload_copy,
+                bool fast_math,
+                bool exhaustive_tune) {
                 migraphx::compile_options options;
-                options.offload_copy = offload_copy;
-                options.fast_math    = fast_math;
+                options.offload_copy    = offload_copy;
+                options.fast_math       = fast_math;
+                options.exhaustive_tune = exhaustive_tune;
                 p.compile(t, options);
             },
             py::arg("t"),
-            py::arg("offload_copy") = true,
-            py::arg("fast_math")    = true)
+            py::arg("offload_copy")    = true,
+            py::arg("fast_math")       = true,
+            py::arg("exhaustive_tune") = false)
         .def("get_main_module", [](const migraphx::program& p) { return p.get_main_module(); })
         .def("create_module", ...
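The binding change above adds an exhaustive_tune keyword to program.compile and forwards it into migraphx::compile_options. A minimal caller-side sketch in C++ (illustration only, not part of this commit; the header paths are assumed from the usual MIGraphX include layout):

// Hypothetical sketch of how the new option reaches program::compile,
// mirroring what the lambda in the binding above does.
#include <migraphx/program.hpp>          // assumed header locations
#include <migraphx/compile_options.hpp>
#include <migraphx/target.hpp>

void compile_with_tuning(migraphx::program& p, const migraphx::target& t)
{
    migraphx::compile_options options;
    options.offload_copy    = true;  // existing option: copy buffers to/from the device
    options.fast_math       = true;  // existing option: keep fast-math rewrites
    options.exhaustive_tune = true;  // new flag introduced by this change
    p.compile(t, options);           // same call the Python binding makes
}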
src/replace_allocate.cpp

@@ -104,19 +104,17 @@ void replace_allocate::apply(module& m) const
             continue;
         auto s = ins->get_shape();
-        if(not main_offload_copy and model.needs_out_params() and contains(mod_output_names, ins))
+        if(not main_offload_copy and not(m.use_local_alloc) and model.needs_out_params() and
+           contains(mod_output_names, ins))
         {
             auto out_param = m.add_parameter(mod_output_names[ins], s);
             m.replace_instruction(ins, out_param);
             continue;
         }
-        m.replace_instruction(
-            ins,
-            m.insert_instruction(ins, make_op(model.name(), migraphx::value{{"shape", to_value(s)}})));
+        else
+        {
+            m.replace_instruction(ins, make_op(model.name(), migraphx::value{{"shape", to_value(s)}}));
+        }
     }
 }
src/shape.cpp

@@ -483,6 +483,17 @@ std::string shape::type_string() const { return name(this->type()); }
 bool shape::dynamic() const { return not impl->m_dyn_dims.empty(); }

+bool shape::any_of_dynamic() const
+{
+    if(this->dynamic())
+    {
+        return true;
+    }
+    return std::any_of(this->sub_shapes().cbegin(), this->sub_shapes().cend(), [](auto s) {
+        return s.any_of_dynamic();
+    });
+}
+
 const std::vector<shape::dynamic_dimension>& shape::dyn_dims() const { return impl->m_dyn_dims; }

 std::vector<std::size_t> shape::min_lens() const
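The new any_of_dynamic() differs from dynamic() above in that it also descends into tuple sub-shapes. A small hypothetical illustration (not from this commit; the shape constructors used here are assumed from the existing shape API):

#include <migraphx/shape.hpp>
#include <vector>

void any_of_dynamic_example()
{
    migraphx::shape static_s{migraphx::shape::float_type, {4, 3}};
    // one non-fixed dynamic dimension: the first dimension ranges from 1 to 4
    migraphx::shape dyn_s{migraphx::shape::float_type, {{1, 4}, {3, 3}}};
    migraphx::shape tuple_s{std::vector<migraphx::shape>{static_s, dyn_s}};

    // tuple_s.dynamic()        -> false: the tuple shape itself has no dyn_dims
    // tuple_s.any_of_dynamic() -> true:  one of its sub-shapes is dynamic
}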
src/simplify_algebra.cpp

@@ -31,6 +31,7 @@
 #include <migraphx/op/reshape.hpp>
 #include <migraphx/op/transpose.hpp>
 #include <migraphx/matcher.hpp>
+#include <migraphx/common.hpp>
 #include <migraphx/literal.hpp>
 #include <migraphx/make_op.hpp>
 #include <migraphx/serialize.hpp>

@@ -340,12 +341,18 @@ struct find_inner_broadcast
                        std::back_inserter(inputs),
                        [](auto i) { return i->inputs().front(); });
         if(std::any_of(inputs.begin(), inputs.end(), [&](auto i) {
-               return i->get_shape() != inputs.front()->get_shape();
+               return i->get_shape() != inputs.front()->get_shape() and
+                      i->get_shape().elements() != 1;
            }))
             return;
-        auto op = m.insert_instruction(ins, ins->get_operator(), inputs);
-        m.replace_instruction(ins, broadcasts.front()->get_operator(), op);
+        auto b_it = std::find_if(broadcasts.begin(), broadcasts.end(), [&](auto i) {
+            return not i->get_shape().scalar();
+        });
+        if(b_it == broadcasts.end())
+            b_it = broadcasts.begin();
+        auto op = insert_common_op(m, ins, ins->get_operator(), inputs);
+        m.replace_instruction(ins, (*b_it)->get_operator(), op);
     }
 };

@@ -975,7 +982,7 @@ struct find_neg_unit_ops
         auto ins  = r.result;
         auto c_in = r.instructions["x"];
-        auto neg  = m.add_instruction(make_op("neg"), c_in);
+        auto neg  = m.insert_instruction(ins, make_op("neg"), c_in);
         m.replace_instruction(ins, neg);
     }
 };
src/split_single_dyn_dim.cpp (new file, 0 → 100644)
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/split_single_dyn_dim.hpp>
#include <migraphx/functional.hpp>
#include <migraphx/pass_manager.hpp>
#include <migraphx/make_op.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {

bool has_one_dyn_dim(std::unordered_map<std::string, shape> param_shapes,
                     std::string& dyn_param_str,
                     int& dyn_index,
                     int& min_dim,
                     int& max_dim)
{
    // true if parameters contain exactly one dynamic shape with exactly one non-fixed
    // dynamic_dimension
    if(std::none_of(param_shapes.cbegin(), param_shapes.cend(), [](auto ps) {
           return ps.second.dynamic();
       }))
        return false;
    int num_dynamic = 0;
    std::string out_str;
    int tmp_min = -1;
    int tmp_max = -1;
    int tmp_ind = -1;
    for(auto ps : param_shapes)
    {
        if(ps.second.dynamic())
        {
            num_dynamic += 1;
            if(num_dynamic > 1)
            {
                return false;
            }
            int num_nf = 0;
            auto dds   = ps.second.dyn_dims();
            for(int i = 0; i < dds.size(); ++i)
            {
                const auto& dd = dds.at(i);
                if(not dd.is_fixed())
                {
                    num_nf += 1;
                    tmp_min = dd.min;
                    tmp_max = dd.max;
                    tmp_ind = i;
                }
            }
            if(num_nf == 1)
            {
                out_str = ps.first;
            }
            else
            {
                return false;
            }
        }
    }
    min_dim       = tmp_min;
    max_dim       = tmp_max;
    dyn_index     = tmp_ind;
    dyn_param_str = out_str;
    return true;
}
/**
 * Make all the batch sizes in the range for now.
 * Probably won't work for `if` and `loop` instructions, depending on how the submodules for
 * those work. Create additional submodules for the optimal values if not already done; insert a
 * select_module instruction at the top and replace the return, bypassing the other instructions.
 * Unused instructions should be removed by dead_code_elimination.
 */
void split_single_dyn_dim::apply(module_pass_manager& mpm) const
{
    module_ref mm     = &mpm.get_module();
    auto param_names  = mm->get_parameter_names();
    auto param_shapes = mm->get_parameter_shapes();
    std::string dyn_param_name;
    int dyn_index;
    int min_dim;
    int max_dim;
    if(has_one_dyn_dim(param_shapes, dyn_param_name, dyn_index, min_dim, max_dim))
    {
        const auto& dyn_param = mm->get_parameter(dyn_param_name);
        auto dyn_param_shape  = mm->get_parameter_shape(dyn_param_name);
        std::vector<module_ref> submodules;
        // create submodules for each dimension size
        for(int dim_size = min_dim; dim_size <= max_dim; ++dim_size)
        {
            auto submod = mpm.create_module("batch_" + std::to_string(dim_size));
            // instruction map for new static submodule parameters
            std::unordered_map<instruction_ref, instruction_ref> map_ins;
            // create static shape using dim_size
            auto static_lens          = dyn_param_shape.max_lens();
            static_lens.at(dyn_index) = dim_size;
            auto static_param         = submod->add_parameter(
                dyn_param_name, migraphx::shape{dyn_param_shape.type(), static_lens});
            map_ins[dyn_param] = static_param;
            auto outputs       = submod->add_instructions(mm, map_ins);
            submod->add_return({outputs});
            submodules.push_back(submod);
        }
        // redirect to select_module operator and return
        std::vector<instruction_ref> sm_inputs;
        std::transform(param_names.cbegin(),
                       param_names.cend(),
                       std::back_inserter(sm_inputs),
                       [&](auto pn) { return mm->get_parameter(pn); });
        migraphx::shape out_attr = migraphx::shape{mm->get_output_shapes()};
        auto sm_ins              = mm->add_instruction(
            migraphx::make_op("select_module",
                              {{"output_dyn_shapes", migraphx::to_value(out_attr)}}),
            sm_inputs,
            submodules);
        mm->replace_return({sm_ins});
    }
}

} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
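As the pass comment above notes, the rewrite leaves bypassed instructions behind for dead_code_elimination to remove. A hypothetical sketch of running the new pass at the program level (run_passes, dead_code_elimination, and the header locations are assumed from the existing MIGraphX pass infrastructure):

// Illustration only, not part of this commit.
#include <migraphx/split_single_dyn_dim.hpp>
#include <migraphx/dead_code_elimination.hpp>  // assumed header
#include <migraphx/pass_manager.hpp>           // assumed header
#include <migraphx/program.hpp>

void split_dynamic_batch(migraphx::program& p)
{
    // split_single_dyn_dim creates one "batch_N" submodule per size in
    // [min_dim, max_dim] and reroutes the main module's return through a
    // select_module instruction; dead_code_elimination then removes the
    // instructions that were bypassed.
    migraphx::run_passes(
        p, {migraphx::split_single_dyn_dim{}, migraphx::dead_code_elimination{}});
}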
src/targets/gpu/CMakeLists.txt

#####################################################################################
# The MIT License (MIT)
#
# Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.

@@ -20,9 +20,9 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 #####################################################################################
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm /opt/rocm/hip /opt/rocm/hcc)
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm /opt/rocm/hip)
 find_package(miopen)

 # rocblas

@@ -33,6 +33,8 @@ if(NOT TARGET MIOpen)
     message(SEND_ERROR "Cant find miopen")
 endif()
+set(MIGRAPHX_USE_HIPRTC OFF CACHE BOOL "Use hipRTC APIs")
+
 include(Embed)
 file(GLOB KERNEL_FILES ${CONFIGURE_DEPENDS} ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/*.hpp)

@@ -46,9 +48,10 @@ add_library(compile_for_gpu INTERFACE)
 target_compile_options(compile_for_gpu INTERFACE -std=c++17 -fno-gpu-rdc -Wno-cuda-compat -Wno-unused-command-line-argument -Xclang -fallow-half-arguments-and-returns)
 target_link_libraries(compile_for_gpu INTERFACE hip::device -fno-gpu-rdc -Wno-invalid-command-line-argument -Wno-unused-command-line-argument -Wno-option-ignored)
 check_cxx_compiler_flag("--cuda-host-only -fhip-lambda-host-device -x hip" HAS_HIP_LAMBDA_HOST_DEVICE)
 if(HAS_HIP_LAMBDA_HOST_DEVICE)
     message(STATUS "Enable -fhip-lambda-host-device")
     target_compile_options(compile_for_gpu INTERFACE -fhip-lambda-host-device)
 endif()
 set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)

@@ -60,11 +63,13 @@ target_include_directories(migraphx_device PUBLIC $<BUILD_INTERFACE:${CMAKE_CURR
 target_include_directories(migraphx_device PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/device/include>)
 add_library(kernel_file_check EXCLUDE_FROM_ALL)
 foreach(KERNEL_FILE ${KERNEL_FILES})
     get_filename_component(KERNEL_BASE_FILE ${KERNEL_FILE} NAME_WE)
     file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/kernels/include/migraphx/kernels/${KERNEL_BASE_FILE}.cpp "#include <migraphx/kernels/${KERNEL_BASE_FILE}.hpp>\n")
     target_sources(kernel_file_check PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/kernels/include/migraphx/kernels/${KERNEL_BASE_FILE}.cpp)
 endforeach()
+target_compile_definitions(kernel_file_check PRIVATE -DMIGRAPHX_NLOCAL=256)
 target_include_directories(kernel_file_check PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/>)
 target_link_libraries(kernel_file_check compile_for_gpu)

@@ -125,6 +130,7 @@ function(register_migraphx_gpu_ops PREFIX)
     register_op(migraphx_gpu HEADER migraphx/gpu/${OP}.hpp OPERATORS gpu::${PREFIX}${OP} INCLUDES migraphx/gpu/context.hpp)
   endforeach()
 endfunction()
 register_migraphx_gpu_ops(hip_
   argmax
   argmin

@@ -146,47 +152,26 @@ register_migraphx_gpu_ops(miopen_
   lrn
   pooling
 )
 register_op(migraphx_gpu
     HEADER migraphx/gpu/rnn_variable_seq_lens.hpp
     OPERATORS gpu::hip_rnn_var_sl_shift_sequence gpu::hip_rnn_var_sl_shift_output gpu::hip_rnn_var_sl_last_output
     INCLUDES migraphx/gpu/context.hpp)
 register_op(migraphx_gpu
     HEADER migraphx/gpu/int8_gemm_pack.hpp
     OPERATORS gpu::hip_int8_gemm_pack_a gpu::hip_int8_gemm_pack_b
     INCLUDES migraphx/gpu/context.hpp)
 register_op(migraphx_gpu
     HEADER migraphx/gpu/gemm.hpp
     OPERATORS gpu::rocblas_gemm<op::dot> gpu::rocblas_gemm<op::quant_dot>
     INCLUDES migraphx/gpu/context.hpp)
 register_op(migraphx_gpu HEADER migraphx/gpu/convolution.hpp
     OPERATORS gpu::miopen_convolution<op::convolution> gpu::miopen_convolution<op::deconvolution> gpu::miopen_convolution<op::quant_convolution>
     INCLUDES migraphx/gpu/context.hpp)
 rocm_set_soversion(migraphx_gpu ${MIGRAPHX_SO_VERSION})
 rocm_clang_tidy_check(migraphx_gpu)
-# look for offload bundler
-get_filename_component(CMAKE_CXX_COMPILER_PATH "${CMAKE_CXX_COMPILER}" PATH)
-if(CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+$")
-    find_program(MIGRAPHX_OFFLOADBUNDLER_BIN clang-offload-bundler
-        HINTS ${CMAKE_CXX_COMPILER_PATH}
-        PATH_SUFFIXES bin
-        PATHS /opt/rocm/llvm
-    )
-else()
-    find_program(MIGRAPHX_EXTRACT_KERNEL extractkernel
-        PATH_SUFFIXES bin
-        HINTS ${CMAKE_CXX_COMPILER_PATH}
-        PATHS
-            /opt/rocm/hip
-            /opt/rocm/hcc
-            /opt/rocm
-    )
-endif()
-message(STATUS "clang-offload-bundler: ${MIGRAPHX_OFFLOADBUNDLER_BIN}")
-message(STATUS "extractkernel: ${MIGRAPHX_EXTRACT_KERNEL}")
 set(MIGRAPHX_ENABLE_MLIR OFF CACHE BOOL "")
 if(MIGRAPHX_ENABLE_MLIR)
     # Find package rocMLIR
     find_package(rocMLIR 1.0.0 CONFIG REQUIRED)

@@ -195,36 +180,38 @@ if(MIGRAPHX_ENABLE_MLIR)
     target_link_libraries(migraphx_gpu PUBLIC rocMLIR::rockCompiler)
 endif()
-set(MIGRAPHX_USE_HIPRTC OFF CACHE BOOL "")
 if(MIGRAPHX_USE_HIPRTC)
-    target_compile_definitions(migraphx_gpu PRIVATE -DMIGRAPHX_USE_HIPRTC=1)
     message(STATUS "MIGraphX is using hipRTC")
+    target_compile_definitions(migraphx_gpu PRIVATE -DMIGRAPHX_USE_HIPRTC=1)
 else()
     # Get flags needed to compile hip
     include(TargetFlags)
     target_flags(HIP_COMPILER_FLAGS hip::device)
     # Remove cuda arch flags
     string(REGEX REPLACE --cuda-gpu-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
     string(REGEX REPLACE --offload-arch=[a-z0-9:+-]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
     # Skip library paths since hip will incorrectly treat it as a source file
     string(APPEND HIP_COMPILER_FLAGS " ")
     foreach(_unused RANGE 2)
         string(REGEX REPLACE " /[^ ]+\\.(a|so) " " " HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
     endforeach()
-    message(STATUS "MIGraphX is using HIP Clang")
     message(STATUS "Hip compiler flags: ${HIP_COMPILER_FLAGS}")
     target_compile_definitions(migraphx_gpu PRIVATE
         "-DMIGRAPHX_HIP_COMPILER=${CMAKE_CXX_COMPILER}"
         "-DMIGRAPHX_HIP_COMPILER_FLAGS=${HIP_COMPILER_FLAGS}"
-        "-DMIGRAPHX_OFFLOADBUNDLER_BIN=${MIGRAPHX_OFFLOADBUNDLER_BIN}"
-        "-DMIGRAPHX_EXTRACT_KERNEL=${MIGRAPHX_EXTRACT_KERNEL}"
-        "-DMIGRAPHX_USE_HIPRTC=0"
     )
     if(DEFINED CMAKE_CXX_COMPILER_LAUNCHER)
         execute_process(COMMAND which ${CMAKE_CXX_COMPILER_LAUNCHER} OUTPUT_VARIABLE MIGRAPHX_HIP_COMPILER_LAUNCHER)
         string(STRIP "${MIGRAPHX_HIP_COMPILER_LAUNCHER}" MIGRAPHX_HIP_COMPILER_LAUNCHER)
         target_compile_definitions(migraphx_gpu PRIVATE "-DMIGRAPHX_HIP_COMPILER_LAUNCHER=${MIGRAPHX_HIP_COMPILER_LAUNCHER}")
     endif()
 endif()

 # Check miopen find mode api

@@ -233,10 +220,9 @@ get_target_property(MIOPEN_LOCATION MIOpen LOCATION)
 check_library_exists(MIOpen "miopenHiddenSetConvolutionFindMode" "${MIOPEN_LOCATION}" HAS_FIND_MODE_API)
 check_library_exists(MIOpen "miopenFindSolutions" "${MIOPEN_LOCATION}" HAS_FIND_2_API)
-# TODO: Set default to HAS_FIND_2_API
-set(MIGRAPHX_USE_FIND_2_API OFF CACHE BOOL "")
+set(MIGRAPHX_USE_FIND_2_API "${HAS_FIND_2_API}" CACHE BOOL "")
 if(MIGRAPHX_USE_FIND_2_API)
     target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API)
     message(STATUS "MIGraphx is using Find-2.0 API of MIOpen")
 else()

@@ -250,16 +236,13 @@ else()
     message(STATUS "MIOpen does not have find mode api")
 endif()

 # Workaround broken rocblas headers
 target_compile_definitions(migraphx_gpu PUBLIC -D__HIP_PLATFORM_HCC__=1)

 target_link_libraries(migraphx_gpu PUBLIC migraphx MIOpen roc::rocblas)
 target_link_libraries(migraphx_gpu PRIVATE migraphx_device migraphx_kernels)

 add_subdirectory(driver)

 rocm_install_targets(
   TARGETS migraphx_gpu migraphx_device compile_for_gpu
   INCLUDE
     ${CMAKE_CURRENT_SOURCE_DIR}/include
 )
src/targets/gpu/compile_hip.cpp

@@ -29,10 +29,9 @@
 #include <cassert>
 #include <iostream>

-#if MIGRAPHX_USE_HIPRTC
+#ifdef MIGRAPHX_USE_HIPRTC
 #include <hip/hiprtc.h>
 #include <migraphx/manage_ptr.hpp>
 #include <migraphx/env.hpp>
 #else
 #include <migraphx/compile_src.hpp>
 #include <migraphx/process.hpp>

@@ -48,9 +47,10 @@ MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_OPTIMIZE);
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DUMP_ASM);
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DUMP_SRC);

-#if MIGRAPHX_USE_HIPRTC
-MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_HIPRTC)
+#ifdef MIGRAPHX_USE_HIPRTC
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_HIPRTC);
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS);

 std::string hiprtc_error(hiprtcResult err, const std::string& msg)
 {

@@ -143,25 +143,29 @@ struct hiprtc_program
                        options.end(),
                        std::back_inserter(c_options),
                        [](const std::string& s) { return s.c_str(); });
-        auto result = hiprtcCompileProgram(prog.get(), c_options.size(), c_options.data());
-        std::cerr << log() << std::endl;
+        auto result   = hiprtcCompileProgram(prog.get(), c_options.size(), c_options.data());
+        auto prog_log = log();
+        if(not prog_log.empty())
+        {
+            std::cerr << prog_log << std::endl;
+        }
         if(result != HIPRTC_SUCCESS)
             MIGRAPHX_HIPRTC_THROW(result, "Compilation failed.");
     }

-    std::string log()
+    std::string log() const
     {
         std::size_t n = 0;
         MIGRAPHX_HIPRTC(hiprtcGetProgramLogSize(prog.get(), &n));
-        if(n < 2)
+        if(n == 0)
             return {};
-        std::vector<char> buffer(n);
+        std::string buffer(n, '\0');
         MIGRAPHX_HIPRTC(hiprtcGetProgramLog(prog.get(), buffer.data()));
-        assert(buffer.back() == 0);
-        return {buffer.begin(), buffer.end() - 1};
+        assert(buffer.back() != 0);
+        return buffer;
     }

-    std::vector<char> get_code_obj()
+    std::vector<char> get_code_obj() const
     {
         std::size_t n = 0;
         MIGRAPHX_HIPRTC(hiprtcGetCodeSize(prog.get(), &n));

@@ -176,6 +180,17 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
 {
     hiprtc_program prog(srcs);
     auto options = split_string(params, ' ');
+    options.push_back("-DMIGRAPHX_USE_HIPRTC=1");
+    // remove following three compilation flags for HIPRTC once fixes from hipRTC are available in
+    if(enabled(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS{}))
+    {
+        options.push_back("-DMIGRAPHX_HAS_DPP=0");
+        options.push_back("-DMIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS=1");
+        options.push_back("-Wno-reserved-identifier");
+        options.push_back("-Wno-gnu-line-marker");
+        options.push_back("-Wno-old-style-cast");
+    }
     if(enabled(MIGRAPHX_GPU_DEBUG{}))
         options.push_back("-DMIGRAPHX_DEBUG");
     if(std::none_of(options.begin(), options.end(), [](const std::string& s) {

@@ -183,7 +198,7 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
     }))
         options.push_back("-std=c++17");
     options.push_back("-fno-gpu-rdc");
-    options.push_back(" -O" + string_value_of(MIGRAPHX_GPU_OPTIMIZE{}, "3"));
+    options.push_back("-O" + string_value_of(MIGRAPHX_GPU_OPTIMIZE{}, "3"));
     options.push_back("-Wno-cuda-compat");
     options.push_back("--offload-arch=" + arch);
     prog.compile(options);

@@ -192,12 +207,6 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
 #else // MIGRAPHX_USE_HIPRTC

-bool is_hcc_compiler()
-{
-    static const auto result = ends_with(MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER), "hcc");
-    return result;
-}
-
 bool is_hip_clang_compiler()
 {
     static const auto result = ends_with(MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER), "clang++");

@@ -221,7 +230,7 @@ std::vector<std::vector<char>>
 compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std::string& arch)
 {
     assert(not srcs.empty());
-    if(not is_hcc_compiler() and not is_hip_clang_compiler())
+    if(not is_hip_clang_compiler())
         MIGRAPHX_THROW("Unknown hip compiler: " + std::string(MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER)));

@@ -231,16 +240,9 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
     if(enabled(MIGRAPHX_GPU_DEBUG_SYM{}))
         params += " -g";
     params += " -c";
-    if(is_hcc_compiler())
-    {
-        params += " -amdgpu-target=" + arch;
-    }
-    else if(is_hip_clang_compiler())
-    {
-        params += " --offload-arch=" + arch;
-        params += " --cuda-device-only";
-        params += " -O" + string_value_of(MIGRAPHX_GPU_OPTIMIZE{}, "3") + " ";
-    }
+    params += " --offload-arch=" + arch;
+    params += " --cuda-device-only";
+    params += " -O" + string_value_of(MIGRAPHX_GPU_OPTIMIZE{}, "3") + " ";

     if(enabled(MIGRAPHX_GPU_DEBUG{}))
         params += " -DMIGRAPHX_DEBUG";

@@ -255,24 +257,6 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
     if(has_compiler_launcher())
         compiler.launcher = MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER_LAUNCHER);
 #endif

-    if(is_hcc_compiler())
-        compiler.process = [&](const fs::path& obj_path) -> fs::path {
-            process{MIGRAPHX_STRINGIZE(MIGRAPHX_EXTRACT_KERNEL) + std::string{" -i "} +
-                    obj_path.string()}
-                .cwd(obj_path.parent_path());
-            for(const auto& entry : fs::directory_iterator{obj_path.parent_path()})
-            {
-                const auto& hsaco_path = entry.path();
-                if(not fs::is_regular_file(hsaco_path))
-                    continue;
-                if(hsaco_path.extension() != ".hsaco")
-                    continue;
-                return hsaco_path;
-            }
-            MIGRAPHX_THROW("Missing hsaco");
-        };

     if(enabled(MIGRAPHX_GPU_DUMP_SRC{}))
     {
         for(const auto& src : srcs)

@@ -292,6 +276,8 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
     return {compiler.compile(srcs)};
 }

+#endif // MIGRAPHX_USE_HIPRTC

 std::string enum_params(std::size_t count, std::string param)
 {
     std::vector<std::string> items(count);

@@ -299,8 +285,6 @@ std::string enum_params(std::size_t count, std::string param)
     return join_strings(items, ",");
 }
-#endif // MIGRAPHX_USE_HIPRTC

 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
src/targets/gpu/compile_hip_code_object.cpp

@@ -29,7 +29,6 @@
 #include <migraphx/context.hpp>
 #include <migraphx_kernels.hpp>
 #include <migraphx/stringutils.hpp>
-#include <hip/hip_runtime_api.h>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

@@ -80,6 +79,7 @@ std::string generate_args_hpp(const std::vector<shape>& inputs)
 #include <migraphx/kernels/args.hpp>
 #include <migraphx/kernels/tensor_view.hpp>
 #include <migraphx/kernels/types.hpp>

 namespace migraphx {
src/targets/gpu/device/include/migraphx/gpu/device/launch.hpp

@@ -112,14 +112,8 @@ inline auto gs_launch(hipStream_t stream, index_int n, index_int local = 1024)
 #ifdef MIGRAPHX_USE_CLANG_TIDY
 #define MIGRAPHX_DEVICE_SHARED
 #else
-// Workaround hcc's broken tile_static macro
-#ifdef tile_static
-#undef tile_static
-#define MIGRAPHX_DEVICE_SHARED __attribute__((tile_static))
-#else
 #define MIGRAPHX_DEVICE_SHARED __shared__
 #endif
-#endif

 } // namespace device
 } // namespace gpu
src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp

@@ -36,6 +36,7 @@ namespace gpu {
 namespace device {

+#ifdef MIGRAPHX_NO_DPP
 template <index_int N,
           class Op,
           class T,

@@ -62,6 +63,7 @@ __device__ auto block_reduce(index idx, Op op, T init, ForStride fs, F f)
     }
     return buffer[0];
 }
+#else

 constexpr unsigned int dpp_row_shr(unsigned int x) { return 0x110u | x; }

@@ -96,11 +98,7 @@ __device__ T dpp_mov(T& x)
     input.data = x;
     for(index_int i = 0; i < n; i++)
     {
-#if defined(__HCC__)
-        output.reg[i] = __llvm_amdgcn_move_dpp(input.reg[i], DppCtrl, RowMask, BankMask, BoundCtrl);
-#else
         output.reg[i] = __hip_move_dpp(input.reg[i], DppCtrl, RowMask, BankMask, BoundCtrl);
-#endif
     }
     return output.data;
 }

@@ -310,4 +308,4 @@ void reduce(hipStream_t stream,
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

-#endif
+#endif // MIGRAPHX_NO_DPP
src/targets/gpu/fuse_ops.cpp

@@ -553,11 +553,13 @@ struct find_gemm_pointwise
 {
     auto matcher() const
     {
-        return precompile_name("pointwise")(
-            match::nargs(3),
-            match::either_arg(0, 1)(
-                match::any_of(match::standard_shape(), match::is_constant()).bind("c"),
-                match::name("gpu::gemm")(match::nargs(3), match::used_once()).bind("gemm")));
+        auto gemm_op   = match::name("gpu::gemm")(match::nargs(3), match::used_once()).bind("gemm");
+        auto binary_op = match::all_of(
+            match::nargs(3),
+            match::either_arg(0, 1)(
+                match::any_of(match::standard_shape(), match::is_constant()).bind("c"), gemm_op));
+        auto unary_op  = match::all_of(match::nargs(2), match::arg(0)(gemm_op));
+        return precompile_name("pointwise")(match::any_of(binary_op, unary_op));
     }
     // TODO: Move to matcher.hpp

@@ -589,61 +591,84 @@ struct find_gemm_pointwise
         return match::name("@return")(match::args(match::any_of(add, mul_add, add_mul)));
     }

+    static auto match_mul(const std::string& input)
+    {
+        auto mul = match_mul_const(match_param(input), "alpha");
+        return match::name("@return")(match::args(mul));
+    }
+
     static float get_float(instruction_ref ins) { return ins->get_literal().at<float>(); }

     template <class Gemm>
     static bool update_gemm(Gemm& gemm, module_ref pm, unsigned input)
     {
         auto names = pm->get_parameter_names();
-        if(names.size() != 2)
-            return false;
         std::sort(names.begin(), names.end());
-        unsigned output = input == 0 ? 1 : 0;
-        auto mr         = match::match_instruction(
-            *pm, std::prev(pm->end()), match_add(names[input], names[output]));
-        if(mr.result == pm->end())
-            return false;
-        if(contains(mr.instructions, "alpha_mul"))
+        if(names.size() == 1)
+        {
+            auto mr = match::match_instruction(*pm, std::prev(pm->end()), match_mul(names[input]));
+            if(mr.result == pm->end())
+                return false;
             gemm.alpha *= get_float(mr.instructions["alpha"]);
-        else if(contains(mr.instructions, "beta_mul"))
-            gemm.beta *= get_float(mr.instructions["beta"]);
-        else if(contains(mr.instructions, "gamma_mul"))
+            return true;
+        }
+        else if(names.size() == 2)
         {
-            gemm.alpha *= get_float(mr.instructions["gamma"]);
-            gemm.beta *= get_float(mr.instructions["gamma"]);
+            unsigned output = input == 0 ? 1 : 0;
+            auto mr         = match::match_instruction(
+                *pm, std::prev(pm->end()), match_add(names[input], names[output]));
+            if(mr.result == pm->end())
+                return false;
+            if(contains(mr.instructions, "alpha_mul"))
+                gemm.alpha *= get_float(mr.instructions["alpha"]);
+            else if(contains(mr.instructions, "beta_mul"))
+                gemm.beta *= get_float(mr.instructions["beta"]);
+            else if(contains(mr.instructions, "gamma_mul"))
+            {
+                gemm.alpha *= get_float(mr.instructions["gamma"]);
+                gemm.beta *= get_float(mr.instructions["gamma"]);
+            }
+            return true;
+        }
+        else
+        {
+            return false;
         }
-        return true;
     }

     void apply(module& m, const match::matcher_result& r) const
     {
         auto ins      = r.result;
         auto gemm_ins = r.instructions["gemm"];
-        auto c_ins    = r.instructions["c"];
         auto gemm     = any_cast<rocblas_gemm<op::dot>>(gemm_ins->get_operator());
         // Already fused gemm
         if(not float_equal(gemm.beta, 0))
             return;
-        gemm.beta = 1;
+        if(ins->inputs().size() == 3)
+            gemm.beta = 1;
         if(not update_gemm(
                gemm, ins->module_inputs().front(), ins->inputs().front() == gemm_ins ? 0 : 1))
             return;
-        // const-fold input if not standard shape since rocblas can't handle it
-        if(not c_ins->get_shape().standard())
-        {
-            auto c = make_op("contiguous");
-            auto l = c.compute(c.compute_shape({c_ins->get_shape()}), {c_ins->eval()});
-            c_ins  = m.add_literal(l.get_shape(), l.data());
-        }
         auto inputs = gemm_ins->inputs();
         inputs.pop_back();
-        inputs.push_back(c_ins);
+        if(ins->inputs().size() == 3)
+        {
+            auto c_ins = r.instructions["c"];
+            // const-fold input if not standard shape since rocblas can't handle it
+            if(not c_ins->get_shape().standard())
+            {
+                auto c = make_op("contiguous");
+                auto l = c.compute(c.compute_shape({c_ins->get_shape()}), {c_ins->eval()});
+                c_ins  = m.add_literal(l.get_shape(), l.data());
+            }
+            inputs.push_back(c_ins);
+        }
        inputs.push_back(ins->inputs().back());
        m.replace_instruction(ins, gemm, inputs);
src/targets/gpu/include/migraphx/gpu/context.hpp

@@ -30,6 +30,7 @@
 #include <migraphx/gpu/hip.hpp>
 #include <migraphx/env.hpp>
 #include <migraphx/config.hpp>
+#include <migraphx/gpu/device_name.hpp>
 #include <unordered_map>
 #include <memory>

@@ -215,6 +216,10 @@ struct context
         return *current_device;
     }

+    bool get_exhaustive_tune_flag() const { return exhaustive_tune; }
+
+    void set_exhaustive_tune_flag(bool t) { exhaustive_tune = t; }
+
     hip_device::stream& get_stream() { return get_current_device().get_stream(); }
     hip_device::stream& get_stream(std::size_t n) { return get_current_device().get_stream(n); }

@@ -273,7 +278,8 @@ struct context
         auto v_streams        = v.at("streams");
         std::size_t n_streams = v_streams.without_key().to<std::size_t>();
-        this->current_device  = std::make_shared<hip_device>(0, n_streams);
+        auto device           = get_device_id();
+        this->current_device  = std::make_shared<hip_device>(device, n_streams);
     }

     void wait_for(any_ptr queue)

@@ -336,7 +342,8 @@ struct context
     // TODO: Make this a vector to support multiple devices
     std::shared_ptr<hip_device> current_device;
     std::vector<shared<hip_event_ptr>> events;
-    bool measure_perf = false;
+    bool exhaustive_tune = false;
+    bool measure_perf    = false;

     // for event perf timing
     shared<hip_event_ptr> start_event = nullptr;
     shared<hip_event_ptr> stop_event  = nullptr;
src/targets/gpu/include/migraphx/gpu/convolution.hpp

@@ -175,8 +175,9 @@ struct miopen_convolution
         auto* miopen_stream_handle = ctx.get_stream().get_miopen();
-        solution_ptr               = find_solution(miopen_stream_handle, conv_problem.get());
-        auto status = miopenGetSolutionWorkspaceSize(solution_ptr.get(), &workspace_size);
+        solution_ptr               = find_solution(
+            miopen_stream_handle, conv_problem.get(), ctx.get_exhaustive_tune_flag());
+        auto status = miopenGetSolutionWorkspaceSize(solution_ptr.get(), &workspace_size);
         if(status != miopenStatusSuccess)
             MIGRAPHX_THROW("MIOpen" + op.name() + " : failed to get solution's workspace size");

@@ -233,7 +234,7 @@ struct miopen_convolution
                                          &perf,
                                          workspace.implicit(),
                                          workspace_size,
-                                         false);
+                                         ctx.get_exhaustive_tune_flag());
         if(status != miopenStatusSuccess)
             MIGRAPHX_THROW("MIOpen " + op.name() + " : find convolution failed");
         algo = perf.fwd_algo;
src/targets/gpu/include/migraphx/gpu/miopen.hpp

@@ -75,12 +75,19 @@ using miopen_find_options = MIGRAPHX_MANAGE_PTR(miopenFindOptions_t, miopenDestr
 using miopen_problem  = MIGRAPHX_MANAGE_PTR(miopenProblem_t, miopenDestroyProblem);
 using miopen_solution = MIGRAPHX_MANAGE_PTR(miopenSolution_t, miopenDestroySolution);

-inline miopen_solution find_solution(miopenHandle_t handle, miopenProblem_t problem)
+inline miopen_solution find_solution(miopenHandle_t handle, miopenProblem_t problem, bool tune = false)
 {
     miopenSolution_t solution;
-    size_t found = 0;
-    auto status  = miopenFindSolutions(handle, problem, nullptr, &solution, &found, 1);
-    auto result  = miopen_solution{solution};
+    size_t found           = 0;
+    miopen_find_options fo = nullptr;
+    if(tune)
+    {
+        fo = make_obj<miopen_find_options>(&miopenCreateFindOptions);
+        miopenSetFindOptionTuning(fo.get(), 1);
+    }
+    auto status = miopenFindSolutions(handle, problem, fo.get(), &solution, &found, 1);
+    auto result = miopen_solution{solution};
     if(status != miopenStatusSuccess or found == 0)
         MIGRAPHX_THROW("MIOpen miopenFindSolutions failed");
     return result;
src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp

@@ -30,14 +30,14 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

-struct module;
+struct module_pass_manager;

 namespace gpu {

 struct prefuse_ops
 {
     std::string name() const { return "gpu::prefuse_ops"; }
-    void apply(module& m) const;
+    void apply(module_pass_manager& mpm) const;
 };

 } // namespace gpu
src/targets/gpu/jit/reduce.cpp

@@ -118,17 +118,17 @@ struct reduce_compiler : compiler<reduce_compiler>
         options.virtual_inputs = reduce_dims(inputs);
         auto faxis             = find_fast_axis({options.virtual_inputs.front()});
         vectorize vec{};
-        // Vectorize if the axis is a reduction axis
-        if(options.virtual_inputs.back().lens()[faxis] == 1)
-        {
-            vec = vectorize::elements(ctx, faxis, options.virtual_inputs);
-        }
-        auto relements = get_reduce_elements(options.virtual_inputs) / vec.size;
         auto nelements = options.virtual_inputs.back().elements();
         auto algo      = v.get("algo", get_reduce_algo(options.virtual_inputs));
         if(algo == "block")
         {
+            // Vectorize if the axis is a reduction axis
+            if(options.virtual_inputs.back().lens()[faxis] == 1)
+                vec = vectorize::elements(ctx, faxis, options.virtual_inputs);
+            auto relements  = get_reduce_elements(options.virtual_inputs) / vec.size;
             auto block_size = compute_block_size(relements, 256);
             if(relements > block_size * 256)
                 algo = "block_large";
             options.set_launch_params(
                 v, compute_global_for(ctx, nelements * block_size, 256), block_size);
         }

@@ -156,16 +156,25 @@ struct reduce_compiler : compiler<reduce_compiler>
     compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
     {
-        value v              = value::object{};
-        auto reduce_elements = get_reduce_elements(ins->inputs());
+        value v = value::object{};
         if(op.name() == "reduce_sum")
         {
             v["reduction"] = "op::sum{}";
         }
         else if(op.name() == "reduce_mean")
         {
-            v["reduction"] = "op::sum{}";
-            v["write"]     = "op::mean{" + std::to_string(reduce_elements) + "}";
+            auto reduce_elements = get_reduce_elements(ins->inputs());
+            auto reduce_type     = ins->inputs().front()->get_shape().type();
+            v["reduction"]       = "op::sum{}";
+            std::string mean     = "op::mean<" + std::to_string(reduce_elements) + ">{}";
+            // Use float accumulator when reduction size is too large for half
+            if(reduce_type == shape::half_type and reduce_elements > 16384)
+                v["read"] = "compose(" + mean + ", op::convert_to<float>{})";
+            else if(contains({shape::float_type, shape::half_type, shape::double_type}, reduce_type))
+                v["read"] = mean;
+            else
+                v["write"] = mean;
         }
         else if(op.name() == "reduce_max")
         {
src/targets/gpu/kernels/include/migraphx/kernels/array.hpp

@@ -105,7 +105,7 @@ constexpr auto array_for_each(T& x, Ts&... xs)
     }
     else
     {
-        using vec_type = std::remove_reference_t<decltype(array2vec(x))>;
+        using vec_type = remove_reference_t<decltype(array2vec(x))>;
         f(array2vec(x), __builtin_convertvector(array2vec(xs), vec_type)...);
     }
 }
src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp

@@ -178,5 +178,9 @@ MIGRAPHX_HIP_NORETURN inline __host__ __device__ void assert_fail(const source_l
 #define MIGRAPHX_WARN(...)
 #endif

+#define MIGRAPHX_STATIC_ASSERT_FOR(...) \
+    static_assert(__VA_ARGS__);         \
+    if constexpr(__VA_ARGS__)
+
 } // namespace migraphx
 #endif // MIGRAPHX_GUARD_KERNELS_DEBUG_HPP
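A hypothetical usage sketch of the new macro (illustration only, not from this commit): it asserts the condition, and the trailing if constexpr keeps the guarded block from being instantiated when the condition is false, so a failure reports a single static_assert error instead of a cascade from the body. index_int is assumed from the existing kernels headers; fill_small_buffer is a made-up example function.

// Kernel-side sketch of MIGRAPHX_STATIC_ASSERT_FOR.
template <migraphx::index_int N>
__device__ void fill_small_buffer(float* buf)
{
    MIGRAPHX_STATIC_ASSERT_FOR(N <= 1024)
    {
        for(migraphx::index_int i = 0; i < N; i++)
            buf[i] = 0.0f; // body only compiled when the assertion holds
    }
}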
src/targets/gpu/kernels/include/migraphx/kernels/dpp.hpp

@@ -72,7 +72,7 @@ __device__ T dpp_mov(T& x)
     }
     return output.data;
 }
-#endif
+#endif // MIGRAPHX_HAS_DPP

 } // namespace migraphx
 #endif // MIGRAPHX_GUARD_KERNELS_DPP_HPP
src/targets/gpu/kernels/include/migraphx/kernels/functional.hpp

@@ -187,6 +187,14 @@ constexpr auto fold(F f)
     return [=](auto&&... xs) { return fold_impl(f, static_cast<decltype(xs)&&>(xs)...); };
 }

+template <class... Fs>
+constexpr auto compose(Fs... fs)
+{
+    return fold([](auto f, auto g) {
+        return [=](auto&&... xs) { return f(g(static_cast<decltype(xs)>(xs)...)); };
+    })(fs...);
+}
+
 template <class... Ts>
 constexpr auto pack(Ts... xs)
 {