Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
MIGraphX
Commits
3c95b34d
Commit
3c95b34d
authored
Apr 04, 2022
by
Shucai Xiao
Browse files
Merge branch 'layernorm_half2' into branch_for_ort2
parents
af110526
789f86fb
Changes
61
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
334 additions
and
383 deletions
+334
-383
src/include/migraphx/pass.hpp
src/include/migraphx/pass.hpp
+15
-11
src/include/migraphx/schedule_model.hpp
src/include/migraphx/schedule_model.hpp
+37
-31
src/include/migraphx/stream_model.hpp
src/include/migraphx/stream_model.hpp
+21
-14
src/include/migraphx/target.hpp
src/include/migraphx/target.hpp
+21
-14
src/include/migraphx/value.hpp
src/include/migraphx/value.hpp
+16
-7
src/py/migraphx_py.cpp
src/py/migraphx_py.cpp
+1
-1
src/register_target.cpp
src/register_target.cpp
+12
-2
src/targets/gpu/CMakeLists.txt
src/targets/gpu/CMakeLists.txt
+9
-3
src/targets/gpu/compile_hip.cpp
src/targets/gpu/compile_hip.cpp
+10
-8
src/targets/gpu/compile_hip_code_object.cpp
src/targets/gpu/compile_hip_code_object.cpp
+26
-0
src/targets/gpu/compile_ops.cpp
src/targets/gpu/compile_ops.cpp
+6
-33
src/targets/gpu/compile_pointwise.cpp
src/targets/gpu/compile_pointwise.cpp
+0
-83
src/targets/gpu/compiler.cpp
src/targets/gpu/compiler.cpp
+39
-0
src/targets/gpu/device/layernorm.cpp
src/targets/gpu/device/layernorm.cpp
+98
-124
src/targets/gpu/driver/CMakeLists.txt
src/targets/gpu/driver/CMakeLists.txt
+2
-6
src/targets/gpu/driver/compile_op.cpp
src/targets/gpu/driver/compile_op.cpp
+3
-3
src/targets/gpu/include/migraphx/gpu/compile_hip.hpp
src/targets/gpu/include/migraphx/gpu/compile_hip.hpp
+0
-2
src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp
...gets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp
+18
-0
src/targets/gpu/include/migraphx/gpu/compile_pointwise.hpp
src/targets/gpu/include/migraphx/gpu/compile_pointwise.hpp
+0
-24
src/targets/gpu/include/migraphx/gpu/compile_roialign.hpp
src/targets/gpu/include/migraphx/gpu/compile_roialign.hpp
+0
-17
No files found.
src/include/migraphx/pass.hpp
View file @
3c95b34d
...
...
@@ -58,17 +58,20 @@ void module_pass_manager_apply(const T& x, module_pass_manager& mpm)
}
// namespace detail
/*
* Type-erased interface for:
*
* struct pass
* {
* std::string name() const;
* void apply(module_pass_manager & mpm) const;
* void apply(program & p) const;
* };
*
*/
#ifdef TYPE_ERASED_DECLARATION
// Type-erased interface for:
struct
pass
{
//
std
::
string
name
()
const
;
// (optional)
void
apply
(
module_pass_manager
&
mpm
)
const
;
// (optional)
void
apply
(
program
&
p
)
const
;
};
#else
struct
pass
{
...
...
@@ -303,6 +306,7 @@ inline const ValueType& any_cast(const pass& x)
throw
std
::
bad_cast
();
return
*
y
;
}
#endif
#endif
...
...
src/include/migraphx/schedule_model.hpp
View file @
3c95b34d
...
...
@@ -26,30 +26,35 @@ struct schedule_model
/// Get the number of concurrent instruction allowed
std
::
size_t
concurrency
()
const
;
/// Schedule a concurrent instruction
void
sched
(
module
&
p
,
instruction_ref
ins
,
std
::
size_t
n
)
const
;
void
sched
(
module
&
m
,
instruction_ref
ins
,
std
::
size_t
n
)
const
;
// Insert necessary waits before an instruction
void
wait
(
module
&
p
,
instruction_ref
ins
,
std
::
size_t
wait_id
)
const
;
void
wait
(
module
&
m
,
instruction_ref
ins
,
std
::
size_t
wait_id
)
const
;
// Insert necessary records after an instruction
void
record
(
module
&
p
,
instruction_ref
ins
,
std
::
size_t
wait_id
)
const
;
void
record
(
module
&
m
,
instruction_ref
ins
,
std
::
size_t
wait_id
)
const
;
/// Compute weights for an operation
std
::
size_t
weight
(
const
operation
&
op
)
const
;
};
#else
/*
* Type-erased interface for:
*
* struct schedule_model
* {
* std::size_t concurrency() const;
* void sched(module& p,instruction_ref ins,std::size_t n) const;
* void wait(module& p,instruction_ref ins,std::size_t wait_id) const;
* void record(module& p,instruction_ref ins,std::size_t wait_id) const;
* std::size_t weight(const operation& op) const;
* };
*
*/
#ifdef TYPE_ERASED_DECLARATION
// Type-erased interface for:
struct
schedule_model
{
//
std
::
size_t
concurrency
()
const
;
//
void
sched
(
module
&
m
,
instruction_ref
ins
,
std
::
size_t
n
)
const
;
//
void
wait
(
module
&
m
,
instruction_ref
ins
,
std
::
size_t
wait_id
)
const
;
//
void
record
(
module
&
m
,
instruction_ref
ins
,
std
::
size_t
wait_id
)
const
;
//
std
::
size_t
weight
(
const
operation
&
op
)
const
;
};
#else
struct
schedule_model
{
...
...
@@ -120,22 +125,22 @@ struct schedule_model
return
(
*
this
).
private_detail_te_get_handle
().
concurrency
();
}
void
sched
(
module
&
p
,
instruction_ref
ins
,
std
::
size_t
n
)
const
void
sched
(
module
&
m
,
instruction_ref
ins
,
std
::
size_t
n
)
const
{
assert
((
*
this
).
private_detail_te_handle_mem_var
);
(
*
this
).
private_detail_te_get_handle
().
sched
(
p
,
ins
,
n
);
(
*
this
).
private_detail_te_get_handle
().
sched
(
m
,
ins
,
n
);
}
void
wait
(
module
&
p
,
instruction_ref
ins
,
std
::
size_t
wait_id
)
const
void
wait
(
module
&
m
,
instruction_ref
ins
,
std
::
size_t
wait_id
)
const
{
assert
((
*
this
).
private_detail_te_handle_mem_var
);
(
*
this
).
private_detail_te_get_handle
().
wait
(
p
,
ins
,
wait_id
);
(
*
this
).
private_detail_te_get_handle
().
wait
(
m
,
ins
,
wait_id
);
}
void
record
(
module
&
p
,
instruction_ref
ins
,
std
::
size_t
wait_id
)
const
void
record
(
module
&
m
,
instruction_ref
ins
,
std
::
size_t
wait_id
)
const
{
assert
((
*
this
).
private_detail_te_handle_mem_var
);
(
*
this
).
private_detail_te_get_handle
().
record
(
p
,
ins
,
wait_id
);
(
*
this
).
private_detail_te_get_handle
().
record
(
m
,
ins
,
wait_id
);
}
std
::
size_t
weight
(
const
operation
&
op
)
const
...
...
@@ -159,9 +164,9 @@ struct schedule_model
virtual
const
std
::
type_info
&
type
()
const
=
0
;
virtual
std
::
size_t
concurrency
()
const
=
0
;
virtual
void
sched
(
module
&
p
,
instruction_ref
ins
,
std
::
size_t
n
)
const
=
0
;
virtual
void
wait
(
module
&
p
,
instruction_ref
ins
,
std
::
size_t
wait_id
)
const
=
0
;
virtual
void
record
(
module
&
p
,
instruction_ref
ins
,
std
::
size_t
wait_id
)
const
=
0
;
virtual
void
sched
(
module
&
m
,
instruction_ref
ins
,
std
::
size_t
n
)
const
=
0
;
virtual
void
wait
(
module
&
m
,
instruction_ref
ins
,
std
::
size_t
wait_id
)
const
=
0
;
virtual
void
record
(
module
&
m
,
instruction_ref
ins
,
std
::
size_t
wait_id
)
const
=
0
;
virtual
std
::
size_t
weight
(
const
operation
&
op
)
const
=
0
;
};
...
...
@@ -195,22 +200,22 @@ struct schedule_model
std
::
size_t
concurrency
()
const
override
{
return
private_detail_te_value
.
concurrency
();
}
void
sched
(
module
&
p
,
instruction_ref
ins
,
std
::
size_t
n
)
const
override
void
sched
(
module
&
m
,
instruction_ref
ins
,
std
::
size_t
n
)
const
override
{
private_detail_te_value
.
sched
(
p
,
ins
,
n
);
private_detail_te_value
.
sched
(
m
,
ins
,
n
);
}
void
wait
(
module
&
p
,
instruction_ref
ins
,
std
::
size_t
wait_id
)
const
override
void
wait
(
module
&
m
,
instruction_ref
ins
,
std
::
size_t
wait_id
)
const
override
{
private_detail_te_value
.
wait
(
p
,
ins
,
wait_id
);
private_detail_te_value
.
wait
(
m
,
ins
,
wait_id
);
}
void
record
(
module
&
p
,
instruction_ref
ins
,
std
::
size_t
wait_id
)
const
override
void
record
(
module
&
m
,
instruction_ref
ins
,
std
::
size_t
wait_id
)
const
override
{
private_detail_te_value
.
record
(
p
,
ins
,
wait_id
);
private_detail_te_value
.
record
(
m
,
ins
,
wait_id
);
}
std
::
size_t
weight
(
const
operation
&
op
)
const
override
...
...
@@ -283,6 +288,7 @@ inline const ValueType& any_cast(const schedule_model& x)
throw
std
::
bad_cast
();
return
*
y
;
}
#endif
#endif
...
...
src/include/migraphx/stream_model.hpp
View file @
3c95b34d
...
...
@@ -36,20 +36,26 @@ struct stream_model
#else
/*
* Type-erased interface for:
*
* struct stream_model
* {
* std::size_t get_nstream() const;
* std::size_t get_stream(instruction_ref ins) const;
* std::size_t get_event_id(instruction_ref ins) const;
* bool has_stream(instruction_ref ins) const;
* bool is_record(instruction_ref ins) const;
* bool is_wait(instruction_ref ins) const;
* };
*
*/
#ifdef TYPE_ERASED_DECLARATION
// Type-erased interface for:
struct
stream_model
{
//
std
::
size_t
get_nstream
()
const
;
//
std
::
size_t
get_stream
(
instruction_ref
ins
)
const
;
//
std
::
size_t
get_event_id
(
instruction_ref
ins
)
const
;
//
bool
has_stream
(
instruction_ref
ins
)
const
;
//
bool
is_record
(
instruction_ref
ins
)
const
;
//
bool
is_wait
(
instruction_ref
ins
)
const
;
};
#else
struct
stream_model
{
...
...
@@ -296,6 +302,7 @@ inline const ValueType& any_cast(const stream_model& x)
throw
std
::
bad_cast
();
return
*
y
;
}
#endif
#endif
...
...
src/include/migraphx/target.hpp
View file @
3c95b34d
...
...
@@ -82,20 +82,26 @@ argument copy_from_target(T&, const argument& arg)
return
arg
;
}
/*
* Type-erased interface for:
*
* struct target
* {
* std::string name() const;
* std::vector<pass> get_passes(context& ctx,const compile_options& options) const;
* context get_context() const;
* argument copy_to(const argument& input) const;
* argument copy_from(const argument& input) const;
* argument allocate(const shape& s) const;
* };
*
*/
#ifdef TYPE_ERASED_DECLARATION
// Type-erased interface for:
struct
target
{
//
std
::
string
name
()
const
;
//
std
::
vector
<
pass
>
get_passes
(
context
&
ctx
,
const
compile_options
&
options
)
const
;
//
context
get_context
()
const
;
// (optional)
argument
copy_to
(
const
argument
&
input
)
const
;
// (optional)
argument
copy_from
(
const
argument
&
input
)
const
;
// (optional)
argument
allocate
(
const
shape
&
s
)
const
;
};
#else
struct
target
{
...
...
@@ -382,6 +388,7 @@ inline const ValueType& any_cast(const target& x)
throw
std
::
bad_cast
();
return
*
y
;
}
#endif
#endif
...
...
src/include/migraphx/value.hpp
View file @
3c95b34d
...
...
@@ -178,6 +178,7 @@ struct value
value
(
std
::
nullptr_t
);
value
(
const
char
*
i
);
value
(
const
std
::
string
&
pkey
,
const
char
*
i
);
#define MIGRAPHX_VALUE_GENERATE_DECL_METHODS(vt, cpp_type) \
value(cpp_type i); \
...
...
@@ -188,6 +189,12 @@ struct value
const cpp_type* if_##vt() const;
MIGRAPHX_VISIT_VALUE_TYPES
(
MIGRAPHX_VALUE_GENERATE_DECL_METHODS
)
template
<
class
T
>
using
literal_to_string
=
std
::
conditional_t
<
(
std
::
is_convertible
<
T
,
const
char
*>
{}
and
std
::
is_convertible
<
T
,
std
::
string
>
{}),
std
::
string
,
T
>
;
template
<
class
T
>
using
pick_numeric
=
std
::
conditional_t
<
std
::
is_floating_point
<
T
>
{},
...
...
@@ -246,6 +253,7 @@ struct value
return
*
this
=
from_values
(
rhs
);
// NOLINT
}
value
&
operator
=
(
const
char
*
c
);
value
&
operator
=
(
std
::
nullptr_t
);
value
&
operator
=
(
const
std
::
initializer_list
<
value
>&
i
);
...
...
@@ -370,11 +378,11 @@ struct value
}
template
<
class
To
>
To
value_or
(
const
To
&
default_value
)
const
literal_to_string
<
To
>
value_or
(
const
To
&
default_value
)
const
{
if
(
this
->
is_null
())
return
default_value
;
return
to
<
To
>
();
return
to
<
literal_to_string
<
To
>
>
();
}
template
<
class
To
>
...
...
@@ -390,12 +398,12 @@ struct value
}
template
<
class
To
>
To
get
(
const
std
::
string
&
pkey
,
const
To
&
default_value
)
const
literal_to_string
<
To
>
get
(
const
std
::
string
&
pkey
,
const
To
&
default_value
)
const
{
const
auto
*
v
=
find
(
pkey
);
if
(
v
==
this
->
end
())
return
default_value
;
return
v
->
to
<
To
>
();
return
v
->
to
<
literal_to_string
<
To
>
>
();
}
template
<
class
To
>
...
...
@@ -408,10 +416,11 @@ struct value
}
template
<
class
To
>
std
::
vector
<
To
>
get
(
const
std
::
string
&
pkey
,
const
std
::
initializer_list
<
To
>&
default_value
)
const
std
::
vector
<
literal_to_string
<
To
>
>
get
(
const
std
::
string
&
pkey
,
const
std
::
initializer_list
<
To
>&
default_value
)
const
{
return
get
<
std
::
vector
<
To
>>
(
pkey
,
default_value
);
return
get
(
pkey
,
std
::
vector
<
literal_to_string
<
To
>>
{
default_value
.
begin
(),
default_value
.
end
()});
}
friend
bool
operator
==
(
const
value
&
x
,
const
value
&
y
);
...
...
src/py/migraphx_py.cpp
View file @
3c95b34d
...
...
@@ -213,7 +213,7 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
py
::
class_
<
migraphx
::
shape
>
(
m
,
"shape"
)
.
def
(
py
::
init
([](
py
::
kwargs
kwargs
)
{
auto
v
=
migraphx
::
to_value
(
kwargs
);
auto
t
=
migraphx
::
shape
::
parse_type
(
v
.
get
(
"type"
,
std
::
string
{
"float"
}
));
auto
t
=
migraphx
::
shape
::
parse_type
(
v
.
get
(
"type"
,
"float"
));
auto
lens
=
v
.
get
<
std
::
size_t
>
(
"lens"
,
{
1
});
if
(
v
.
contains
(
"strides"
))
return
migraphx
::
shape
(
t
,
lens
,
v
.
at
(
"strides"
).
to_vector
<
std
::
size_t
>
());
...
...
src/register_target.cpp
View file @
3c95b34d
#include <migraphx/register_target.hpp>
#include <unordered_map>
#include <migraphx/register_target.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
...
...
@@ -11,7 +11,17 @@ std::unordered_map<std::string, target>& target_map()
}
void
register_target
(
const
target
&
t
)
{
target_map
()[
t
.
name
()]
=
t
;
}
target
make_target
(
const
std
::
string
&
name
)
{
return
target_map
().
at
(
name
);
}
target
make_target
(
const
std
::
string
&
name
)
{
const
auto
it
=
target_map
().
find
(
name
);
if
(
it
==
target_map
().
end
())
{
MIGRAPHX_THROW
(
"Requested target '"
+
name
+
"' is not enabled or not supported"
);
}
return
it
->
second
;
}
std
::
vector
<
std
::
string
>
get_targets
()
{
std
::
vector
<
std
::
string
>
result
;
...
...
src/targets/gpu/CMakeLists.txt
View file @
3c95b34d
...
...
@@ -119,6 +119,7 @@ target_link_libraries(kernel_file_check compile_for_gpu)
rocm_clang_tidy_check
(
kernel_file_check
)
file
(
GLOB JIT_GPU_SRCS
${
CONFIGURE_DEPENDS
}
${
CMAKE_CURRENT_SOURCE_DIR
}
/jit/*.cpp
)
add_library
(
migraphx_gpu
abs.cpp
analyze_streams.cpp
...
...
@@ -131,9 +132,7 @@ add_library(migraphx_gpu
compile_ops.cpp
compile_hip.cpp
compile_hip_code_object.cpp
compile_pointwise.cpp
compile_roialign.cpp
compile_scatternd.cpp
compiler.cpp
concat.cpp
convert.cpp
convolution.cpp
...
...
@@ -171,6 +170,7 @@ add_library(migraphx_gpu
target.cpp
topk.cpp
write_literals.cpp
${
JIT_GPU_SRCS
}
)
set_target_properties
(
migraphx_gpu PROPERTIES EXPORT_NAME gpu
)
...
...
@@ -331,6 +331,12 @@ target_compile_definitions(migraphx_gpu PRIVATE
"-DMIGRAPHX_EXTRACT_KERNEL=
${
MIGRAPHX_EXTRACT_KERNEL
}
"
"-DMIGRAPHX_USE_HIPRTC=0"
)
if
(
DEFINED CMAKE_CXX_COMPILER_LAUNCHER
)
execute_process
(
COMMAND which
${
CMAKE_CXX_COMPILER_LAUNCHER
}
OUTPUT_VARIABLE MIGRAPHX_HIP_COMPILER_LAUNCHER
)
string
(
STRIP
"
${
MIGRAPHX_HIP_COMPILER_LAUNCHER
}
"
MIGRAPHX_HIP_COMPILER_LAUNCHER
)
target_compile_definitions
(
migraphx_gpu PRIVATE
"-DMIGRAPHX_HIP_COMPILER_LAUNCHER=
${
MIGRAPHX_HIP_COMPILER_LAUNCHER
}
"
)
endif
()
endif
()
# Check miopen find mode api
...
...
src/targets/gpu/compile_hip.cpp
View file @
3c95b34d
...
...
@@ -178,6 +178,12 @@ bool is_hip_clang_compiler()
return
result
;
}
bool
has_compiler_launcher
()
{
static
const
auto
result
=
fs
::
exists
(
MIGRAPHX_STRINGIZE
(
MIGRAPHX_HIP_COMPILER_LAUNCHER
));
return
result
;
}
std
::
vector
<
std
::
vector
<
char
>>
compile_hip_src
(
const
std
::
vector
<
src_file
>&
srcs
,
std
::
string
params
,
const
std
::
string
&
arch
)
{
...
...
@@ -210,6 +216,10 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
src_compiler
compiler
;
compiler
.
flags
=
params
;
compiler
.
compiler
=
MIGRAPHX_STRINGIZE
(
MIGRAPHX_HIP_COMPILER
);
#ifdef MIGRAPHX_HIP_COMPILER_LAUNCHER
if
(
has_compiler_launcher
())
compiler
.
launcher
=
MIGRAPHX_STRINGIZE
(
MIGRAPHX_HIP_COMPILER_LAUNCHER
);
#endif
if
(
is_hcc_compiler
())
compiler
.
process
=
[
&
](
const
fs
::
path
&
obj_path
)
->
fs
::
path
{
...
...
@@ -238,14 +248,6 @@ std::string enum_params(std::size_t count, std::string param)
return
join_strings
(
items
,
","
);
}
std
::
size_t
compute_global
(
std
::
size_t
n
,
std
::
size_t
local
)
{
std
::
size_t
groups
=
(
n
+
local
-
1
)
/
local
;
// max possible number of blocks is set to 1B (1,073,741,824)
std
::
size_t
nglobal
=
std
::
min
<
std
::
size_t
>
(
1073741824
,
groups
)
*
local
;
return
nglobal
;
}
#endif // MIGRAPHX_USE_HIPRTC
}
// namespace gpu
...
...
src/targets/gpu/compile_hip_code_object.cpp
View file @
3c95b34d
...
...
@@ -93,6 +93,32 @@ const std::vector<std::string>& compiler_warnings()
return
warnings
;
}
void
hip_compile_options
::
set_launch_params
(
const
value
&
v
,
const
std
::
function
<
std
::
size_t
(
std
::
size_t
local
)
>&
compute_global
,
std
::
size_t
default_local
)
{
local
=
v
.
get
(
"local"
,
default_local
);
if
(
v
.
contains
(
"global"
))
global
=
v
.
at
(
"global"
).
to
<
std
::
size_t
>
();
else
global
=
compute_global
(
local
);
}
std
::
function
<
std
::
size_t
(
std
::
size_t
local
)
>
compute_global_for
(
context
&
ctx
,
std
::
size_t
n
,
std
::
size_t
over
)
{
assert
(
over
>
0
);
std
::
size_t
max_global
=
ctx
.
get_current_device
().
get_cu_count
()
*
ctx
.
get_current_device
().
get_max_workitems_per_cu
();
return
[
n
,
over
,
max_global
](
std
::
size_t
local
)
{
std
::
size_t
groups
=
(
n
+
local
-
1
)
/
local
;
std
::
size_t
max_blocks
=
max_global
/
local
;
std
::
size_t
nglobal
=
std
::
min
(
max_blocks
*
over
,
groups
)
*
local
;
return
nglobal
;
};
}
operation
compile_hip_code_object
(
const
std
::
string
&
content
,
hip_compile_options
options
)
{
std
::
vector
<
src_file
>
srcs
;
...
...
src/targets/gpu/compile_ops.cpp
View file @
3c95b34d
...
...
@@ -6,7 +6,7 @@
#include <migraphx/par_for.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/op/identity.hpp>
#include <migraphx/gpu/compile
_pointwise
.hpp>
#include <migraphx/gpu/compile
r
.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
...
...
@@ -40,35 +40,9 @@ struct precompile_op
MIGRAPHX_REGISTER_OP
(
precompile_op
);
struct
pointwise_compiler
{
std
::
string
name
()
const
{
return
"pointwise"
;
}
operation
apply
(
context
&
ctx
,
instruction_ref
ins
,
const
operation
&
)
const
{
assert
(
not
ins
->
module_inputs
().
empty
());
auto
*
pm
=
ins
->
module_inputs
().
front
();
return
compile_pointwise
(
ctx
,
to_shapes
(
ins
->
inputs
()),
*
pm
);
}
};
using
compiler_function
=
std
::
function
<
operation
(
context
&
,
instruction_ref
,
operation
)
>
;
template
<
class
T
>
compiler_function
make_compiler_function
(
T
x
)
{
return
{[
=
](
auto
&&
...
xs
)
{
return
x
.
apply
(
xs
...);
}};
}
template
<
class
...
Ts
>
std
::
unordered_map
<
std
::
string
,
compiler_function
>
make_compilers
(
Ts
...
xs
)
{
return
{{
xs
.
name
(),
make_compiler_function
(
xs
)}...};
}
struct
compiled_result
{
operation
op
;
compiler_replace
replace
;
instruction_ref
ins
;
};
...
...
@@ -82,7 +56,6 @@ void par_compile(std::size_t n, F f)
void
compile_ops
::
apply
(
module
&
m
)
const
{
auto
compilers
=
make_compilers
(
pointwise_compiler
{});
std
::
vector
<
std
::
function
<
compiled_result
()
>>
compiles
;
for
(
auto
ins
:
iterator_for
(
m
))
...
...
@@ -90,15 +63,15 @@ void compile_ops::apply(module& m) const
if
(
ins
->
name
()
!=
"gpu::precompile_op"
)
continue
;
operation
preop
=
any_cast
<
precompile_op
>
(
ins
->
get_operator
()).
op
;
assert
(
contains
(
compilers
,
preop
.
name
()));
auto
c
=
compilers
[
preop
.
name
()]
;
compiles
.
emplace_back
([
=
]()
->
compiled_result
{
return
{
c
(
*
ctx
,
ins
,
preop
),
ins
};
});
compiles
.
emplace_back
([
=
]()
->
compiled_result
{
return
{
compile
(
*
ctx
,
ins
,
preop
),
ins
}
;
});
}
std
::
vector
<
compiled_result
>
results
(
compiles
.
size
());
par_compile
(
compiles
.
size
(),
[
&
](
auto
i
)
{
results
[
i
]
=
compiles
[
i
]();
});
for
(
const
auto
&
cr
:
results
)
{
m
.
replace
_instruction
(
cr
.
ins
,
cr
.
op
,
cr
.
ins
->
inputs
()
);
cr
.
replace
(
m
,
cr
.
ins
);
}
}
...
...
src/targets/gpu/compile_pointwise.cpp
deleted
100644 → 0
View file @
af110526
#include <migraphx/gpu/compile_pointwise.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/cpp_generator.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/reduce_dims.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/eliminate_common_subexpression.hpp>
#include <migraphx/module.hpp>
#include <migraphx/pass_manager.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
static
const
char
*
const
pointwise_kernel
=
R"__migraphx__(
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/pointwise.hpp>
#include <args.hpp>
namespace migraphx {
${preamble}
extern "C" {
__global__ void kernel(${params})
{
pointwise(${lambda}, ${args});
}
}
} // namespace migraphx
int main() {}
)__migraphx__"
;
operation
compile_pointwise
(
context
&
,
const
std
::
vector
<
shape
>&
inputs
,
const
std
::
string
&
lambda
,
const
std
::
string
&
preamble
)
{
hip_compile_options
options
;
options
.
global
=
compute_global
(
inputs
.
front
().
elements
());
options
.
local
=
1024
;
options
.
inputs
=
inputs
;
options
.
output
=
inputs
.
back
();
options
.
virtual_inputs
=
reduce_dims
(
inputs
);
options
.
params
=
"-Wno-float-equal"
;
auto
src
=
interpolate_string
(
pointwise_kernel
,
{{
"params"
,
enum_params
(
inputs
.
size
(),
"void * private_p"
)},
{
"args"
,
enum_params
(
inputs
.
size
(),
"private_p"
)},
{
"lambda"
,
lambda
},
{
"preamble"
,
preamble
}});
return
compile_hip_code_object
(
src
,
options
);
}
operation
compile_pointwise
(
context
&
ctx
,
const
std
::
vector
<
shape
>&
inputs
,
module
m
)
{
run_passes
(
m
,
{
eliminate_common_subexpression
{},
dead_code_elimination
{}});
cpp_generator
g
;
g
.
fmap
([](
const
std
::
string
&
fname
)
{
return
"migraphx::"
+
fname
;
});
g
.
add_point_op
(
"where"
,
"${function:where}(${0}, ${1}, ${2})"
);
g
.
add_point_op
(
"prelu"
,
"${function:where}(${0} < 0, ${0} * ${1}, ${0})"
);
g
.
add_point_op
(
"sign"
,
"${function:where}(${0} > 0, 1, ${function:where}(${0} < 0, -1, 0))"
);
g
.
add_point_op
(
"equal"
,
"migraphx::abs(${0} == ${1})"
);
g
.
add_point_op
(
"less"
,
"migraphx::abs(${0} < ${1})"
);
g
.
add_point_op
(
"greater"
,
"migraphx::abs(${0} > ${1})"
);
g
.
add_point_op
(
"not"
,
"migraphx::abs(not ${0})"
);
// Add explict conversions
g
.
fresult
(
[](
const
shape
&
s
)
{
return
"migraphx::convert<"
+
shape
::
cpp_type
(
s
.
type
())
+
">"
;
});
auto
name
=
g
.
create_function
(
g
.
generate_module
(
m
).
set_attributes
({
"__device__"
}).
set_generic_types
(
m
));
return
compile_pointwise
((
ctx
),
inputs
,
"MIGRAPHX_LIFT("
+
name
+
")"
,
g
.
str
());
}
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
src/targets/gpu/compiler.cpp
0 → 100644
View file @
3c95b34d
#include <migraphx/gpu/compiler.hpp>
#include <utility>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
auto
&
compiler_map
()
{
static
std
::
unordered_map
<
std
::
string
,
compiler_compile
>
m
;
// NOLINT
return
m
;
}
auto
&
compiler_op_map
()
{
static
std
::
unordered_map
<
std
::
string
,
compiler_compile_op
>
m
;
// NOLINT
return
m
;
}
void
register_compiler
(
const
std
::
string
&
name
,
compiler_compile
c
,
compiler_compile_op
cop
)
{
compiler_map
()[
name
]
=
std
::
move
(
c
);
compiler_op_map
()[
name
]
=
std
::
move
(
cop
);
}
bool
has_compiler_for
(
const
std
::
string
&
name
)
{
return
compiler_map
().
count
(
name
)
>
0
;
}
compiler_replace
compile
(
context
&
ctx
,
instruction_ref
ins
,
const
operation
&
op
)
{
return
compiler_map
().
at
(
op
.
name
())(
ctx
,
ins
,
op
);
}
operation
compile_op
(
const
std
::
string
&
name
,
context
&
ctx
,
const
std
::
vector
<
shape
>&
inputs
,
const
value
&
v
)
{
return
compiler_op_map
().
at
(
name
)(
ctx
,
inputs
,
v
);
}
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
src/targets/gpu/device/layernorm.cpp
View file @
3c95b34d
...
...
@@ -215,36 +215,22 @@ __device__ __half2 block_reduce_half2(
// m = x - mean(x)
// m / sqrt(mean(m ^ 2) + 1e-12)
__global__
void
triadd_layernorm_kernel_half2
(
void
*
in1
,
void
*
in2
,
void
*
in3
,
void
*
data_out
,
index_int
batch_item_num
,
index_int
block_size
)
__device__
void
layernorm_kernel_half2
(
__half2
*
in_data
,
__half2
*
in_data_reduce
,
__half2
*
out
,
index_int
batch_item_num
,
index_int
block_size
,
float
rbatch_num
)
{
__half2
*
input1
=
reinterpret_cast
<
__half2
*>
(
in1
);
__half2
*
input2
=
reinterpret_cast
<
__half2
*>
(
in2
);
__half2
*
input3
=
reinterpret_cast
<
__half2
*>
(
in3
);
__half2
*
output
=
reinterpret_cast
<
__half2
*>
(
data_out
);
auto
rnum
=
__float2half2_rn
(
1.0
f
/
batch_item_num
);
batch_item_num
/=
2
;
auto
rnum
=
__float2half2_rn
(
rbatch_num
);
extern
MIGRAPHX_DEVICE_SHARED
__half2
buffer2
[];
__half2
*
in_data_reduce
=
buffer2
;
__half2
*
in_data
=
buffer2
+
batch_item_num
;
int
start
=
blockIdx
.
x
*
batch_item_num
;
for
(
int
i
=
threadIdx
.
x
;
i
<
batch_item_num
;
i
+=
block_size
)
{
int
idx
=
i
+
start
;
in_data
[
i
]
=
__hadd2
(
__hadd2
(
input1
[
idx
],
input2
[
idx
]),
input3
[
idx
]);
in_data_reduce
[
i
]
=
in_data
[
i
];
// in_data_reduce[i] = __hmul2(in_data[i], rnum);
}
auto
m
=
block_reduce_half2
(
in_data_reduce
,
batch_item_num
,
threadIdx
.
x
,
block_size
,
half2_sum
{});
m
=
__hmul2
(
m
,
rnum
);
for
(
int
i
=
threadIdx
.
x
;
i
<
batch_item_num
;
i
+=
block_size
)
{
in_data
[
i
]
=
__hsub2
(
in_data
[
i
],
m
);
// in_data_reduce[i] = __hmul2(__hmul2(in_data[i], in_data[i]), rnum);
in_data
[
i
]
=
__hsub2
(
in_data
[
i
],
m
);
in_data_reduce
[
i
]
=
__hmul2
(
in_data
[
i
],
in_data
[
i
]);
}
...
...
@@ -255,11 +241,36 @@ __global__ void triadd_layernorm_kernel_half2(
auto
r
=
__hadd2
(
m
,
eps
);
r
=
h2rsqrt
(
r
);
int
start
=
blockIdx
.
x
*
batch_item_num
;
for
(
int
i
=
threadIdx
.
x
;
i
<
batch_item_num
;
i
+=
block_size
)
{
int
idx
=
i
+
start
;
out
[
idx
]
=
__hmul2
(
in_data
[
i
],
r
);
}
}
__global__
void
triadd_layernorm_half2
(
void
*
in1
,
void
*
in2
,
void
*
in3
,
void
*
data_out
,
index_int
batch_item_num
,
index_int
block_size
)
{
__half2
*
input1
=
reinterpret_cast
<
__half2
*>
(
in1
);
__half2
*
input2
=
reinterpret_cast
<
__half2
*>
(
in2
);
__half2
*
input3
=
reinterpret_cast
<
__half2
*>
(
in3
);
__half2
*
output
=
reinterpret_cast
<
__half2
*>
(
data_out
);
float
rnum
=
1.0
f
/
batch_item_num
;
batch_item_num
/=
2
;
extern
MIGRAPHX_DEVICE_SHARED
__half2
buffer2
[];
__half2
*
in_data_reduce
=
buffer2
;
__half2
*
in_data
=
buffer2
+
batch_item_num
;
int
start
=
blockIdx
.
x
*
batch_item_num
;
for
(
int
i
=
threadIdx
.
x
;
i
<
batch_item_num
;
i
+=
block_size
)
{
int
idx
=
i
+
start
;
output
[
idx
]
=
__hmul2
(
in_data
[
i
],
r
);
int
idx
=
i
+
start
;
in_data
[
i
]
=
__hadd2
(
__hadd2
(
input1
[
idx
],
input2
[
idx
]),
input3
[
idx
]);
in_data_reduce
[
i
]
=
in_data
[
i
];
}
layernorm_kernel_half2
(
in_data
,
in_data_reduce
,
output
,
batch_item_num
,
block_size
,
rnum
);
}
template
<
class
T
>
...
...
@@ -281,105 +292,59 @@ block_reduce_half(T* buffer, index_int batch_item_num, index_int tid, index_int
// m = x - mean(x)
// m / sqrt(mean(m ^ 2) + 1e-12)
__global__
void
triadd_layernorm_kernel_half
(
void
*
in1
,
void
*
in2
,
void
*
in3
,
void
*
data_out
,
index_int
batch_item_num
,
index_int
block_size
)
__device__
void
layernorm_kernel_half
(
__half
*
in_data
,
__half
*
in_data_reduce
,
__half
*
out
,
index_int
batch_item_num
,
index_int
block_size
,
float
rnum
)
{
__half
*
input1
=
reinterpret_cast
<
__half
*>
(
in1
);
__half
*
input2
=
reinterpret_cast
<
__half
*>
(
in2
);
__half
*
input3
=
reinterpret_cast
<
__half
*>
(
in3
);
__half
*
output
=
reinterpret_cast
<
__half
*>
(
data_out
);
extern
MIGRAPHX_DEVICE_SHARED
__half
bufferh
[];
__half
*
in_data_reduce
=
bufferh
;
__half
*
in_data
=
bufferh
+
batch_item_num
;
auto
m
=
block_reduce_half
(
in_data_reduce
,
batch_item_num
,
threadIdx
.
x
,
block_size
);
m
*=
rnum
;
int
start
=
blockIdx
.
x
*
batch_item_num
;
auto
rnum
=
1.0
f
/
batch_item_num
;
for
(
int
i
=
threadIdx
.
x
;
i
<
batch_item_num
;
i
+=
block_size
)
{
int
idx
=
i
+
start
;
in_data
[
i
]
=
__float2half
(
__half2float
(
input1
[
idx
])
+
__half2float
(
input2
[
idx
])
+
__half2float
(
input3
[
idx
]));
in_data_reduce
[
i
]
=
__float2half
(
__half2float
(
in_data
[
i
])
*
__half2float
(
rnum
));
in_data
[
i
]
=
__float2half
(
__half2float
(
in_data
[
i
])
-
__half2float
(
m
));
in_data_reduce
[
i
]
=
__float2half
(
__half2float
(
in_data
[
i
])
*
__half2float
(
in_data
[
i
]));
}
auto
m
=
block_reduce_half
(
in_data_reduce
,
batch_item_num
,
threadIdx
.
x
,
block_size
);
for
(
int
i
=
threadIdx
.
x
;
i
<
batch_item_num
;
i
+=
block_size
)
{
in_data
[
i
]
=
__float2half
(
__half2float
(
in_data
[
i
])
-
__half2float
(
m
));
in_data_reduce
[
i
]
=
__float2half
(
__half2float
(
in_data
[
i
])
*
__half2float
(
in_data
[
i
])
*
__half2float
(
rnum
));
}
m
=
block_reduce_half
(
in_data_reduce
,
batch_item_num
,
threadIdx
.
x
,
block_size
);
m
*=
rnum
;
m
+=
1.0e-12
f
;
m
=
__float2half
(
__half2float
(
block_reduce_half
(
in_data_reduce
,
batch_item_num
,
threadIdx
.
x
,
block_size
))
+
1.0e-12
f
);
auto
r
=
__float2half
(
rsqrt
(
__half2float
(
m
)));
int
start
=
blockIdx
.
x
*
batch_item_num
;
for
(
int
i
=
threadIdx
.
x
;
i
<
batch_item_num
;
i
+=
block_size
)
{
int
idx
=
i
+
start
;
output
[
idx
]
=
__float2half
(
__half2float
(
in_data
[
i
])
*
__half2float
(
r
));
}
}
template
<
class
T
>
__device__
T
block_reduce
(
T
*
buffer
,
index_int
batch_item_num
,
index_int
tid
,
index_int
block_size
)
{
__syncthreads
();
for
(
index_int
s
=
block_size
;
s
>
0
;
s
>>=
1
)
{
if
(
tid
<
s
and
tid
+
s
<
batch_item_num
)
{
buffer
[
tid
]
=
buffer
[
tid
]
+
buffer
[
tid
+
s
];
}
__syncthreads
();
int
idx
=
i
+
start
;
out
[
idx
]
=
__float2half
(
__half2float
(
in_data
[
i
])
*
__half2float
(
r
));
}
return
buffer
[
0
];
}
// m = x - mean(x)
// m / sqrt(mean(m ^ 2) + 1e-12)
template
<
class
T
>
__global__
void
triadd_layernorm_kernel
(
__global__
void
triadd_layernorm_half
(
void
*
in1
,
void
*
in2
,
void
*
in3
,
void
*
data_out
,
index_int
batch_item_num
,
index_int
block_size
)
{
T
*
input1
=
reinterpret_cast
<
T
*>
(
in1
);
T
*
input2
=
reinterpret_cast
<
T
*>
(
in2
);
T
*
input3
=
reinterpret_cast
<
T
*>
(
in3
);
T
*
output
=
reinterpret_cast
<
T
*>
(
data_out
);
extern
MIGRAPHX_DEVICE_SHARED
T
buffer
[];
T
*
in_data_reduce
=
buffer
;
T
*
in_data
=
buffer
+
batch_item_num
;
__half
*
input1
=
reinterpret_cast
<
__half
*>
(
in1
);
__half
*
input2
=
reinterpret_cast
<
__half
*>
(
in2
);
__half
*
input3
=
reinterpret_cast
<
__half
*>
(
in3
);
__half
*
output
=
reinterpret_cast
<
__half
*>
(
data_out
);
float
rnum
=
1.0
f
/
batch_item_num
;
extern
MIGRAPHX_DEVICE_SHARED
__half
bufferh
[];
__half
*
in_data_reduce
=
bufferh
;
__half
*
in_data
=
bufferh
+
batch_item_num
;
int
start
=
blockIdx
.
x
*
batch_item_num
;
auto
rnum
=
1.0
f
/
batch_item_num
;
for
(
int
i
=
threadIdx
.
x
;
i
<
batch_item_num
;
i
+=
block_size
)
{
int
idx
=
i
+
start
;
in_data
[
i
]
=
input1
[
idx
]
+
input2
[
idx
]
+
input3
[
idx
];
in_data_reduce
[
i
]
=
in_data
[
i
];
// in_data_reduce[i] = __half2float(in_data[i]) * rnum;
}
auto
m
=
block_reduce
(
in_data_reduce
,
batch_item_num
,
threadIdx
.
x
,
block_size
);
m
=
m
*
rnum
;
for
(
int
i
=
threadIdx
.
x
;
i
<
batch_item_num
;
i
+=
block_size
)
{
in
_data
[
i
]
=
in_data
[
i
]
-
m
;
in_data
_reduce
[
i
]
=
in_data
[
i
]
*
in_data
[
i
];
// in_data_reduce[i] = __half2float(in_data[i] * in_data[i]) * rnum
;
in
t
idx
=
i
+
start
;
in_data
[
i
]
=
__float2half
(
__half2float
(
input1
[
idx
])
+
__half2float
(
input2
[
idx
])
+
__half2float
(
input3
[
idx
]))
;
}
m
=
block_reduce
(
in_data_reduce
,
batch_item_num
,
threadIdx
.
x
,
block_size
);
m
=
m
*
rnum
+
1.0e-12
f
;
auto
r
=
rsqrt
(
m
);
for
(
int
i
=
threadIdx
.
x
;
i
<
batch_item_num
;
i
+=
block_size
)
{
int
idx
=
i
+
start
;
// output[idx] = __half2float(in_data[i]) * r;
output
[
idx
]
=
in_data
[
i
]
*
r
;
}
layernorm_kernel_half
(
in_data
,
in_data_reduce
,
output
,
batch_item_num
,
block_size
,
rnum
);
}
void
triadd_layernorm
(
hipStream_t
stream
,
...
...
@@ -393,12 +358,17 @@ void triadd_layernorm(hipStream_t stream,
auto
batch_item_num
=
in_s
.
lens
().
back
();
if
(
type
==
shape
::
half_type
and
(
batch_item_num
%
2
)
==
0
)
{
auto
half2_
block_size
=
compute_block_size
(
batch_item_num
,
1024
);
auto
block_size
=
compute_block_size
(
batch_item_num
,
1024
);
int
block_num
=
in_s
.
elements
()
/
batch_item_num
;
int
shared_size
=
batch_item_num
*
2
*
in_s
.
type_size
();
half2_block_size
=
half2_
block_size
/
4
;
triadd_layernorm_
kernel_
half2
<<<
block_num
,
half2_block_size
,
shared_size
,
stream
>>>
(
auto
half2_block_size
=
block_size
/
4
;
triadd_layernorm_half2
<<<
block_num
,
half2_block_size
,
shared_size
,
stream
>>>
(
arg1
.
data
(),
arg2
.
data
(),
arg3
.
data
(),
result
.
data
(),
batch_item_num
,
half2_block_size
);
// auto half_block_size = block_size / 2;
// triadd_layernorm_half2<<<block_num, half_block_size, shared_size, stream>>>(
// arg1.data(), arg2.data(), arg3.data(), result.data(), batch_item_num,
// half_block_size);
}
else
{
...
...
@@ -409,11 +379,11 @@ void triadd_layernorm(hipStream_t stream,
}
__global__
void
layernorm_
kernel_
half2
(
void
*
in1
,
void
*
data_out
,
index_int
batch_item_num
,
index_int
block_size
)
layernorm_half2
(
void
*
in1
,
void
*
data_out
,
index_int
batch_item_num
,
index_int
block_size
)
{
__half2
*
input1
=
reinterpret_cast
<
__half2
*>
(
in1
);
__half2
*
output
=
reinterpret_cast
<
__half2
*>
(
data_out
);
auto
rnum
=
__float2half2_rn
(
1.0
f
/
batch_item_num
)
;
float
rnum
=
1.0
f
/
batch_item_num
;
batch_item_num
/=
2
;
extern
MIGRAPHX_DEVICE_SHARED
__half2
buffer2
[];
__half2
*
in_data_reduce
=
buffer2
;
...
...
@@ -427,28 +397,28 @@ layernorm_kernel_half2(void* in1, void* data_out, index_int batch_item_num, inde
in_data_reduce
[
i
]
=
in_data
[
i
];
}
auto
m
=
block_reduce_half2
(
in_data_reduce
,
batch_item_num
,
threadIdx
.
x
,
block_size
,
half2_sum
{});
m
=
__hmul2
(
m
,
rnum
);
for
(
int
i
=
threadIdx
.
x
;
i
<
batch_item_num
;
i
+=
block_size
)
{
in_data
[
i
]
=
__hsub2
(
in_data
[
i
],
m
);
in_data_reduce
[
i
]
=
__hmul2
(
in_data
[
i
],
in_data
[
i
]);
}
m
=
block_reduce_half2
(
in_data_reduce
,
batch_item_num
,
threadIdx
.
x
,
block_size
,
half2_sum
{});
m
=
__hmul2
(
m
,
rnum
);
layernorm_kernel_half2
(
in_data
,
in_data_reduce
,
output
,
batch_item_num
,
block_size
,
rnum
);
}
auto
eps
=
__float2half2_rn
(
1.0e-12
f
);
auto
r
=
__hadd2
(
m
,
eps
);
r
=
h2rsqrt
(
r
);
__global__
void
layernorm_half
(
void
*
in1
,
void
*
data_out
,
index_int
batch_item_num
,
index_int
block_size
)
{
__half
*
input1
=
reinterpret_cast
<
__half
*>
(
in1
);
__half
*
output
=
reinterpret_cast
<
__half
*>
(
data_out
);
float
rnum
=
1.0
f
/
batch_item_num
;
extern
MIGRAPHX_DEVICE_SHARED
__half
buffer3
[];
__half
*
in_data_reduce
=
buffer3
;
__half
*
in_data
=
buffer3
+
batch_item_num
;
int
start
=
blockIdx
.
x
*
batch_item_num
;
for
(
int
i
=
threadIdx
.
x
;
i
<
batch_item_num
;
i
+=
block_size
)
{
int
idx
=
i
+
start
;
output
[
idx
]
=
__hmul2
(
in_data
[
i
],
r
);
int
idx
=
i
+
start
;
in_data
[
i
]
=
input1
[
idx
];
in_data_reduce
[
i
]
=
in_data
[
i
];
}
layernorm_kernel_half
(
in_data
,
in_data_reduce
,
output
,
batch_item_num
,
block_size
,
rnum
);
}
void
layernorm
(
hipStream_t
stream
,
const
argument
&
result
,
const
argument
&
arg1
)
...
...
@@ -458,12 +428,16 @@ void layernorm(hipStream_t stream, const argument& result, const argument& arg1)
auto
batch_item_num
=
in_s
.
lens
().
back
();
if
(
type
==
shape
::
half_type
and
(
batch_item_num
%
2
)
==
0
)
{
auto
half2_
block_size
=
compute_block_size
(
batch_item_num
,
1024
);
auto
block_size
=
compute_block_size
(
batch_item_num
,
1024
);
int
block_num
=
in_s
.
elements
()
/
batch_item_num
;
int
shared_size
=
batch_item_num
*
2
*
in_s
.
type_size
();
half2_block_size
=
half2_
block_size
/
4
;
layernorm_
kernel_
half2
<<<
block_num
,
half2_block_size
,
shared_size
,
stream
>>>
(
auto
half2_block_size
=
block_size
/
4
;
layernorm_half2
<<<
block_num
,
half2_block_size
,
shared_size
,
stream
>>>
(
arg1
.
data
(),
result
.
data
(),
batch_item_num
,
half2_block_size
);
// auto half_block_size = block_size / 2;
// layernorm_half2<<<block_num, half_block_size, shared_size, stream>>>(
// arg1.data(), result.data(), batch_item_num, half_block_size);
}
else
{
...
...
src/targets/gpu/driver/CMakeLists.txt
View file @
3c95b34d
file
(
GLOB GPU_DRIVER_SRCS
${
CONFIGURE_DEPENDS
}
${
CMAKE_CURRENT_SOURCE_DIR
}
/*.cpp
)
add_executable
(
gpu-driver
action.cpp
compile_pointwise.cpp
main.cpp
parser.cpp
perf.cpp
run_op.cpp
${
GPU_DRIVER_SRCS
}
)
target_include_directories
(
gpu-driver PRIVATE include
)
target_link_libraries
(
gpu-driver PRIVATE migraphx_gpu
)
src/targets/gpu/driver/compile_p
ointwise
.cpp
→
src/targets/gpu/driver/compile_
o
p.cpp
100755 → 100644
View file @
3c95b34d
#include <migraphx/gpu/driver/action.hpp>
#include <migraphx/gpu/driver/perf.hpp>
#include <migraphx/gpu/compile
_pointwise
.hpp>
#include <migraphx/gpu/compile
r
.hpp>
#include <migraphx/gpu/context.hpp>
namespace
migraphx
{
...
...
@@ -8,13 +8,13 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace
gpu
{
namespace
driver
{
struct
compile_p
ointwise
:
action
<
compile_p
ointwise
>
struct
compile_
o
p
:
action
<
compile_
o
p
>
{
static
void
apply
(
const
parser
&
p
,
const
value
&
v
)
{
context
ctx
;
auto
inputs
=
p
.
parse_shapes
(
v
.
at
(
"inputs"
));
auto
op
=
gpu
::
compile_
pointwise
(
ctx
,
inputs
,
v
.
at
(
"
l
am
bda
"
).
to
<
std
::
string
>
());
auto
op
=
gpu
::
compile_
op
(
v
.
at
(
"
n
am
e
"
).
to
<
std
::
string
>
()
,
ctx
,
inputs
,
v
);
double
t
=
time_op
(
ctx
,
op
,
inputs
,
p
.
get
(
v
,
"iterations"
,
100
));
std
::
cout
<<
op
<<
": "
<<
t
<<
"ms"
<<
std
::
endl
;
}
...
...
src/targets/gpu/include/migraphx/gpu/compile_hip.hpp
View file @
3c95b34d
...
...
@@ -17,8 +17,6 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
std
::
string
enum_params
(
std
::
size_t
count
,
std
::
string
param
);
std
::
size_t
compute_global
(
std
::
size_t
n
,
std
::
size_t
local
=
1024
);
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
...
...
src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp
View file @
3c95b34d
...
...
@@ -8,6 +8,8 @@ namespace migraphx {
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
struct
context
;
struct
hip_compile_options
{
std
::
size_t
global
;
...
...
@@ -17,8 +19,24 @@ struct hip_compile_options
std
::
string
kernel_name
=
"kernel"
;
std
::
string
params
=
""
;
std
::
vector
<
shape
>
virtual_inputs
=
{};
/**
* @brief Set the launch parameters but allow v to override the values
*
* @param v A value class which can have a "global" and/or "local" keys to override the default
* global and local
* @param compute_global A function used to compute the global based on the local
* @param default_local The defaul local to use if its missing from the v parameter
*/
void
set_launch_params
(
const
value
&
v
,
const
std
::
function
<
std
::
size_t
(
std
::
size_t
local
)
>&
compute_global
,
std
::
size_t
default_local
=
1024
);
};
/// Compute global for n elements, but max out on target-specific upper limit
std
::
function
<
std
::
size_t
(
std
::
size_t
local
)
>
compute_global_for
(
context
&
ctx
,
std
::
size_t
n
,
std
::
size_t
over
=
1
);
operation
compile_hip_code_object
(
const
std
::
string
&
content
,
hip_compile_options
options
);
}
// namespace gpu
...
...
src/targets/gpu/include/migraphx/gpu/compile_pointwise.hpp
deleted
100644 → 0
View file @
af110526
#ifndef MIGRAPHX_GUARD_GPU_COMPILE_POINTWISE_HPP
#define MIGRAPHX_GUARD_GPU_COMPILE_POINTWISE_HPP
#include <migraphx/config.hpp>
#include <migraphx/operation.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
struct
module
;
namespace
gpu
{
struct
context
;
operation
compile_pointwise
(
context
&
ctx
,
const
std
::
vector
<
shape
>&
inputs
,
const
std
::
string
&
lambda
,
const
std
::
string
&
preamble
=
""
);
operation
compile_pointwise
(
context
&
ctx
,
const
std
::
vector
<
shape
>&
inputs
,
module
m
);
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILE_POINTWISE_HPP
src/targets/gpu/include/migraphx/gpu/compile_roialign.hpp
deleted
100644 → 0
View file @
af110526
#ifndef MIGRAPHX_GUARD_GPU_COMPILE_ROIALIGN_HPP
#define MIGRAPHX_GUARD_GPU_COMPILE_ROIALIGN_HPP
#include <migraphx/config.hpp>
#include <migraphx/operation.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
struct
context
;
operation
compile_roialign
(
context
&
ctx
,
const
std
::
vector
<
shape
>&
io_shapes
,
const
value
&
val
);
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILE_ROIALIGN_HPP
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment