Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
4899c20f
Commit
4899c20f
authored
Sep 19, 2022
by
Chao Liu
Browse files
upadte profiler
parent
5729c23c
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
162 additions
and
232 deletions
+162
-232
library/include/ck/library/tensor_operation_instance/gpu/layernorm.hpp
...de/ck/library/tensor_operation_instance/gpu/layernorm.hpp
+36
-12
library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f16_instance.cpp
...tance/gpu/normalization/device_layernorm_f16_instance.cpp
+8
-9
library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f32_instance.cpp
...tance/gpu/normalization/device_layernorm_f32_instance.cpp
+8
-9
profiler/CMakeLists.txt
profiler/CMakeLists.txt
+0
-1
profiler/include/profile_groupnorm_impl.hpp
profiler/include/profile_groupnorm_impl.hpp
+32
-78
profiler/include/profile_layernorm_impl.hpp
profiler/include/profile_layernorm_impl.hpp
+21
-56
profiler/src/profile_groupnorm.cpp
profiler/src/profile_groupnorm.cpp
+21
-26
profiler/src/profile_layernorm.cpp
profiler/src/profile_layernorm.cpp
+2
-8
profiler/src/profiler.cpp
profiler/src/profiler.cpp
+22
-19
test/layernorm/test_groupnorm_fp16.cpp
test/layernorm/test_groupnorm_fp16.cpp
+6
-7
test/layernorm/test_groupnorm_fp32.cpp
test/layernorm/test_groupnorm_fp32.cpp
+6
-7
No files found.
library/include/ck/library/tensor_operation_instance/gpu/layernorm.hpp
View file @
4899c20f
...
@@ -17,17 +17,25 @@ namespace tensor_operation {
...
@@ -17,17 +17,25 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
void
add_device_layernorm_f16_rank2_instances
(
// FP16
std
::
vector
<
DeviceLayernormPtr
<
F16
,
F16
,
F16
,
F32
,
F16
,
PassThrough
,
2
,
1
>>&
);
void
add_device_layernorm_rank_2_1_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceLayernorm
<
F16
,
F16
,
F16
,
F32
,
F16
,
PassThrough
,
2
,
1
>>>&
);
void
add_device_layernorm_
f16_rank4
_instances
(
void
add_device_layernorm_
rank_4_3_f16
_instances
(
std
::
vector
<
DeviceLayernorm
Ptr
<
F16
,
F16
,
F16
,
F32
,
F16
,
PassThrough
,
4
,
3
>>&
);
std
::
vector
<
std
::
unique_ptr
<
DeviceLayernorm
<
F16
,
F16
,
F16
,
F32
,
F16
,
PassThrough
,
4
,
3
>>
>
&
);
void
add_device_layernorm_
f32_rank2
_instances
(
void
add_device_layernorm_
rank_5_3_f16
_instances
(
std
::
vector
<
DeviceLayernorm
Ptr
<
F32
,
F
32
,
F
32
,
F32
,
F
32
,
PassThrough
,
2
,
1
>>&
);
std
::
vector
<
std
::
unique_ptr
<
DeviceLayernorm
<
F16
,
F
16
,
F
16
,
F32
,
F
16
,
PassThrough
,
5
,
3
>
>>&
);
void
add_device_layernorm_f32_rank4_instances
(
// FP32
std
::
vector
<
DeviceLayernormPtr
<
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
4
,
3
>>&
);
void
add_device_layernorm_rank_2_1_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceLayernorm
<
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
2
,
1
>>>&
);
void
add_device_layernorm_rank_4_3_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceLayernorm
<
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
4
,
3
>>>&
);
void
add_device_layernorm_rank_5_3_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceLayernorm
<
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
5
,
3
>>>&
);
template
<
typename
XDataType
,
template
<
typename
XDataType
,
typename
GammaDataType
,
typename
GammaDataType
,
...
@@ -62,17 +70,33 @@ struct DeviceOperationInstanceFactory<
...
@@ -62,17 +70,33 @@ struct DeviceOperationInstanceFactory<
is_same_v
<
BetaDataType
,
F16
>
&&
is_same_v
<
YDataType
,
F16
>
)
is_same_v
<
BetaDataType
,
F16
>
&&
is_same_v
<
YDataType
,
F16
>
)
{
{
if
constexpr
(
Rank
==
2
&&
NumReduceDim
==
1
)
if
constexpr
(
Rank
==
2
&&
NumReduceDim
==
1
)
add_device_layernorm_f16_rank2_instances
(
op_ptrs
);
{
add_device_layernorm_rank_2_1_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
Rank
==
4
&&
NumReduceDim
==
3
)
else
if
constexpr
(
Rank
==
4
&&
NumReduceDim
==
3
)
add_device_layernorm_f16_rank4_instances
(
op_ptrs
);
{
add_device_layernorm_rank_4_3_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
Rank
==
5
&&
NumReduceDim
==
3
)
{
add_device_layernorm_rank_5_3_f16_instances
(
op_ptrs
);
}
}
}
else
if
constexpr
(
is_same_v
<
XDataType
,
F32
>
&&
is_same_v
<
GammaDataType
,
F32
>
&&
else
if
constexpr
(
is_same_v
<
XDataType
,
F32
>
&&
is_same_v
<
GammaDataType
,
F32
>
&&
is_same_v
<
BetaDataType
,
F32
>
&&
is_same_v
<
YDataType
,
F32
>
)
is_same_v
<
BetaDataType
,
F32
>
&&
is_same_v
<
YDataType
,
F32
>
)
{
{
if
constexpr
(
Rank
==
2
&&
NumReduceDim
==
1
)
if
constexpr
(
Rank
==
2
&&
NumReduceDim
==
1
)
add_device_layernorm_f32_rank2_instances
(
op_ptrs
);
{
add_device_layernorm_rank_2_1_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
Rank
==
4
&&
NumReduceDim
==
3
)
else
if
constexpr
(
Rank
==
4
&&
NumReduceDim
==
3
)
add_device_layernorm_f32_rank4_instances
(
op_ptrs
);
{
add_device_layernorm_rank_4_3_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
Rank
==
5
&&
NumReduceDim
==
3
)
{
add_device_layernorm_rank_5_3_f32_instances
(
op_ptrs
);
}
}
}
return
op_ptrs
;
return
op_ptrs
;
...
...
library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f16_instance.cpp
View file @
4899c20f
...
@@ -15,8 +15,7 @@ namespace instance {
...
@@ -15,8 +15,7 @@ namespace instance {
using
F16
=
ck
::
half_t
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
F32
=
float
;
using
Pass
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Pass
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Sigmoid
=
ck
::
tensor_operation
::
element_wise
::
Sigmoid
;
template
<
typename
OutElementwise
,
index_t
Rank
,
index_t
Reduce
>
template
<
typename
OutElementwise
,
index_t
Rank
,
index_t
Reduce
>
using
device_layernorm_f16_instances
=
std
::
tuple
<
using
device_layernorm_f16_instances
=
std
::
tuple
<
...
@@ -36,22 +35,22 @@ using device_layernorm_f16_instances = std::tuple<
...
@@ -36,22 +35,22 @@ using device_layernorm_f16_instances = std::tuple<
// clang-format on
// clang-format on
>
;
>
;
void
add_device_layernorm_
f16_rank2
_instances
(
void
add_device_layernorm_
rank_2_1_f16
_instances
(
std
::
vector
<
DeviceLayernorm
Ptr
<
F16
,
F16
,
F16
,
F32
,
F16
,
Pass
,
2
,
1
>>&
instances
)
std
::
vector
<
std
::
unique_ptr
<
DeviceLayernorm
<
F16
,
F16
,
F16
,
F32
,
F16
,
Pass
,
2
,
1
>>
>
&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_layernorm_f16_instances
<
Pass
,
2
,
1
>
{});
add_device_operation_instances
(
instances
,
device_layernorm_f16_instances
<
Pass
,
2
,
1
>
{});
}
}
void
add_device_layernorm_
f16_rank4
_instances
(
void
add_device_layernorm_
rank_4_3_f16
_instances
(
std
::
vector
<
DeviceLayernorm
Ptr
<
F16
,
F16
,
F16
,
F32
,
F16
,
Pass
,
4
,
3
>>&
instances
)
std
::
vector
<
std
::
unique_ptr
<
DeviceLayernorm
<
F16
,
F16
,
F16
,
F32
,
F16
,
Pass
,
4
,
3
>>
>
&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_layernorm_f16_instances
<
Pass
,
4
,
3
>
{});
add_device_operation_instances
(
instances
,
device_layernorm_f16_instances
<
Pass
,
4
,
3
>
{});
}
}
void
add_device_
groupnorm
_f16_instances
(
void
add_device_
layernorm_rank_5_3
_f16_instances
(
std
::
vector
<
DeviceLayernorm
Ptr
<
F16
,
F16
,
F16
,
F32
,
F16
,
Sigmoid
,
5
,
3
>>&
instances
)
std
::
vector
<
std
::
unique_ptr
<
DeviceLayernorm
<
F16
,
F16
,
F16
,
F32
,
F16
,
Pass
,
5
,
3
>>
>
&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_layernorm_f16_instances
<
Sigmoid
,
5
,
3
>
{});
add_device_operation_instances
(
instances
,
device_layernorm_f16_instances
<
Pass
,
5
,
3
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f32_instance.cpp
View file @
4899c20f
...
@@ -14,8 +14,7 @@ namespace instance {
...
@@ -14,8 +14,7 @@ namespace instance {
using
F32
=
float
;
using
F32
=
float
;
using
Pass
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Pass
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Sigmoid
=
ck
::
tensor_operation
::
element_wise
::
Sigmoid
;
template
<
typename
OutElementwise
,
index_t
Rank
,
index_t
Reduce
>
template
<
typename
OutElementwise
,
index_t
Rank
,
index_t
Reduce
>
using
device_layernorm_f32_instances
=
std
::
tuple
<
using
device_layernorm_f32_instances
=
std
::
tuple
<
...
@@ -34,22 +33,22 @@ using device_layernorm_f32_instances = std::tuple<
...
@@ -34,22 +33,22 @@ using device_layernorm_f32_instances = std::tuple<
// clang-format on
// clang-format on
>
;
>
;
void
add_device_layernorm_
f32_
rank2_instances
(
void
add_device_layernorm_rank
_2_1_f3
2_instances
(
std
::
vector
<
DeviceLayernorm
Ptr
<
F32
,
F32
,
F32
,
F32
,
F32
,
Pass
,
2
,
1
>>&
instances
)
std
::
vector
<
std
::
unique_ptr
<
DeviceLayernorm
<
F32
,
F32
,
F32
,
F32
,
F32
,
Pass
,
2
,
1
>>
>
&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_layernorm_f32_instances
<
Pass
,
2
,
1
>
{});
add_device_operation_instances
(
instances
,
device_layernorm_f32_instances
<
Pass
,
2
,
1
>
{});
}
}
void
add_device_layernorm_
f32_rank4
_instances
(
void
add_device_layernorm_
rank_4_3_f32
_instances
(
std
::
vector
<
DeviceLayernorm
Ptr
<
F32
,
F32
,
F32
,
F32
,
F32
,
Pass
,
4
,
3
>>&
instances
)
std
::
vector
<
std
::
unique_ptr
<
DeviceLayernorm
<
F32
,
F32
,
F32
,
F32
,
F32
,
Pass
,
4
,
3
>>
>
&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_layernorm_f32_instances
<
Pass
,
4
,
3
>
{});
add_device_operation_instances
(
instances
,
device_layernorm_f32_instances
<
Pass
,
4
,
3
>
{});
}
}
void
add_device_
groupnorm
_f32_instances
(
void
add_device_
layernorm_rank_5_3
_f32_instances
(
std
::
vector
<
DeviceLayernorm
Ptr
<
F32
,
F32
,
F32
,
F32
,
F32
,
Sigmoid
,
5
,
3
>>&
instances
)
std
::
vector
<
std
::
unique_ptr
<
DeviceLayernorm
<
F32
,
F32
,
F32
,
F32
,
F32
,
Pass
,
5
,
3
>>
>
&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_layernorm_f32_instances
<
Sigmoid
,
5
,
3
>
{});
add_device_operation_instances
(
instances
,
device_layernorm_f32_instances
<
Pass
,
5
,
3
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
profiler/CMakeLists.txt
View file @
4899c20f
...
@@ -56,4 +56,3 @@ target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
...
@@ -56,4 +56,3 @@ target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
target_link_libraries
(
ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_normalization_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_normalization_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_reduce_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_reduce_instance
)
profiler/include/profile_groupnorm_impl.hpp
View file @
4899c20f
...
@@ -6,8 +6,8 @@
...
@@ -6,8 +6,8 @@
#include <iomanip>
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "profiler/include/data_type_enum.hpp"
#include "ck/tensor_operation
/gpu/device/device_
layernorm
_impl
.hpp"
#include "ck/
library/
tensor_operation
_instance/gpu/
layernorm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
...
@@ -15,35 +15,9 @@
...
@@ -15,35 +15,9 @@
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
Sigmoid
=
ck
::
tensor_operation
::
element_wise
::
Sigmoid
;
void
add_device_groupnorm_f16_instances
(
std
::
vector
<
DeviceLayernormPtr
<
F16
,
F16
,
F16
,
F32
,
F16
,
Sigmoid
,
5
,
3
>>&
);
void
add_device_groupnorm_f32_instances
(
std
::
vector
<
DeviceLayernormPtr
<
F32
,
F32
,
F32
,
F32
,
F32
,
Sigmoid
,
5
,
3
>>&
);
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
namespace
ck
{
namespace
ck
{
namespace
profiler
{
namespace
profiler
{
enum
struct
ElementwiseOpEnum
{
ePassthrough
=
0
,
eSigmoid
=
1
};
template
<
typename
XDataType
,
template
<
typename
XDataType
,
typename
GammaDataType
,
typename
GammaDataType
,
typename
BetaDataType
,
typename
BetaDataType
,
...
@@ -53,12 +27,9 @@ bool profile_groupnorm_impl(int do_verification,
...
@@ -53,12 +27,9 @@ bool profile_groupnorm_impl(int do_verification,
int
init_method
,
int
init_method
,
bool
do_log
,
bool
do_log
,
bool
time_kernel
,
bool
time_kernel
,
std
::
vector
<
index_t
>
length
,
std
::
vector
<
index_t
>
length
)
ElementwiseOpEnum
OutelementwiseOp
)
{
{
using
F16
=
ck
::
half_t
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
F32
=
float
;
using
Sigmoid
=
ck
::
tensor_operation
::
element_wise
::
Sigmoid
;
if
(
length
.
size
()
!=
5
)
if
(
length
.
size
()
!=
5
)
return
false
;
return
false
;
...
@@ -104,35 +75,21 @@ bool profile_groupnorm_impl(int do_verification,
...
@@ -104,35 +75,21 @@ bool profile_groupnorm_impl(int do_verification,
beta_dev
.
ToDevice
(
beta
.
mData
.
data
());
beta_dev
.
ToDevice
(
beta
.
mData
.
data
());
// add device normalization instances
// add device normalization instances
std
::
vector
<
tensor_operation
::
device
::
DeviceLayernormPtr
<
XDataType
,
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceLayernorm
<
XDataType
,
GammaDataType
,
GammaDataType
,
BetaDataType
,
BetaDataType
,
AccDataType
,
AccDataType
,
YDataType
,
YDataType
,
Sigmoid
,
PassThrough
,
5
,
5
,
3
>>
3
>
;
instances
;
// get device op instances
if
constexpr
(
is_same
<
XDataType
,
F16
>::
value
&&
is_same
<
GammaDataType
,
F16
>::
value
&&
const
auto
instance_ptrs
=
is_same
<
BetaDataType
,
F16
>::
value
&&
is_same
<
YDataType
,
F16
>::
value
&&
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
is_same
<
AccDataType
,
F32
>::
value
)
DeviceOp
>::
GetInstances
();
{
if
(
OutelementwiseOp
==
ElementwiseOpEnum
::
eSigmoid
)
std
::
cout
<<
"found "
<<
instance_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
tensor_operation
::
device
::
instance
::
add_device_groupnorm_f16_instances
(
instances
);
}
else
if
constexpr
(
is_same
<
XDataType
,
F32
>::
value
&&
is_same
<
GammaDataType
,
F32
>::
value
&&
is_same
<
BetaDataType
,
F32
>::
value
&&
is_same
<
YDataType
,
F32
>::
value
&&
is_same
<
AccDataType
,
F32
>::
value
)
{
if
(
OutelementwiseOp
==
ElementwiseOpEnum
::
eSigmoid
)
tensor_operation
::
device
::
instance
::
add_device_groupnorm_f32_instances
(
instances
);
}
if
(
instances
.
size
()
<=
0
)
{
throw
std
::
runtime_error
(
"wrong! no device normalization instance found"
);
}
std
::
string
best_instance_name
;
std
::
string
best_instance_name
;
float
best_avg_time
=
std
::
numeric_limits
<
float
>::
max
();
float
best_avg_time
=
std
::
numeric_limits
<
float
>::
max
();
...
@@ -140,25 +97,22 @@ bool profile_groupnorm_impl(int do_verification,
...
@@ -140,25 +97,22 @@ bool profile_groupnorm_impl(int do_verification,
if
(
do_verification
)
if
(
do_verification
)
{
{
if
(
OutelementwiseOp
==
ElementwiseOpEnum
::
eSigmoid
)
using
ReferenceInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGroupnorm
<
XDataType
,
{
GammaDataType
,
using
ReferenceInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGroupnorm
<
XDataType
,
BetaDataType
,
GammaDataType
,
YDataType
,
BetaDataType
,
AccDataType
,
YDataType
,
PassThrough
>
;
AccDataType
,
Sigmoid
>
;
ReferenceInstance
ref
;
auto
ref_argument
=
ref
.
MakeArgument
(
x
,
gamma
,
beta
,
host_y
,
PassThrough
{},
length
,
1e-6
);
ReferenceInstance
ref
;
auto
ref_invoker
=
ref
.
MakeInvoker
();
auto
ref_argument
=
ref
.
MakeArgument
(
x
,
gamma
,
beta
,
host_y
,
Sigmoid
{},
length
,
1e-6
);
ref_invoker
.
Run
(
ref_argument
);
auto
ref_invoker
=
ref
.
MakeInvoker
();
ref_invoker
.
Run
(
ref_argument
);
}
}
}
int
num_kernel
=
0
;
int
num_kernel
=
0
;
for
(
auto
&
inst_ptr
:
instances
)
for
(
auto
&
inst_ptr
:
instance
_ptr
s
)
{
{
auto
argument_ptr
=
inst_ptr
->
MakeArgumentPointer
(
auto
argument_ptr
=
inst_ptr
->
MakeArgumentPointer
(
length
,
length
,
...
@@ -172,7 +126,7 @@ bool profile_groupnorm_impl(int do_verification,
...
@@ -172,7 +126,7 @@ bool profile_groupnorm_impl(int do_verification,
gamma_dev
.
GetDeviceBuffer
(),
gamma_dev
.
GetDeviceBuffer
(),
beta_dev
.
GetDeviceBuffer
(),
beta_dev
.
GetDeviceBuffer
(),
y_dev
.
GetDeviceBuffer
(),
y_dev
.
GetDeviceBuffer
(),
Sigmoid
{});
PassThrough
{});
if
(
inst_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
if
(
inst_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
{
...
...
profiler/include/profile_layernorm_impl.hpp
View file @
4899c20f
...
@@ -6,8 +6,8 @@
...
@@ -6,8 +6,8 @@
#include <iomanip>
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "profiler/include/data_type_enum.hpp"
#include "ck/tensor_operation
/gpu/device/device_
layernorm
_impl
.hpp"
#include "ck/
library/
tensor_operation
_instance/gpu/
layernorm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
...
@@ -15,26 +15,6 @@
...
@@ -15,26 +15,6 @@
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
void
add_device_layernorm_f16_rank2_instances
(
std
::
vector
<
DeviceLayernormPtr
<
F16
,
F16
,
F16
,
F32
,
F16
,
PassThrough
,
2
,
1
>>&
);
void
add_device_layernorm_f32_rank2_instances
(
std
::
vector
<
DeviceLayernormPtr
<
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
2
,
1
>>&
);
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
namespace
ck
{
namespace
ck
{
namespace
profiler
{
namespace
profiler
{
...
@@ -53,8 +33,6 @@ void profile_layernorm_impl(int do_verification,
...
@@ -53,8 +33,6 @@ void profile_layernorm_impl(int do_verification,
std
::
vector
<
index_t
>
strideGamma
,
std
::
vector
<
index_t
>
strideGamma
,
std
::
vector
<
index_t
>
strideBeta
)
std
::
vector
<
index_t
>
strideBeta
)
{
{
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
if
(
length
.
size
()
<
2
)
if
(
length
.
size
()
<
2
)
...
@@ -103,37 +81,24 @@ void profile_layernorm_impl(int do_verification,
...
@@ -103,37 +81,24 @@ void profile_layernorm_impl(int do_verification,
gamma_dev
.
ToDevice
(
gamma
.
mData
.
data
());
gamma_dev
.
ToDevice
(
gamma
.
mData
.
data
());
beta_dev
.
ToDevice
(
beta
.
mData
.
data
());
beta_dev
.
ToDevice
(
beta
.
mData
.
data
());
// add device normalization instances
constexpr
int
NumReduceDim
=
Rank
-
1
;
constexpr
int
NumReduceDim
=
Rank
-
1
;
std
::
vector
<
tensor_operation
::
device
::
DeviceLayernormPtr
<
XDataType
,
GammaDataType
,
BetaDataType
,
AccDataType
,
YDataType
,
PassThrough
,
Rank
,
NumReduceDim
>>
instances
;
if
constexpr
(
is_same
<
XDataType
,
F16
>::
value
&&
is_same
<
GammaDataType
,
F16
>::
value
&&
is_same
<
BetaDataType
,
F16
>::
value
&&
is_same
<
YDataType
,
F16
>::
value
&&
is_same
<
AccDataType
,
F32
>::
value
)
{
if
(
length
.
size
()
==
2
)
tensor_operation
::
device
::
instance
::
add_device_layernorm_f16_rank2_instances
(
instances
);
}
else
if
constexpr
(
is_same
<
XDataType
,
F32
>::
value
&&
is_same
<
GammaDataType
,
F32
>::
value
&&
is_same
<
BetaDataType
,
F32
>::
value
&&
is_same
<
YDataType
,
F32
>::
value
&&
is_same
<
AccDataType
,
F32
>::
value
)
{
if
(
length
.
size
()
==
2
)
tensor_operation
::
device
::
instance
::
add_device_layernorm_f32_rank2_instances
(
instances
);
}
if
(
instances
.
size
()
<=
0
)
// add device normalization instances
{
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceLayernorm
<
XDataType
,
throw
std
::
runtime_error
(
"wrong! no device normalization instance found"
);
GammaDataType
,
}
BetaDataType
,
AccDataType
,
YDataType
,
PassThrough
,
Rank
,
NumReduceDim
>
;
// get device op instances
const
auto
instance_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
instance_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
std
::
string
best_instance_name
;
std
::
string
best_instance_name
;
float
best_avg_time
=
std
::
numeric_limits
<
float
>::
max
();
float
best_avg_time
=
std
::
numeric_limits
<
float
>::
max
();
...
@@ -157,7 +122,7 @@ void profile_layernorm_impl(int do_verification,
...
@@ -157,7 +122,7 @@ void profile_layernorm_impl(int do_verification,
ref_invoker
.
Run
(
ref_argument
);
ref_invoker
.
Run
(
ref_argument
);
}
}
for
(
auto
&
inst_ptr
:
instances
)
for
(
auto
&
inst_ptr
:
instance
_ptr
s
)
{
{
auto
argument_ptr
=
inst_ptr
->
MakeArgumentPointer
(
length
,
auto
argument_ptr
=
inst_ptr
->
MakeArgumentPointer
(
length
,
strideXY
,
strideXY
,
...
@@ -175,9 +140,9 @@ void profile_layernorm_impl(int do_verification,
...
@@ -175,9 +140,9 @@ void profile_layernorm_impl(int do_verification,
if
(
!
inst_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
if
(
!
inst_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
{
std
::
cout
<<
inst_ptr
->
GetTypeString
()
<<
" skipped due to unsupported argument: "
;
std
::
cout
<<
inst_ptr
->
GetTypeString
()
<<
" skipped due to unsupported argument: "
;
LogRange
(
std
::
cout
<<
"input lengths =
[
"
,
length
,
"
]
, "
)
<<
std
::
endl
;
LogRange
(
std
::
cout
<<
"input lengths = "
,
length
,
", "
)
<<
std
::
endl
;
return
;
continue
;
}
}
auto
invoker_ptr
=
inst_ptr
->
MakeInvokerPointer
();
auto
invoker_ptr
=
inst_ptr
->
MakeInvokerPointer
();
...
...
profiler/src/profile_groupnorm.cpp
View file @
4899c20f
...
@@ -5,10 +5,10 @@
...
@@ -5,10 +5,10 @@
#include <vector>
#include <vector>
#include <unordered_map>
#include <unordered_map>
#include "profiler/include/data_type_enum.hpp"
#include "profiler/include/profile_groupnorm_impl.hpp"
#include "profiler/include/profile_groupnorm_impl.hpp"
using
ck
::
index_t
;
using
ck
::
index_t
;
using
ck
::
profiler
::
ElementwiseOpEnum
;
struct
GroupnormArgParser
struct
GroupnormArgParser
{
{
...
@@ -50,23 +50,21 @@ void print_help_groupnorm()
...
@@ -50,23 +50,21 @@ void print_help_groupnorm()
<<
"arg3: verification (0: no; 1: yes)
\n
"
<<
"arg3: verification (0: no; 1: yes)
\n
"
<<
"arg4: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
<<
"arg4: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
<<
"arg5: print tensor value (0: no; 1: yes)
\n
"
<<
"arg5: print tensor value (0: no; 1: yes)
\n
"
<<
"arg6: time kernel (0=n0, 1=yes)
\n
"
<<
"arg6: time kernel (0=no, 1=yes)
\n
"
<<
"arg7: out elementwise op (0=passthrough, 1=sigmoid)
\n
"
<<
"--length: tensor extents (e.g, --length 1 16 16 32 40)
\n
"
<<
"--length: tensor extents (e.g, --length 1 16 16 32 40)
\n
"
<<
std
::
endl
;
<<
std
::
endl
;
}
}
int
profile_groupnorm
(
int
argc
,
char
*
argv
[])
int
profile_groupnorm
(
int
argc
,
char
*
argv
[])
{
{
ck
::
DataTypeEnum
data_type
=
ck
::
DataTypeEnum
::
Half
;
ck
::
DataTypeEnum
data_type
=
ck
::
DataTypeEnum
::
Half
;
bool
do_verification
=
false
;
bool
do_verification
=
false
;
int
init_method
=
0
;
int
init_method
=
0
;
bool
do_log
=
0
;
bool
do_log
=
0
;
bool
time_kernel
=
1
;
bool
time_kernel
=
1
;
ElementwiseOpEnum
outElementwiseOp
=
ElementwiseOpEnum
::
eSigmoid
;
std
::
vector
<
index_t
>
length
=
{
64
,
16
,
16
,
32
,
40
};
std
::
vector
<
index_t
>
length
=
{
1
,
16
,
16
,
32
,
40
};
if
(
argc
!=
1
&&
argc
!=
1
4
)
if
(
argc
!=
1
&&
argc
!=
1
3
)
{
{
print_help_groupnorm
();
print_help_groupnorm
();
return
0
;
return
0
;
...
@@ -74,12 +72,11 @@ int profile_groupnorm(int argc, char* argv[])
...
@@ -74,12 +72,11 @@ int profile_groupnorm(int argc, char* argv[])
if
(
argc
==
14
)
if
(
argc
==
14
)
{
{
data_type
=
static_cast
<
ck
::
DataTypeEnum
>
(
std
::
stoi
(
argv
[
2
]));
data_type
=
static_cast
<
ck
::
DataTypeEnum
>
(
std
::
stoi
(
argv
[
2
]));
do_verification
=
std
::
stoi
(
argv
[
3
]);
do_verification
=
std
::
stoi
(
argv
[
3
]);
init_method
=
std
::
stoi
(
argv
[
4
]);
init_method
=
std
::
stoi
(
argv
[
4
]);
do_log
=
std
::
stoi
(
argv
[
5
]);
do_log
=
std
::
stoi
(
argv
[
5
]);
time_kernel
=
std
::
stoi
(
argv
[
6
]);
time_kernel
=
std
::
stoi
(
argv
[
6
]);
outElementwiseOp
=
static_cast
<
ElementwiseOpEnum
>
(
std
::
stoi
(
argv
[
7
]));
// parse the long options
// parse the long options
GroupnormArgParser
arg_parser
;
GroupnormArgParser
arg_parser
;
...
@@ -90,10 +87,15 @@ int profile_groupnorm(int argc, char* argv[])
...
@@ -90,10 +87,15 @@ int profile_groupnorm(int argc, char* argv[])
using
F16
=
ck
::
half_t
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
F32
=
float
;
if
(
data_type
==
ck
::
DataTypeEnum
::
Half
&&
outElementwiseOp
==
ElementwiseOpEnum
::
eSigmoid
)
if
(
data_type
==
ck
::
DataTypeEnum
::
Float
)
{
ck
::
profiler
::
profile_groupnorm_impl
<
F32
,
F32
,
F32
,
F32
,
F32
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
);
}
else
if
(
data_type
==
ck
::
DataTypeEnum
::
Half
)
{
{
ck
::
profiler
::
profile_groupnorm_impl
<
F16
,
F16
,
F16
,
F32
,
F16
>
(
ck
::
profiler
::
profile_groupnorm_impl
<
F16
,
F16
,
F16
,
F32
,
F16
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
,
outElementwiseOp
);
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
);
}
}
else
else
{
{
...
@@ -102,10 +104,3 @@ int profile_groupnorm(int argc, char* argv[])
...
@@ -102,10 +104,3 @@ int profile_groupnorm(int argc, char* argv[])
return
0
;
return
0
;
}
}
// hijack main() for quick debugging
// int main(int argc, char* argv[])
// {
// profile_groupnorm(argc, argv);
// return 0;
// }
profiler/src/profile_layernorm.cpp
View file @
4899c20f
...
@@ -5,6 +5,7 @@
...
@@ -5,6 +5,7 @@
#include <vector>
#include <vector>
#include <unordered_map>
#include <unordered_map>
#include "profiler/include/data_type_enum.hpp"
#include "profiler/include/profile_layernorm_impl.hpp"
#include "profiler/include/profile_layernorm_impl.hpp"
using
ck
::
index_t
;
using
ck
::
index_t
;
...
@@ -49,7 +50,7 @@ void print_help_layernorm()
...
@@ -49,7 +50,7 @@ void print_help_layernorm()
<<
"arg2: verification (0: no; 1: yes)
\n
"
<<
"arg2: verification (0: no; 1: yes)
\n
"
<<
"arg3: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
<<
"arg3: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
<<
"arg4: print tensor value (0: no; 1: yes)
\n
"
<<
"arg4: print tensor value (0: no; 1: yes)
\n
"
<<
"arg5: time kernel (0=n
0
, 1=yes)
\n
"
<<
"arg5: time kernel (0=n
o
, 1=yes)
\n
"
<<
"--length: tensor extents (e.g, --length 1024 1024)
\n
"
<<
"--length: tensor extents (e.g, --length 1024 1024)
\n
"
<<
"--strideXY: tensor strides (e.g, --strideXY 1024 1)
\n
"
<<
"--strideXY: tensor strides (e.g, --strideXY 1024 1)
\n
"
<<
"--strideGamma: tensor strides (e.g, --strideGamma 1)
\n
"
<<
"--strideGamma: tensor strides (e.g, --strideGamma 1)
\n
"
...
@@ -114,10 +115,3 @@ int profile_layernorm(int argc, char* argv[])
...
@@ -114,10 +115,3 @@ int profile_layernorm(int argc, char* argv[])
return
0
;
return
0
;
}
}
// hijack main() for quick debugging
// int main(int argc, char* argv[])
// {
// profile_layernorm(argc, argv);
// return 0;
// }
profiler/src/profiler.cpp
View file @
4899c20f
...
@@ -3,26 +3,27 @@
...
@@ -3,26 +3,27 @@
#include <cstring>
#include <cstring>
int
profile_gemm
(
int
,
char
*
[]);
//
int profile_gemm(int, char*[]);
int
profile_gemm_splitk
(
int
,
char
*
[]);
//
int profile_gemm_splitk(int, char*[]);
int
profile_gemm_bilinear
(
int
,
char
*
[]);
//
int profile_gemm_bilinear(int, char*[]);
int
profile_gemm_add_add_fastgelu
(
int
,
char
*
[]);
//
int profile_gemm_add_add_fastgelu(int, char*[]);
int
profile_gemm_reduce
(
int
,
char
*
[]);
//
int profile_gemm_reduce(int, char*[]);
int
profile_gemm_bias_add_reduce
(
int
,
char
*
[]);
//
int profile_gemm_bias_add_reduce(int, char*[]);
int
profile_batched_gemm
(
int
,
char
*
[]);
//
int profile_batched_gemm(int, char*[]);
int
profile_batched_gemm_gemm
(
int
,
char
*
[]);
//
int profile_batched_gemm_gemm(int, char*[]);
int
profile_batched_gemm_add_relu_gemm_add
(
int
,
char
*
[]);
//
int profile_batched_gemm_add_relu_gemm_add(int, char*[]);
int
profile_batched_gemm_reduce
(
int
,
char
*
[]);
//
int profile_batched_gemm_reduce(int, char*[]);
int
profile_grouped_gemm
(
int
,
char
*
[]);
//
int profile_grouped_gemm(int, char*[]);
int
profile_conv_fwd
(
int
,
char
*
[]);
//
int profile_conv_fwd(int, char*[]);
int
profile_conv_fwd_bias_relu
(
int
,
char
*
[]);
//
int profile_conv_fwd_bias_relu(int, char*[]);
int
profile_conv_fwd_bias_relu_add
(
int
,
char
*
[]);
//
int profile_conv_fwd_bias_relu_add(int, char*[]);
int
profile_conv_bwd_data
(
int
,
char
*
[]);
//
int profile_conv_bwd_data(int, char*[]);
int
profile_conv_bwd_weight
(
int
,
char
*
[]);
//
int profile_conv_bwd_weight(int, char*[]);
int
profile_grouped_conv_fwd
(
int
,
char
*
[]);
//
int profile_grouped_conv_fwd(int, char*[]);
int
profile_normalization
(
int
,
char
*
[]);
//
int profile_normalization(int, char*[]);
int
profile_layernorm
(
int
,
char
*
[]);
int
profile_layernorm
(
int
,
char
*
[]);
int
profile_reduce
(
int
,
char
*
[]);
int
profile_groupnorm
(
int
,
char
*
[]);
// int profile_reduce(int, char*[]);
static
void
print_helper_message
()
static
void
print_helper_message
()
{
{
...
@@ -56,6 +57,7 @@ int main(int argc, char* argv[])
...
@@ -56,6 +57,7 @@ int main(int argc, char* argv[])
return
0
;
return
0
;
}
}
#if 0
else if(strcmp(argv[1], "gemm") == 0)
else if(strcmp(argv[1], "gemm") == 0)
{
{
return profile_gemm(argc, argv);
return profile_gemm(argc, argv);
...
@@ -132,6 +134,7 @@ int main(int argc, char* argv[])
...
@@ -132,6 +134,7 @@ int main(int argc, char* argv[])
{
{
return profile_normalization(argc, argv);
return profile_normalization(argc, argv);
}
}
#endif
else
if
(
strcmp
(
argv
[
1
],
"layernorm"
)
==
0
)
else
if
(
strcmp
(
argv
[
1
],
"layernorm"
)
==
0
)
{
{
return
profile_layernorm
(
argc
,
argv
);
return
profile_layernorm
(
argc
,
argv
);
...
...
test/layernorm/test_groupnorm_fp16.cpp
View file @
4899c20f
...
@@ -7,7 +7,6 @@
...
@@ -7,7 +7,6 @@
using
F16
=
ck
::
half_t
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
F32
=
float
;
using
ck
::
index_t
;
using
ck
::
index_t
;
using
ck
::
profiler
::
ElementwiseOpEnum
;
template
<
typename
Tuple
>
template
<
typename
Tuple
>
class
TestGroupnorm
:
public
::
testing
::
Test
class
TestGroupnorm
:
public
::
testing
::
Test
...
@@ -31,12 +30,12 @@ class TestGroupnorm : public ::testing::Test
...
@@ -31,12 +30,12 @@ class TestGroupnorm : public ::testing::Test
for
(
auto
length
:
lengths
)
for
(
auto
length
:
lengths
)
{
{
bool
success
=
ck
::
profiler
::
profile_groupnorm_impl
<
XDataType
,
bool
success
=
Gamma
DataType
,
ck
::
profiler
::
profile_groupnorm_impl
<
X
DataType
,
Bet
aDataType
,
Gamm
aDataType
,
Acc
DataType
,
Beta
DataType
,
Y
DataType
>
(
Acc
DataType
,
true
,
2
,
false
,
false
,
length
,
ElementwiseOpEnum
::
eSigmoid
);
YDataType
>
(
true
,
2
,
false
,
false
,
length
);
EXPECT_TRUE
(
success
);
EXPECT_TRUE
(
success
);
}
}
}
}
...
...
test/layernorm/test_groupnorm_fp32.cpp
View file @
4899c20f
...
@@ -7,7 +7,6 @@
...
@@ -7,7 +7,6 @@
using
F16
=
ck
::
half_t
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
F32
=
float
;
using
ck
::
index_t
;
using
ck
::
index_t
;
using
ck
::
profiler
::
ElementwiseOpEnum
;
template
<
typename
Tuple
>
template
<
typename
Tuple
>
class
TestGroupnorm
:
public
::
testing
::
Test
class
TestGroupnorm
:
public
::
testing
::
Test
...
@@ -31,12 +30,12 @@ class TestGroupnorm : public ::testing::Test
...
@@ -31,12 +30,12 @@ class TestGroupnorm : public ::testing::Test
for
(
auto
length
:
lengths
)
for
(
auto
length
:
lengths
)
{
{
bool
success
=
ck
::
profiler
::
profile_groupnorm_impl
<
XDataType
,
bool
success
=
Gamma
DataType
,
ck
::
profiler
::
profile_groupnorm_impl
<
X
DataType
,
Bet
aDataType
,
Gamm
aDataType
,
Acc
DataType
,
Beta
DataType
,
Y
DataType
>
(
Acc
DataType
,
true
,
2
,
false
,
false
,
length
,
ElementwiseOpEnum
::
eSigmoid
);
YDataType
>
(
true
,
2
,
false
,
false
,
length
);
EXPECT_TRUE
(
success
);
EXPECT_TRUE
(
success
);
}
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment