Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
c5138aa1
"Src/YOLOV9.cpp" did not exist on "a63911bb575bea84b31c8cc7fc8dcc750e80e470"
Commit
c5138aa1
authored
Oct 19, 2023
by
Artur Wojcik
Browse files
Merge branch 'develop' into uif2-initial
parents
7830272f
82f3a835
Changes
104
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
444 additions
and
306 deletions
+444
-306
library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
..._operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
+9
-10
library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/CMakeLists.txt
...eration_instance/gpu/grouped_gemm_fixed_nk/CMakeLists.txt
+6
-14
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp
...tance/gpu/normalization/device_groupnorm_f16_instance.cpp
+1
-1
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
...ation/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
+1
-1
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp
...gpu/normalization/device_groupnorm_swish_f16_instance.cpp
+1
-1
library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp
...nce/gpu/normalization/device_layernorm2d_f16_instance.cpp
+1
-1
library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp
...nce/gpu/normalization/device_layernorm4d_f16_instance.cpp
+1
-1
library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp
...tance/gpu/normalization/normalization_instance_common.hpp
+121
-121
profiler/README.md
profiler/README.md
+4
-4
profiler/include/profiler/profile_elementwise_layernorm_impl.hpp
...r/include/profiler/profile_elementwise_layernorm_impl.hpp
+14
-3
profiler/include/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp
...ude/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp
+4
-1
profiler/include/profiler/profile_groupnorm_impl.hpp
profiler/include/profiler/profile_groupnorm_impl.hpp
+89
-27
profiler/include/profiler/profile_layernorm_impl.hpp
profiler/include/profiler/profile_layernorm_impl.hpp
+89
-27
profiler/src/CMakeLists.txt
profiler/src/CMakeLists.txt
+14
-4
profiler/src/profile_grouped_conv_bwd_weight.cpp
profiler/src/profile_grouped_conv_bwd_weight.cpp
+17
-21
profiler/src/profile_groupnorm.cpp
profiler/src/profile_groupnorm.cpp
+2
-2
profiler/src/profile_layernorm.cpp
profiler/src/profile_layernorm.cpp
+2
-2
test/CMakeLists.txt
test/CMakeLists.txt
+58
-58
test/contraction/CMakeLists.txt
test/contraction/CMakeLists.txt
+9
-7
test/conv_tensor_rearrange/CMakeLists.txt
test/conv_tensor_rearrange/CMakeLists.txt
+1
-0
No files found.
library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
View file @
c5138aa1
add_instance_library
(
device_grouped_conv3d_fwd_instance
set
(
GROUPED_CONV3D_FWD
xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp
xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_fp8_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1p0_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1p0_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1p0_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1p0_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp
)
wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp
)
if
((
DTYPES MATCHES
"fp8"
AND DTYPES MATCHES
"fp16"
)
OR NOT DEFINED DTYPES
)
list
(
APPEND GROUPED_CONV3D_FWD
xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_fp8_instance.cpp
)
endif
()
add_instance_library
(
device_grouped_conv3d_fwd_instance
${
GROUPED_CONV3D_FWD
}
)
library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/CMakeLists.txt
View file @
c5138aa1
set
(
GROUPED_GEMM_FIXED_NK_INSTANCES
)
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
list
(
APPEND GROUPED_GEMM_FIXED_NK_INSTANCES device_grouped_gemm_xdl_fixed_nk_f16_f16_f16_mk_kn_mn_instance.cpp
)
list
(
APPEND GROUPED_GEMM_FIXED_NK_INSTANCES device_grouped_gemm_xdl_fixed_nk_f16_f16_f16_mk_nk_mn_instance.cpp
)
endif
()
if
((
DTYPES MATCHES
"fp8"
AND DTYPES MATCHES
"fp16"
)
OR NOT DEFINED DTYPES
)
list
(
APPEND GROUPED_GEMM_FIXED_NK_INSTANCES device_grouped_gemm_xdl_fixed_nk_f16_fp8_f16_mk_kn_mn_instance.cpp
)
list
(
APPEND GROUPED_GEMM_FIXED_NK_INSTANCES device_grouped_gemm_xdl_fixed_nk_f16_fp8_f16_mk_nk_mn_instance.cpp
)
endif
()
if
((
DTYPES MATCHES
"int8"
AND DTYPES MATCHES
"fp16"
)
OR NOT DEFINED DTYPES
)
list
(
APPEND GROUPED_GEMM_FIXED_NK_INSTANCES device_grouped_gemm_xdl_fixed_nk_f16_i8_f16_mk_kn_mn_instance.cpp
)
list
(
APPEND GROUPED_GEMM_FIXED_NK_INSTANCES device_grouped_gemm_xdl_fixed_nk_f16_i8_f16_mk_nk_mn_instance.cpp
)
endif
()
list
(
APPEND GROUPED_GEMM_FIXED_NK_INSTANCES device_grouped_gemm_xdl_fixed_nk_f16_f16_f16_mk_kn_mn_instance.cpp
device_grouped_gemm_xdl_fixed_nk_f16_f16_f16_mk_nk_mn_instance.cpp
device_grouped_gemm_xdl_fixed_nk_f16_fp8_f16_mk_kn_mn_instance.cpp
device_grouped_gemm_xdl_fixed_nk_f16_fp8_f16_mk_nk_mn_instance.cpp
device_grouped_gemm_xdl_fixed_nk_f16_i8_f16_mk_kn_mn_instance.cpp
device_grouped_gemm_xdl_fixed_nk_f16_i8_f16_mk_nk_mn_instance.cpp
)
add_instance_library
(
device_grouped_gemm_fixed_nk_instance
${
GROUPED_GEMM_FIXED_NK_INSTANCES
}
)
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp
View file @
c5138aa1
...
...
@@ -11,7 +11,7 @@ namespace instance {
using
Pass
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
void
add_device_normalization_rank_5_3_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F16
,
F16
,
F
32
,
F
16
,
Pass
,
5
,
3
>>>&
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F16
,
F16
,
F
16
,
F
32
,
Pass
,
5
,
3
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
...
...
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
View file @
c5138aa1
...
...
@@ -11,7 +11,7 @@ namespace instance {
using
Swish
=
ck
::
tensor_operation
::
element_wise
::
Swish
;
void
add_device_normalization_rank_5_3_swish_f16_f32_f32_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F32
,
F32
,
F
32
,
F
16
,
Swish
,
5
,
3
>>>&
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F32
,
F32
,
F
16
,
F
32
,
Swish
,
5
,
3
>>>&
instances
)
{
add_device_operation_instances
(
...
...
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp
View file @
c5138aa1
...
...
@@ -11,7 +11,7 @@ namespace instance {
using
Swish
=
ck
::
tensor_operation
::
element_wise
::
Swish
;
void
add_device_normalization_rank_5_3_swish_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F16
,
F16
,
F
32
,
F
16
,
Swish
,
5
,
3
>>>&
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F16
,
F16
,
F
16
,
F
32
,
Swish
,
5
,
3
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
...
...
library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp
View file @
c5138aa1
...
...
@@ -11,7 +11,7 @@ namespace instance {
using
Pass
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
void
add_device_normalization_rank_2_1_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F16
,
F16
,
F
32
,
F
16
,
Pass
,
2
,
1
>>>&
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F16
,
F16
,
F
16
,
F
32
,
Pass
,
2
,
1
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
...
...
library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp
View file @
c5138aa1
...
...
@@ -11,7 +11,7 @@ namespace instance {
using
Pass
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
void
add_device_normalization_rank_4_3_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F16
,
F16
,
F
32
,
F
16
,
Pass
,
4
,
3
>>>&
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F16
,
F16
,
F
16
,
F
32
,
Pass
,
4
,
3
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
...
...
library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp
View file @
c5138aa1
...
...
@@ -22,25 +22,25 @@ template <typename OutElementwise, index_t Rank, index_t Reduce>
using
device_normalization_f16_instances
=
// clang-format off
std
::
tuple
<
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize>
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
2
,
1
,
2
,
1
,
2
,
1
,
2
,
2
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
64
,
1
,
64
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
32
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
2
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType,
SaveMeanInvStdDataType,
Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize
, SaveMeanInvStdScalarPerVector
>
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
2
,
1
,
2
,
1
,
2
,
1
,
2
,
2
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
64
,
1
,
64
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
32
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
2
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
2
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
// clang-format on
>
;
...
...
@@ -48,150 +48,150 @@ template <typename OutElementwise, index_t Rank, index_t Reduce>
using
device_normalization_splitk_f16_instances
=
// clang-format off
std
::
tuple
<
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize>
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
2
,
1
,
2
,
1
,
2
,
1
,
2
,
2
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
64
,
1
,
64
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
32
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
2
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType,
SaveMeanInvStdDataType,
Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize
, SaveMeanInvStdScalarPerVector
>
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
2
,
1
,
2
,
1
,
2
,
1
,
2
,
2
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
64
,
1
,
64
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
32
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
2
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
2
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
// clang-format on
>
;
template
<
typename
OutElementwise
,
index_t
Rank
,
index_t
Reduce
>
using
device_normalization_f16_generic_instance
=
std
::
tuple
<
// clang-format off
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
64
,
1
,
64
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
64
,
1
,
64
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
// clang-format on
>
;
template
<
typename
OutElementwise
,
index_t
Rank
,
index_t
Reduce
>
using
device_normalization_f32_instances
=
std
::
tuple
<
// clang-format off
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
2
,
1
,
2
,
1
,
2
,
1
,
2
,
2
>
,
// irregular size
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
2
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
2
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType,
SaveMeanInvStdDataType,
Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize,
GammaSrcVectorDim,
GammaSrcVectorSize,
BetaSrcVectorDim,
BetaSrcVectorSize, YDstVectorSize
, SaveMeanInvStdScalarPerVector
>
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
2
,
1
,
2
,
1
,
2
,
1
,
2
,
2
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
2
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
2
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
2
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
2
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
// clang-format on
>
;
template
<
typename
OutElementwise
,
index_t
Rank
,
index_t
Reduce
>
using
device_normalization_splitk_f32_instances
=
std
::
tuple
<
// clang-format off
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
2
,
1
,
2
,
1
,
2
,
1
,
2
,
2
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
2
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
2
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType,
SaveMeanInvStdDataType,
Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize,
GammaSrcVectorDim,
GammaSrcVectorSize,
BetaSrcVectorDim,
BetaSrcVectorSize, YDstVectorSize
, SaveMeanInvStdScalarPerVector
>
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
2
,
1
,
2
,
1
,
2
,
1
,
2
,
2
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
2
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
2
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
2
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
2
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
// clang-format on
>
;
template
<
typename
OutElementwise
,
index_t
Rank
,
index_t
Reduce
>
using
device_normalization_f32_generic_instance
=
std
::
tuple
<
// clang-format off
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
64
,
1
,
64
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
64
,
1
,
64
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
// clang-format on
>
;
template
<
typename
OutElementwise
,
index_t
Rank
,
index_t
Reduce
>
using
device_normalization_f16_f32_f32_f16_instances
=
std
::
tuple
<
// clang-format off
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
2
,
1
,
2
,
1
,
2
,
1
,
2
,
2
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
2
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
2
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType,
SaveMeanInvStdDataType,
Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize,
GammaSrcVectorDim,
GammaSrcVectorSize,
BetaSrcVectorDim,
BetaSrcVectorSize, YDstVectorSize
, SaveMeanInvStdScalarPerVector
>
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
2
,
1
,
2
,
1
,
2
,
1
,
2
,
2
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
2
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
2
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
2
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
2
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
// clang-format on
>
;
template
<
typename
OutElementwise
,
index_t
Rank
,
index_t
Reduce
>
using
device_normalization_splitk_f16_f32_f32_f16_instances
=
std
::
tuple
<
// clang-format off
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
2
,
1
,
2
,
1
,
2
,
1
,
2
,
2
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
2
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
2
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType,
SaveMeanInvStdDataType,
Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize,
GammaSrcVectorDim,
GammaSrcVectorSize,
BetaSrcVectorDim,
BetaSrcVectorSize, YDstVectorSize
, SaveMeanInvStdScalarPerVector
>
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
2
,
1
,
2
,
1
,
2
,
1
,
2
,
2
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
2
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
2
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
2
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
2
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
// clang-format on
>
;
template
<
typename
OutElementwise
,
index_t
Rank
,
index_t
Reduce
>
using
device_normalization_f16_f32_f32_f16_generic_instance
=
std
::
tuple
<
// clang-format off
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
64
,
1
,
64
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
64
,
1
,
64
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
// clang-format on
>
;
...
...
profiler/README.md
View file @
c5138aa1
...
...
@@ -22,7 +22,7 @@ c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
Best Perf: 1.1933 ms, 107.977 TFlops, 79.0848 GB/s
```
## Profile 2
d
forward convolution kernels
## Profile 2
D
forward convolution kernels
```
bash
#arg1: tensor operation (conv=Convolution)
#arg2: data type (0=fp32, 1=fp16)
...
...
@@ -115,7 +115,7 @@ Best Perf: 58.0306 ms, 37.8942 TFlops, 27.7545 GB/s
# arg6: print tensor value (0: no; 1: yes)
# arg7: time kernel (0: no, 1: yes)
# Following arguments (depending on number of spatial dims):
# Number of spatial dimensions (1=Conv1
d
, 2=Conv2
d
, 3=Conv3
d
)
# Number of spatial dimensions (1=Conv1
D
, 2=Conv2
D
, 3=Conv3
D
)
# G, N, K, C,
# <filter spatial dimensions>, (ie Y, X for 2D)
# <input image spatial dimensions>, (ie Hi, Wi for 2D)
...
...
@@ -158,7 +158,7 @@ GB/s: 127.947
# arg6: print tensor value (0: no; 1: yes)
# arg7: time kernel (0: no, 1: yes)
# Following arguments (depending on number of spatial dims):
# Number of spatial dimensions (1=Conv1
d
, 2=Conv2
d
, 3=Conv3
d
)
# Number of spatial dimensions (1=Conv1
D
, 2=Conv2
D
, 3=Conv3
D
)
# G, N, K, C,
# <filter spatial dimensions>, (ie Y, X for 2D)
# <input image spatial dimensions>, (ie Hi, Wi for 2D)
...
...
@@ -201,7 +201,7 @@ Note: This kernel use atomic add, this will cause output buffer to be accumulate
# arg7: time kernel (0: no, 1: yes)
# arg8: operation type (0: ImageToColumn, 1: ColumnToImage)
# Following arguments (depending on number of spatial dims):
# Number of spatial dimensions (1=Conv1
d
, 2=Conv2
d
, 3=Conv3
d
)
# Number of spatial dimensions (1=Conv1
D
, 2=Conv2
D
, 3=Conv3
D
)
# G, N, K, C,
# <filter spatial dimensions>, (ie Y, X for 2D)
# <input image spatial dimensions>, (ie Hi, Wi for 2D)
...
...
profiler/include/profiler/profile_elementwise_layernorm_impl.hpp
View file @
c5138aa1
...
...
@@ -80,6 +80,8 @@ bool profile_elementwise_layernorm_impl(int do_verification,
Tensor
<
BetaDataType
>
beta
(
gammaBetaLength
);
Tensor
<
YDataType
>
y
(
length
);
Tensor
<
YDataType
>
host_y
(
length
);
Tensor
<
AccDataType
>
host_save_mean
({
M
});
Tensor
<
AccDataType
>
host_save_inv_std
({
M
});
switch
(
init_method
)
{
...
...
@@ -152,13 +154,22 @@ bool profile_elementwise_layernorm_impl(int do_verification,
BetaDataType
,
YDataType
,
AccDataType
,
AccDataType
,
PassThrough
,
Rank
,
NumReduceDim
>
;
ReferenceInstance
ref
;
auto
ref_argument
=
ref
.
MakeArgument
(
x
,
gamma
,
beta
,
host_y
,
PassThrough
{},
{
M
,
N
},
{
1
},
1e-4
);
auto
ref_argument
=
ref
.
MakeArgument
(
x
,
gamma
,
beta
,
host_y
,
host_save_mean
,
host_save_inv_std
,
PassThrough
{},
{
M
,
N
},
{
1
},
1e-4
);
auto
ref_invoker
=
ref
.
MakeInvoker
();
ref_invoker
.
Run
(
ref_argument
);
}
...
...
profiler/include/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp
View file @
c5138aa1
...
...
@@ -66,12 +66,15 @@ void host_gemm_layernorm(Tensor<HDataType>& h_m_n,
BetaDataType
,
HDataType
,
AccDataType
,
AccDataType
,
HElementOp
,
2
,
1
>
;
Tensor
<
EMeanVarDataType
>
e_m_n
(
HostTensorDescriptor
{
M
,
N
});
Tensor
<
AccDataType
>
c_m_n
(
HostTensorDescriptor
{
M
,
N
});
Tensor
<
AccDataType
>
save_mean
({
M
});
Tensor
<
AccDataType
>
save_inv_std
({
M
});
auto
ref_gemm
=
ReferenceGemm
{};
auto
ref_gemm_invoker
=
ref_gemm
.
MakeInvoker
();
...
...
@@ -97,7 +100,7 @@ void host_gemm_layernorm(Tensor<HDataType>& h_m_n,
auto
ref_layernorm_invoker
=
ref_layernorm
.
MakeInvoker
();
auto
ref_layernorm_argument
=
ref_layernorm
.
MakeArgument
(
e_m_n
,
gamma_n
,
beta_n
,
h_m_n
,
h_element_op
,
{
M
,
N
},
{
1
},
epsilon
);
e_m_n
,
gamma_n
,
beta_n
,
h_m_n
,
save_mean
,
save_inv_std
,
h_element_op
,
{
M
,
N
},
{
1
},
epsilon
);
ref_layernorm_invoker
.
Run
(
ref_layernorm_argument
);
}
...
...
profiler/include/profiler/profile_groupnorm_impl.hpp
View file @
c5138aa1
...
...
@@ -21,8 +21,10 @@ namespace profiler {
template
<
typename
XDataType
,
typename
GammaDataType
,
typename
BetaDataType
,
typename
AccDataType
,
typename
YDataType
>
typename
ComputeDataType
,
typename
YDataType
,
typename
SaveMeanInvStdDataType
,
bool
SaveMeanInvStd
>
bool
profile_groupnorm_impl
(
int
do_verification
,
int
init_method
,
bool
do_log
,
...
...
@@ -34,6 +36,7 @@ bool profile_groupnorm_impl(int do_verification,
if
(
length
.
size
()
!=
5
)
return
false
;
index_t
N
=
length
[
0
];
index_t
G
=
length
[
3
];
index_t
C
=
length
[
4
];
...
...
@@ -45,7 +48,14 @@ bool profile_groupnorm_impl(int do_verification,
Tensor
<
GammaDataType
>
gamma
(
gammaBetaLength
);
Tensor
<
BetaDataType
>
beta
(
gammaBetaLength
);
Tensor
<
YDataType
>
y
(
length
);
Tensor
<
SaveMeanInvStdDataType
>
save_mean
({
N
,
G
});
Tensor
<
SaveMeanInvStdDataType
>
save_inv_std
({
N
,
G
});
Tensor
<
YDataType
>
host_y
(
length
);
Tensor
<
SaveMeanInvStdDataType
>
host_save_mean
({
N
,
G
});
Tensor
<
SaveMeanInvStdDataType
>
host_save_inv_std
({
N
,
G
});
std
::
vector
<
index_t
>
strideSaveMeanInvStd
=
{
1
};
switch
(
init_method
)
{
...
...
@@ -69,6 +79,9 @@ bool profile_groupnorm_impl(int do_verification,
DeviceMem
gamma_dev
(
sizeof
(
GammaDataType
)
*
gamma
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
beta_dev
(
sizeof
(
BetaDataType
)
*
beta
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
y_dev
(
sizeof
(
YDataType
)
*
y
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
save_mean_dev
(
sizeof
(
SaveMeanInvStdDataType
)
*
save_mean
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
save_inv_std_dev
(
sizeof
(
SaveMeanInvStdDataType
)
*
save_inv_std
.
mDesc
.
GetElementSpaceSize
());
x_dev
.
ToDevice
(
x
.
mData
.
data
());
gamma_dev
.
ToDevice
(
gamma
.
mData
.
data
());
...
...
@@ -78,8 +91,8 @@ bool profile_groupnorm_impl(int do_verification,
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceNormalization
<
XDataType
,
GammaDataType
,
BetaDataType
,
AccDataType
,
YDataType
,
SaveMeanInvStdDataType
,
PassThrough
,
5
,
3
>
;
...
...
@@ -97,29 +110,56 @@ bool profile_groupnorm_impl(int do_verification,
if
(
do_verification
)
{
using
ReferenceInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGroupnorm
<
XDataType
,
using
ReferenceInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGroupnorm
<
XDataType
,
GammaDataType
,
BetaDataType
,
YDataType
,
AccDataType
,
SaveMeanInvStdDataType
,
ComputeDataType
,
PassThrough
>
;
ReferenceInstance
ref
;
auto
ref_argument
=
ref
.
MakeArgument
(
x
,
gamma
,
beta
,
host_y
,
PassThrough
{},
length
,
1e-6
);
auto
ref_argument
=
ref
.
MakeArgument
(
x
,
gamma
,
beta
,
host_y
,
host_save_mean
,
host_save_inv_std
,
PassThrough
{},
length
,
1e-6
);
auto
ref_invoker
=
ref
.
MakeInvoker
();
ref_invoker
.
Run
(
ref_argument
);
}
int
num_kernel
=
0
;
for
(
auto
&
inst_ptr
:
inst
ance
_ptr
s
)
{
auto
argument_ptr
=
inst_ptr
->
MakeArgumentPointer
(
auto
f_get_argument
=
[
&
](
auto
&
inst_ptr
)
{
if
constexpr
(
SaveMeanInvStd
)
return
inst_ptr
->
MakeArgumentPointer
(
length
,
std
::
vector
<
ck
::
index_t
>
{
x
.
mDesc
.
GetStrides
().
begin
(),
x
.
mDesc
.
GetStrides
().
end
()},
gammaBetaStride
,
gammaBetaStride
,
std
::
vector
<
ck
::
index_t
>
{
y
.
mDesc
.
GetStrides
().
begin
(),
y
.
mDesc
.
GetStrides
().
end
()},
std
::
vector
<
ck
::
index_t
>
{
save_mean
.
mDesc
.
GetStrides
().
begin
(),
save_mean
.
mDesc
.
GetStrides
().
end
()},
std
::
vector
<
ck
::
index_t
>
{
save_inv_std
.
mDesc
.
GetStrides
().
begin
(),
save_inv_std
.
mDesc
.
GetStrides
().
end
()},
reduce_dim
,
1e-6
,
x_dev
.
GetDeviceBuffer
(),
gamma_dev
.
GetDeviceBuffer
(),
beta_dev
.
GetDeviceBuffer
(),
y_dev
.
GetDeviceBuffer
(),
save_mean_dev
.
GetDeviceBuffer
(),
save_inv_std_dev
.
GetDeviceBuffer
(),
PassThrough
{});
else
return
inst_ptr
->
MakeArgumentPointer
(
length
,
std
::
vector
<
ck
::
index_t
>
{
x
.
mDesc
.
GetStrides
().
begin
(),
x
.
mDesc
.
GetStrides
().
end
()},
gammaBetaStride
,
gammaBetaStride
,
std
::
vector
<
ck
::
index_t
>
{
y
.
mDesc
.
GetStrides
().
begin
(),
y
.
mDesc
.
GetStrides
().
end
()},
std
::
vector
<
ck
::
index_t
>
{
save_mean
.
mDesc
.
GetStrides
().
begin
(),
save_mean
.
mDesc
.
GetStrides
().
end
()},
std
::
vector
<
ck
::
index_t
>
{
save_inv_std
.
mDesc
.
GetStrides
().
begin
(),
save_inv_std
.
mDesc
.
GetStrides
().
end
()},
reduce_dim
,
1e-6
,
x_dev
.
GetDeviceBuffer
(),
...
...
@@ -129,6 +169,11 @@ bool profile_groupnorm_impl(int do_verification,
nullptr
,
nullptr
,
PassThrough
{});
};
for
(
auto
&
inst_ptr
:
instance_ptrs
)
{
auto
argument_ptr
=
f_get_argument
(
inst_ptr
);
if
(
inst_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
...
...
@@ -152,6 +197,10 @@ bool profile_groupnorm_impl(int do_verification,
beta
.
mDesc
.
GetElementSize
()
*
sizeof
(
BetaDataType
)
+
y
.
mDesc
.
GetElementSize
()
*
sizeof
(
YDataType
);
if
constexpr
(
SaveMeanInvStd
)
num_bytes
+=
save_mean
.
mDesc
.
GetElementSpaceSize
()
*
sizeof
(
SaveMeanInvStdDataType
)
+
save_inv_std
.
mDesc
.
GetElementSpaceSize
()
*
sizeof
(
SaveMeanInvStdDataType
);
float
gb_per_sec
=
num_bytes
/
1.E6
/
avg_time
;
if
(
time_kernel
)
...
...
@@ -168,9 +217,22 @@ bool profile_groupnorm_impl(int do_verification,
if
(
do_verification
)
{
y_dev
.
FromDevice
(
y
.
mData
.
data
());
bool
pass
=
ck
::
utils
::
check_err
(
y
,
host_y
,
"Error: Incorrect results"
,
1e-3
,
1e-3
);
if
constexpr
(
SaveMeanInvStd
)
{
save_mean_dev
.
FromDevice
(
save_mean
.
mData
.
data
());
pass
&=
ck
::
utils
::
check_err
(
save_mean
.
mData
,
host_save_mean
.
mData
,
"Error: Incorrect results"
,
1e-3
,
1e-3
);
save_inv_std_dev
.
FromDevice
(
save_inv_std
.
mData
.
data
());
pass
&=
ck
::
utils
::
check_err
(
save_inv_std
.
mData
,
host_save_inv_std
.
mData
,
"Error: Incorrect results"
,
1e-3
,
1e-3
);
}
if
(
do_log
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"x : "
,
x
.
mData
,
","
)
<<
std
::
endl
;
...
...
profiler/include/profiler/profile_layernorm_impl.hpp
View file @
c5138aa1
...
...
@@ -21,6 +21,8 @@ template <typename XDataType,
typename
BetaDataType
,
typename
ComputeDataType
,
typename
YDataType
,
typename
SaveMeanInvStdDataType
,
bool
SaveMeanInvStd
,
index_t
Rank
>
bool
profile_layernorm_impl
(
int
do_verification
,
int
init_method
,
...
...
@@ -43,13 +45,19 @@ bool profile_layernorm_impl(int do_verification,
Tensor
<
GammaDataType
>
gamma
(
reduce_length
);
Tensor
<
BetaDataType
>
beta
(
reduce_length
);
Tensor
<
YDataType
>
y
(
length
);
Tensor
<
SaveMeanInvStdDataType
>
save_mean
({
length
[
0
]});
Tensor
<
SaveMeanInvStdDataType
>
save_inv_std
({
length
[
0
]});
Tensor
<
YDataType
>
host_y
(
length
);
Tensor
<
SaveMeanInvStdDataType
>
host_save_mean
({
length
[
0
]});
Tensor
<
SaveMeanInvStdDataType
>
host_save_inv_std
({
length
[
0
]});
std
::
vector
<
index_t
>
strideXY
=
std
::
vector
<
ck
::
index_t
>
{
x
.
mDesc
.
GetStrides
().
begin
(),
x
.
mDesc
.
GetStrides
().
end
()};
std
::
vector
<
index_t
>
strideGammaBeta
=
strideXY
;
strideGammaBeta
[
0
]
=
0
;
std
::
vector
<
index_t
>
strideSaveMeanInvStd
=
{
1
};
switch
(
init_method
)
{
case
0
:
...
...
@@ -75,6 +83,9 @@ bool profile_layernorm_impl(int do_verification,
DeviceMem
gamma_dev
(
sizeof
(
GammaDataType
)
*
gamma
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
beta_dev
(
sizeof
(
BetaDataType
)
*
beta
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
y_dev
(
sizeof
(
YDataType
)
*
y
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
save_mean_dev
(
sizeof
(
SaveMeanInvStdDataType
)
*
save_mean
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
save_inv_std_dev
(
sizeof
(
SaveMeanInvStdDataType
)
*
save_inv_std
.
mDesc
.
GetElementSpaceSize
());
x_dev
.
ToDevice
(
x
.
mData
.
data
());
gamma_dev
.
ToDevice
(
gamma
.
mData
.
data
());
...
...
@@ -86,8 +97,8 @@ bool profile_layernorm_impl(int do_verification,
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceNormalization
<
XDataType
,
GammaDataType
,
BetaDataType
,
ComputeDataType
,
YDataType
,
SaveMeanInvStdDataType
,
PassThrough
,
Rank
,
NumReduceDim
>
;
...
...
@@ -105,31 +116,60 @@ bool profile_layernorm_impl(int do_verification,
if
(
do_verification
)
{
using
ReferenceInstance
=
ck
::
tensor_operation
::
host
::
ReferenceLayernorm
<
XDataType
,
using
ReferenceInstance
=
ck
::
tensor_operation
::
host
::
ReferenceLayernorm
<
XDataType
,
GammaDataType
,
BetaDataType
,
YDataType
,
SaveMeanInvStdDataType
,
ComputeDataType
,
PassThrough
,
Rank
,
NumReduceDim
>
;
ReferenceInstance
ref
;
auto
ref_argument
=
ref
.
MakeArgument
(
x
,
gamma
,
beta
,
host_y
,
PassThrough
{},
length
,
reduce_dim
,
1e-4
);
auto
ref_argument
=
ref
.
MakeArgument
(
x
,
gamma
,
beta
,
host_y
,
host_save_mean
,
host_save_inv_std
,
PassThrough
{},
length
,
reduce_dim
,
1e-4
);
auto
ref_invoker
=
ref
.
MakeInvoker
();
ref_invoker
.
Run
(
ref_argument
);
}
int
num_kernel
=
0
;
for
(
auto
&
inst_ptr
:
inst
ance
_ptr
s
)
{
auto
argument_ptr
=
inst_ptr
->
MakeArgumentPointer
(
length
,
auto
f_get_argument
=
[
&
](
auto
&
inst_ptr
)
{
if
constexpr
(
SaveMeanInvStd
)
return
inst_ptr
->
MakeArgumentPointer
(
length
,
strideXY
,
strideGammaBeta
,
strideGammaBeta
,
strideXY
,
strideSaveMeanInvStd
,
strideSaveMeanInvStd
,
reduce_dim
,
1e-4
,
x_dev
.
GetDeviceBuffer
(),
gamma_dev
.
GetDeviceBuffer
(),
beta_dev
.
GetDeviceBuffer
(),
y_dev
.
GetDeviceBuffer
(),
save_mean_dev
.
GetDeviceBuffer
(),
save_inv_std_dev
.
GetDeviceBuffer
(),
PassThrough
{});
else
return
inst_ptr
->
MakeArgumentPointer
(
length
,
strideXY
,
strideGammaBeta
,
strideGammaBeta
,
strideXY
,
strideSaveMeanInvStd
,
strideSaveMeanInvStd
,
reduce_dim
,
1e-4
,
x_dev
.
GetDeviceBuffer
(),
...
...
@@ -139,6 +179,11 @@ bool profile_layernorm_impl(int do_verification,
nullptr
,
nullptr
,
PassThrough
{});
};
for
(
auto
&
inst_ptr
:
instance_ptrs
)
{
auto
argument_ptr
=
f_get_argument
(
inst_ptr
);
if
(
inst_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
...
...
@@ -168,6 +213,10 @@ bool profile_layernorm_impl(int do_verification,
beta
.
mDesc
.
GetElementSize
()
*
sizeof
(
BetaDataType
)
+
y
.
mDesc
.
GetElementSize
()
*
sizeof
(
YDataType
);
if
constexpr
(
SaveMeanInvStd
)
num_bytes
+=
save_mean
.
mDesc
.
GetElementSpaceSize
()
*
sizeof
(
SaveMeanInvStdDataType
)
+
save_inv_std
.
mDesc
.
GetElementSpaceSize
()
*
sizeof
(
SaveMeanInvStdDataType
);
float
gb_per_sec
=
num_bytes
/
1.E6
/
avg_time
;
if
(
time_kernel
)
...
...
@@ -184,10 +233,23 @@ bool profile_layernorm_impl(int do_verification,
if
(
do_verification
)
{
y_dev
.
FromDevice
(
y
.
mData
.
data
());
bool
pass
=
ck
::
utils
::
check_err
(
y
.
mData
,
host_y
.
mData
,
"Error: Incorrect results"
,
1e-3
,
1e-3
);
if
constexpr
(
SaveMeanInvStd
)
{
save_mean_dev
.
FromDevice
(
save_mean
.
mData
.
data
());
pass
&=
ck
::
utils
::
check_err
(
save_mean
.
mData
,
host_save_mean
.
mData
,
"Error: Incorrect results"
,
1e-3
,
1e-3
);
save_inv_std_dev
.
FromDevice
(
save_inv_std
.
mData
.
data
());
pass
&=
ck
::
utils
::
check_err
(
save_inv_std
.
mData
,
host_save_inv_std
.
mData
,
"Error: Incorrect results"
,
1e-3
,
1e-3
);
}
if
(
do_log
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"x : "
,
x
.
mData
,
","
)
<<
std
::
endl
;
...
...
profiler/src/CMakeLists.txt
View file @
c5138aa1
...
...
@@ -25,8 +25,6 @@ set(PROFILER_SOURCES
profile_batchnorm_fwd.cpp
profile_batchnorm_bwd.cpp
profile_batchnorm_infer.cpp
profile_contraction_bilinear.cpp
profile_contraction_scale.cpp
profile_grouped_conv_bwd_data.cpp
profile_conv_tensor_rearrange.cpp
)
...
...
@@ -46,6 +44,11 @@ if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp
)
endif
()
if
(
DTYPES MATCHES
"fp32"
OR DTYPES MATCHES
"fp64"
OR NOT DEFINED DTYPES
)
list
(
APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp
)
list
(
APPEND PROFILER_SOURCES profile_contraction_scale.cpp
)
endif
()
set
(
PROFILER_EXECUTABLE ckProfiler
)
add_executable
(
${
PROFILER_EXECUTABLE
}
${
PROFILER_SOURCES
}
)
...
...
@@ -76,8 +79,6 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_instan
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_softmax_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_reduce_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batchnorm_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_contraction_bilinear_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_contraction_scale_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_pool3d_fwd_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_avg_pool3d_bwd_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_max_pool_bwd_instance
)
...
...
@@ -85,9 +86,18 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_d
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv3d_bwd_data_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_image_to_column_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_column_to_image_instance
)
if
(
DTYPES MATCHES
"fp32"
OR DTYPES MATCHES
"fp64"
OR NOT DEFINED DTYPES
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_contraction_bilinear_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_contraction_scale_instance
)
endif
()
if
(
DL_KERNELS
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batched_gemm_multi_d_instance
)
endif
()
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_fastgelu_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_relu_add_layernorm_instance
)
...
...
profiler/src/profile_grouped_conv_bwd_weight.cpp
View file @
c5138aa1
...
...
@@ -86,12 +86,8 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
using
F32
=
float
;
using
F16
=
ck
::
half_t
;
using
BF16
=
ck
::
bhalf_t
;
#ifdef CK_ENABLE_FP8
using
F8
=
ck
::
f8_t
;
#endif
#ifdef CK_ENABLE_BF8
using
BF8
=
ck
::
bf8_t
;
#endif
using
namespace
ck
::
tensor_layout
::
convolution
;
...
...
@@ -141,59 +137,59 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
{
return
profile
(
I1
,
GNWC
{},
GKXC
{},
GNWK
{},
F32
{},
F32
{},
F32
{},
F32
{},
F32
{});
}
else
if
(
data_type
==
ConvDataType
::
F16_F16_F16
)
if
(
data_type
==
ConvDataType
::
F16_F16_F16
)
{
return
profile
(
I1
,
GNWC
{},
GKXC
{},
GNWK
{},
F16
{},
F16
{},
F16
{},
F16
{},
F16
{});
}
else
if
(
data_type
==
ConvDataType
::
BF16_F32_BF16
)
if
(
data_type
==
ConvDataType
::
BF16_F32_BF16
)
{
// fp32 atomic add is used for weight tensor in bf16 kernel
return
profile
(
I1
,
GNWC
{},
GKXC
{},
GNWK
{},
BF16
{},
F32
{},
BF16
{},
BF16
{},
BF16
{});
}
}
else
if
(
num_dim_spatial
==
2
&&
layout
==
ConvLayout
::
GNHWC_GKYXC_GNHWK
)
if
(
num_dim_spatial
==
2
&&
layout
==
ConvLayout
::
GNHWC_GKYXC_GNHWK
)
{
if
(
data_type
==
ConvDataType
::
F32_F32_F32
)
{
return
profile
(
I2
,
GNHWC
{},
GKYXC
{},
GNHWK
{},
F32
{},
F32
{},
F32
{},
F32
{},
F32
{});
}
else
if
(
data_type
==
ConvDataType
::
F16_F16_F16
)
if
(
data_type
==
ConvDataType
::
F16_F16_F16
)
{
return
profile
(
I2
,
GNHWC
{},
GKYXC
{},
GNHWK
{},
F16
{},
F16
{},
F16
{},
F16
{},
F16
{});
}
else
if
(
data_type
==
ConvDataType
::
BF16_F32_BF16
)
if
(
data_type
==
ConvDataType
::
BF16_F32_BF16
)
{
// fp32 atomic add is used for weight tensor in bf16 kernel
return
profile
(
I2
,
GNHWC
{},
GKYXC
{},
GNHWK
{},
BF16
{},
F32
{},
BF16
{},
BF16
{},
BF16
{});
}
}
else
if
(
num_dim_spatial
==
2
&&
layout
==
ConvLayout
::
NHWGC_GKYXC_NHWGK
)
if
(
num_dim_spatial
==
2
&&
layout
==
ConvLayout
::
NHWGC_GKYXC_NHWGK
)
{
if
(
data_type
==
ConvDataType
::
F32_F32_F32
)
{
return
profile
(
I2
,
NHWGC
{},
GKYXC
{},
NHWGK
{},
F32
{},
F32
{},
F32
{},
F32
{},
F32
{});
}
else
if
(
data_type
==
ConvDataType
::
F16_F16_F16
)
if
(
data_type
==
ConvDataType
::
F16_F16_F16
)
{
return
profile
(
I2
,
NHWGC
{},
GKYXC
{},
NHWGK
{},
F16
{},
F16
{},
F16
{},
F16
{},
F16
{});
}
else
if
(
data_type
==
ConvDataType
::
BF16_F32_BF16
)
if
(
data_type
==
ConvDataType
::
BF16_F32_BF16
)
{
// fp32 atomic add is used for weight tensor in bf16 kernel
return
profile
(
I2
,
NHWGC
{},
GKYXC
{},
NHWGK
{},
BF16
{},
F32
{},
BF16
{},
BF16
{},
BF16
{});
}
}
else
if
(
num_dim_spatial
==
3
&&
layout
==
ConvLayout
::
GNHWC_GKYXC_GNHWK
)
if
(
num_dim_spatial
==
3
&&
layout
==
ConvLayout
::
GNHWC_GKYXC_GNHWK
)
{
if
(
data_type
==
ConvDataType
::
F32_F32_F32
)
{
return
profile
(
I3
,
GNDHWC
{},
GKZYXC
{},
GNDHWK
{},
F32
{},
F32
{},
F32
{},
F32
{},
F32
{});
}
else
if
(
data_type
==
ConvDataType
::
F16_F16_F16
)
if
(
data_type
==
ConvDataType
::
F16_F16_F16
)
{
return
profile
(
I3
,
GNDHWC
{},
GKZYXC
{},
GNDHWK
{},
F16
{},
F16
{},
F16
{},
F16
{},
F16
{});
}
else
if
(
data_type
==
ConvDataType
::
BF16_F32_BF16
)
if
(
data_type
==
ConvDataType
::
BF16_F32_BF16
)
{
// fp32 atomic add is used for weight tensor in bf16 kernel
return
profile
(
I3
,
GNDHWC
{},
GKZYXC
{},
GNDHWK
{},
BF16
{},
F32
{},
BF16
{},
BF16
{},
BF16
{});
...
...
@@ -204,22 +200,22 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
I3
,
GNDHWC
{},
GKZYXC
{},
GNDHWK
{},
int8_t
{},
int8_t
{},
int8_t
{},
int8_t
{},
int8_t
{});
}
}
else
if
(
num_dim_spatial
==
3
&&
layout
==
ConvLayout
::
NHWGC_GKYXC_NHWGK
)
if
(
num_dim_spatial
==
3
&&
layout
==
ConvLayout
::
NHWGC_GKYXC_NHWGK
)
{
if
(
data_type
==
ConvDataType
::
F32_F32_F32
)
{
return
profile
(
I3
,
NDHWGC
{},
GKZYXC
{},
NDHWGK
{},
F32
{},
F32
{},
F32
{},
F32
{},
F32
{});
}
else
if
(
data_type
==
ConvDataType
::
F16_F16_F16
)
if
(
data_type
==
ConvDataType
::
F16_F16_F16
)
{
return
profile
(
I3
,
NDHWGC
{},
GKZYXC
{},
NDHWGK
{},
F16
{},
F16
{},
F16
{},
F16
{},
F16
{});
}
else
if
(
data_type
==
ConvDataType
::
BF16_F32_BF16
)
if
(
data_type
==
ConvDataType
::
BF16_F32_BF16
)
{
// fp32 atomic add is used for weight tensor in bf16 kernel
return
profile
(
I3
,
NDHWGC
{},
GKZYXC
{},
NDHWGK
{},
BF16
{},
F32
{},
BF16
{},
BF16
{},
BF16
{});
}
else
if
(
data_type
==
ConvDataType
::
F16_F16_F16_BF8_F8
)
if
(
data_type
==
ConvDataType
::
F16_F16_F16_BF8_F8
)
{
return
profile
(
I3
,
NDHWGC
{},
GKZYXC
{},
NDHWGK
{},
F16
{},
F16
{},
F16
{},
BF8
{},
F8
{});
}
...
...
profiler/src/profile_groupnorm.cpp
View file @
c5138aa1
...
...
@@ -93,12 +93,12 @@ int profile_groupnorm(int argc, char* argv[])
if
(
data_type
==
ck
::
DataTypeEnum
::
Float
)
{
ck
::
profiler
::
profile_groupnorm_impl
<
F32
,
F32
,
F32
,
F32
,
F32
>
(
ck
::
profiler
::
profile_groupnorm_impl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
false
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
);
}
else
if
(
data_type
==
ck
::
DataTypeEnum
::
Half
)
{
ck
::
profiler
::
profile_groupnorm_impl
<
F16
,
F16
,
F16
,
F32
,
F16
>
(
ck
::
profiler
::
profile_groupnorm_impl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
false
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
);
}
else
...
...
profiler/src/profile_layernorm.cpp
View file @
c5138aa1
...
...
@@ -82,12 +82,12 @@ int profile_layernorm(int argc, char* argv[])
if
(
data_type
==
ck
::
DataTypeEnum
::
Half
)
{
ck
::
profiler
::
profile_layernorm_impl
<
F16
,
F16
,
F16
,
F32
,
F16
,
rank
>
(
ck
::
profiler
::
profile_layernorm_impl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
false
,
rank
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
);
}
else
if
(
data_type
==
ck
::
DataTypeEnum
::
Float
)
{
ck
::
profiler
::
profile_layernorm_impl
<
F32
,
F32
,
F32
,
F32
,
F32
,
rank
>
(
ck
::
profiler
::
profile_layernorm_impl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
false
,
rank
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
);
}
else
...
...
test/CMakeLists.txt
View file @
c5138aa1
test/contraction/CMakeLists.txt
View file @
c5138aa1
add_gtest_executable
(
test_contraction test_contraction.cpp
)
target_link_libraries
(
test_contraction PRIVATE utility device_contraction_bilinear_instance device_contraction_scale_instance
)
list
(
APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942
)
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
if
(
gpu IN_LIST gpu_list AND target EQUAL 0
)
if
((
DTYPES MATCHES
"fp32"
OR DTYPES MATCHES
"fp64"
)
OR NOT DEFINED DTYPES
)
add_gtest_executable
(
test_contraction test_contraction.cpp
)
target_link_libraries
(
test_contraction PRIVATE utility device_contraction_bilinear_instance device_contraction_scale_instance
)
add_gtest_executable
(
test_contraction_interface test_contraction_interface.cpp
)
target_link_libraries
(
test_contraction_interface PRIVATE utility device_contraction_bilinear_instance device_contraction_scale_instance
)
set
(
target 1
)
endif
()
endif
()
endforeach
()
test/conv_tensor_rearrange/CMakeLists.txt
View file @
c5138aa1
add_gtest_executable
(
test_conv_tensor_rearrange test_conv_tensor_rearrange.cpp
)
target_link_libraries
(
test_conv_tensor_rearrange PRIVATE utility device_image_to_column_instance device_column_to_image_instance
)
add_gtest_executable
(
test_conv_tensor_rearrange_interface test_conv_tensor_rearrange_interface.cpp
)
target_link_libraries
(
test_conv_tensor_rearrange_interface PRIVATE utility
)
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment