Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
f30e5975
Commit
f30e5975
authored
Oct 24, 2023
by
Jun Liu
Browse files
Merge branch 'develop' into amd-develop
parents
91b414cd
bec84efb
Changes
207
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
262 additions
and
190 deletions
+262
-190
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
...v3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
+0
-0
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
...nv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
+0
-0
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
...nv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
+0
-0
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
...v3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+0
-0
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp
...ht_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp
+0
-0
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
...nv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+0
-0
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
...nv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+0
-0
library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
..._operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
+9
-10
library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/CMakeLists.txt
...eration_instance/gpu/grouped_gemm_fixed_nk/CMakeLists.txt
+6
-14
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp
...tance/gpu/normalization/device_groupnorm_f16_instance.cpp
+1
-1
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
...ation/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
+1
-1
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp
...gpu/normalization/device_groupnorm_swish_f16_instance.cpp
+1
-1
library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp
...nce/gpu/normalization/device_layernorm2d_f16_instance.cpp
+1
-1
library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp
...nce/gpu/normalization/device_layernorm4d_f16_instance.cpp
+1
-1
library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp
...tance/gpu/normalization/normalization_instance_common.hpp
+121
-121
profiler/README.md
profiler/README.md
+8
-6
profiler/include/profiler/profile_elementwise_layernorm_impl.hpp
...r/include/profiler/profile_elementwise_layernorm_impl.hpp
+14
-3
profiler/include/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp
...ude/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp
+4
-1
profiler/include/profiler/profile_gemm_splitk_impl.hpp
profiler/include/profiler/profile_gemm_splitk_impl.hpp
+6
-3
profiler/include/profiler/profile_groupnorm_impl.hpp
profiler/include/profiler/profile_groupnorm_impl.hpp
+89
-27
No files found.
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
→
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/
xdl/
device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
View file @
f30e5975
File moved
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
→
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/
xdl/
device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
View file @
f30e5975
File moved
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
→
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/
xdl/
device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
View file @
f30e5975
File moved
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
→
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/
xdl/
device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
View file @
f30e5975
File moved
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp
→
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/
xdl/
device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp
View file @
f30e5975
File moved
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
→
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/
xdl/
device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
View file @
f30e5975
File moved
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
→
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/
xdl/
device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
View file @
f30e5975
File moved
library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
View file @
f30e5975
add_instance_library
(
device_grouped_conv3d_fwd_instance
set
(
GROUPED_CONV3D_FWD
xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
xdl/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp
xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_fp8_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1p0_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1p0_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1p0_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1p0_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp
wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp
)
wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp
)
if
((
DTYPES MATCHES
"fp8"
AND DTYPES MATCHES
"fp16"
)
OR NOT DEFINED DTYPES
)
list
(
APPEND GROUPED_CONV3D_FWD
xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_fp8_instance.cpp
)
endif
()
add_instance_library
(
device_grouped_conv3d_fwd_instance
${
GROUPED_CONV3D_FWD
}
)
library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/CMakeLists.txt
View file @
f30e5975
set
(
GROUPED_GEMM_FIXED_NK_INSTANCES
)
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
list
(
APPEND GROUPED_GEMM_FIXED_NK_INSTANCES device_grouped_gemm_xdl_fixed_nk_f16_f16_f16_mk_kn_mn_instance.cpp
)
list
(
APPEND GROUPED_GEMM_FIXED_NK_INSTANCES device_grouped_gemm_xdl_fixed_nk_f16_f16_f16_mk_nk_mn_instance.cpp
)
endif
()
if
((
DTYPES MATCHES
"fp8"
AND DTYPES MATCHES
"fp16"
)
OR NOT DEFINED DTYPES
)
list
(
APPEND GROUPED_GEMM_FIXED_NK_INSTANCES device_grouped_gemm_xdl_fixed_nk_f16_fp8_f16_mk_kn_mn_instance.cpp
)
list
(
APPEND GROUPED_GEMM_FIXED_NK_INSTANCES device_grouped_gemm_xdl_fixed_nk_f16_fp8_f16_mk_nk_mn_instance.cpp
)
endif
()
if
((
DTYPES MATCHES
"int8"
AND DTYPES MATCHES
"fp16"
)
OR NOT DEFINED DTYPES
)
list
(
APPEND GROUPED_GEMM_FIXED_NK_INSTANCES device_grouped_gemm_xdl_fixed_nk_f16_i8_f16_mk_kn_mn_instance.cpp
)
list
(
APPEND GROUPED_GEMM_FIXED_NK_INSTANCES device_grouped_gemm_xdl_fixed_nk_f16_i8_f16_mk_nk_mn_instance.cpp
)
endif
()
list
(
APPEND GROUPED_GEMM_FIXED_NK_INSTANCES device_grouped_gemm_xdl_fixed_nk_f16_f16_f16_mk_kn_mn_instance.cpp
device_grouped_gemm_xdl_fixed_nk_f16_f16_f16_mk_nk_mn_instance.cpp
device_grouped_gemm_xdl_fixed_nk_f16_fp8_f16_mk_kn_mn_instance.cpp
device_grouped_gemm_xdl_fixed_nk_f16_fp8_f16_mk_nk_mn_instance.cpp
device_grouped_gemm_xdl_fixed_nk_f16_i8_f16_mk_kn_mn_instance.cpp
device_grouped_gemm_xdl_fixed_nk_f16_i8_f16_mk_nk_mn_instance.cpp
)
add_instance_library
(
device_grouped_gemm_fixed_nk_instance
${
GROUPED_GEMM_FIXED_NK_INSTANCES
}
)
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp
View file @
f30e5975
...
...
@@ -11,7 +11,7 @@ namespace instance {
using
Pass
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
void
add_device_normalization_rank_5_3_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F16
,
F16
,
F
32
,
F
16
,
Pass
,
5
,
3
>>>&
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F16
,
F16
,
F
16
,
F
32
,
Pass
,
5
,
3
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
...
...
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
View file @
f30e5975
...
...
@@ -11,7 +11,7 @@ namespace instance {
using
Swish
=
ck
::
tensor_operation
::
element_wise
::
Swish
;
void
add_device_normalization_rank_5_3_swish_f16_f32_f32_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F32
,
F32
,
F
32
,
F
16
,
Swish
,
5
,
3
>>>&
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F32
,
F32
,
F
16
,
F
32
,
Swish
,
5
,
3
>>>&
instances
)
{
add_device_operation_instances
(
...
...
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp
View file @
f30e5975
...
...
@@ -11,7 +11,7 @@ namespace instance {
using
Swish
=
ck
::
tensor_operation
::
element_wise
::
Swish
;
void
add_device_normalization_rank_5_3_swish_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F16
,
F16
,
F
32
,
F
16
,
Swish
,
5
,
3
>>>&
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F16
,
F16
,
F
16
,
F
32
,
Swish
,
5
,
3
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
...
...
library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp
View file @
f30e5975
...
...
@@ -11,7 +11,7 @@ namespace instance {
using
Pass
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
void
add_device_normalization_rank_2_1_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F16
,
F16
,
F
32
,
F
16
,
Pass
,
2
,
1
>>>&
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F16
,
F16
,
F
16
,
F
32
,
Pass
,
2
,
1
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
...
...
library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp
View file @
f30e5975
...
...
@@ -11,7 +11,7 @@ namespace instance {
using
Pass
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
void
add_device_normalization_rank_4_3_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F16
,
F16
,
F
32
,
F
16
,
Pass
,
4
,
3
>>>&
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F16
,
F16
,
F
16
,
F
32
,
Pass
,
4
,
3
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
...
...
library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp
View file @
f30e5975
...
...
@@ -22,25 +22,25 @@ template <typename OutElementwise, index_t Rank, index_t Reduce>
using
device_normalization_f16_instances
=
// clang-format off
std
::
tuple
<
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize>
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
2
,
1
,
2
,
1
,
2
,
1
,
2
,
2
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
64
,
1
,
64
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
32
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
2
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType,
SaveMeanInvStdDataType,
Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize
, SaveMeanInvStdScalarPerVector
>
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
2
,
1
,
2
,
1
,
2
,
1
,
2
,
2
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
64
,
1
,
64
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
32
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
2
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
2
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
// clang-format on
>
;
...
...
@@ -48,150 +48,150 @@ template <typename OutElementwise, index_t Rank, index_t Reduce>
using
device_normalization_splitk_f16_instances
=
// clang-format off
std
::
tuple
<
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize>
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
2
,
1
,
2
,
1
,
2
,
1
,
2
,
2
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
64
,
1
,
64
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
32
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
2
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
>
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType,
SaveMeanInvStdDataType,
Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize
, SaveMeanInvStdScalarPerVector
>
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
2
,
1
,
2
,
1
,
2
,
1
,
2
,
2
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
64
,
1
,
64
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
32
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
2
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
2
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
8
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
16
,
1
,
8
,
1
,
8
,
1
,
8
,
8
,
1
>
// clang-format on
>
;
template
<
typename
OutElementwise
,
index_t
Rank
,
index_t
Reduce
>
using
device_normalization_f16_generic_instance
=
std
::
tuple
<
// clang-format off
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
64
,
1
,
64
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
DeviceNormalizationImpl
<
F16
,
F16
,
F16
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
64
,
1
,
64
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
// clang-format on
>
;
template
<
typename
OutElementwise
,
index_t
Rank
,
index_t
Reduce
>
using
device_normalization_f32_instances
=
std
::
tuple
<
// clang-format off
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
2
,
1
,
2
,
1
,
2
,
1
,
2
,
2
>
,
// irregular size
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
2
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
2
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType,
SaveMeanInvStdDataType,
Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize,
GammaSrcVectorDim,
GammaSrcVectorSize,
BetaSrcVectorDim,
BetaSrcVectorSize, YDstVectorSize
, SaveMeanInvStdScalarPerVector
>
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
2
,
1
,
2
,
1
,
2
,
1
,
2
,
2
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
2
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
2
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
2
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
2
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
// clang-format on
>
;
template
<
typename
OutElementwise
,
index_t
Rank
,
index_t
Reduce
>
using
device_normalization_splitk_f32_instances
=
std
::
tuple
<
// clang-format off
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
2
,
1
,
2
,
1
,
2
,
1
,
2
,
2
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
2
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
2
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType,
SaveMeanInvStdDataType,
Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize,
GammaSrcVectorDim,
GammaSrcVectorSize,
BetaSrcVectorDim,
BetaSrcVectorSize, YDstVectorSize
, SaveMeanInvStdScalarPerVector
>
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
2
,
1
,
2
,
1
,
2
,
1
,
2
,
2
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
2
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
2
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
2
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
2
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
// clang-format on
>
;
template
<
typename
OutElementwise
,
index_t
Rank
,
index_t
Reduce
>
using
device_normalization_f32_generic_instance
=
std
::
tuple
<
// clang-format off
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
64
,
1
,
64
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
DeviceNormalizationImpl
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
64
,
1
,
64
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
// clang-format on
>
;
template
<
typename
OutElementwise
,
index_t
Rank
,
index_t
Reduce
>
using
device_normalization_f16_f32_f32_f16_instances
=
std
::
tuple
<
// clang-format off
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
2
,
1
,
2
,
1
,
2
,
1
,
2
,
2
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
2
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
2
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType,
SaveMeanInvStdDataType,
Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize,
GammaSrcVectorDim,
GammaSrcVectorSize,
BetaSrcVectorDim,
BetaSrcVectorSize, YDstVectorSize
, SaveMeanInvStdScalarPerVector
>
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
2
,
1
,
2
,
1
,
2
,
1
,
2
,
2
,
1
>
,
// irregular size
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
2
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
2
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
2
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
2
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
// clang-format on
>
;
template
<
typename
OutElementwise
,
index_t
Rank
,
index_t
Reduce
>
using
device_normalization_splitk_f16_f32_f32_f16_instances
=
std
::
tuple
<
// clang-format off
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
2
,
1
,
2
,
1
,
2
,
1
,
2
,
2
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
2
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
2
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
>
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType,
SaveMeanInvStdDataType,
Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize,
GammaSrcVectorDim,
GammaSrcVectorSize,
BetaSrcVectorDim,
BetaSrcVectorSize, YDstVectorSize
, SaveMeanInvStdScalarPerVector
>
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
2
,
1
,
2
,
1
,
2
,
1
,
2
,
2
,
1
>
,
// irregular size
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
128
,
1
,
128
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
2
,
16
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
2
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
512
,
1
,
512
,
2
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
2
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
4
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
,
DeviceNormalizationSplitKImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
1024
,
1
,
1024
,
1
,
8
,
1
,
4
,
1
,
4
,
1
,
4
,
4
,
1
>
// clang-format on
>
;
template
<
typename
OutElementwise
,
index_t
Rank
,
index_t
Reduce
>
using
device_normalization_f16_f32_f32_f16_generic_instance
=
std
::
tuple
<
// clang-format off
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
OutElementwise
,
Rank
,
Reduce
,
64
,
1
,
64
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
DeviceNormalizationImpl
<
F16
,
F32
,
F32
,
F32
,
F16
,
F32
,
OutElementwise
,
Rank
,
Reduce
,
64
,
1
,
64
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
>
// clang-format on
>
;
...
...
profiler/README.md
View file @
f30e5975
...
...
@@ -22,7 +22,7 @@ c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
Best Perf: 1.1933 ms, 107.977 TFlops, 79.0848 GB/s
```
## Profile 2
d
forward convolution kernels
## Profile 2
D
forward convolution kernels
```
bash
#arg1: tensor operation (conv=Convolution)
#arg2: data type (0=fp32, 1=fp16)
...
...
@@ -115,7 +115,7 @@ Best Perf: 58.0306 ms, 37.8942 TFlops, 27.7545 GB/s
# arg6: print tensor value (0: no; 1: yes)
# arg7: time kernel (0: no, 1: yes)
# Following arguments (depending on number of spatial dims):
# Number of spatial dimensions (1=Conv1
d
, 2=Conv2
d
, 3=Conv3
d
)
# Number of spatial dimensions (1=Conv1
D
, 2=Conv2
D
, 3=Conv3
D
)
# G, N, K, C,
# <filter spatial dimensions>, (ie Y, X for 2D)
# <input image spatial dimensions>, (ie Hi, Wi for 2D)
...
...
@@ -147,7 +147,9 @@ GB/s: 127.947
# arg1: tensor operation (grouped_conv_bwd_weight: Grouped Convolution Backward Weight)
# arg2: data type (0: Input fp32, Weight fp32, Output fp32
# 1: Input fp16, Weight fp16, Output fp16
# 2: Input bf16, Weight fp32, Output bf16)
# 2: Input bf16, Weight fp32, Output bf16
# 3: Input fp16, Weight fp16, Output fp16, Gemm bf8@fp8
# 4: Input int8, Weight int8, Output int8)
# arg3: tensor layout (0: Input[G, N, C, Hi, Wi], Weight[G, K, C, Y, X], Output[G, N, K, Ho, Wo]
# 1: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]
# 2: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K]
...
...
@@ -156,7 +158,7 @@ GB/s: 127.947
# arg6: print tensor value (0: no; 1: yes)
# arg7: time kernel (0: no, 1: yes)
# Following arguments (depending on number of spatial dims):
# Number of spatial dimensions (1=Conv1
d
, 2=Conv2
d
, 3=Conv3
d
)
# Number of spatial dimensions (1=Conv1
D
, 2=Conv2
D
, 3=Conv3
D
)
# G, N, K, C,
# <filter spatial dimensions>, (ie Y, X for 2D)
# <input image spatial dimensions>, (ie Hi, Wi for 2D)
...
...
@@ -167,7 +169,7 @@ GB/s: 127.947
# SplitK
################ op datatype layout verify init log time Ndims G N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx SplitK
./bin/ckProfiler grouped_conv_bwd_weight
1
0
1
1 0 1 2 32 256 256 512 3 3 28 28 1 1 1 1 1 0 0 0 1
./bin/ckProfiler grouped_conv_bwd_weight 1
1
0
1 0 1 2 32 256 256 512 3 3 28 28 1 1 1 1 1 0 0 0 1
```
...
...
@@ -199,7 +201,7 @@ Note: This kernel use atomic add, this will cause output buffer to be accumulate
# arg7: time kernel (0: no, 1: yes)
# arg8: operation type (0: ImageToColumn, 1: ColumnToImage)
# Following arguments (depending on number of spatial dims):
# Number of spatial dimensions (1=Conv1
d
, 2=Conv2
d
, 3=Conv3
d
)
# Number of spatial dimensions (1=Conv1
D
, 2=Conv2
D
, 3=Conv3
D
)
# G, N, K, C,
# <filter spatial dimensions>, (ie Y, X for 2D)
# <input image spatial dimensions>, (ie Hi, Wi for 2D)
...
...
profiler/include/profiler/profile_elementwise_layernorm_impl.hpp
View file @
f30e5975
...
...
@@ -80,6 +80,8 @@ bool profile_elementwise_layernorm_impl(int do_verification,
Tensor
<
BetaDataType
>
beta
(
gammaBetaLength
);
Tensor
<
YDataType
>
y
(
length
);
Tensor
<
YDataType
>
host_y
(
length
);
Tensor
<
AccDataType
>
host_save_mean
({
M
});
Tensor
<
AccDataType
>
host_save_inv_std
({
M
});
switch
(
init_method
)
{
...
...
@@ -152,14 +154,23 @@ bool profile_elementwise_layernorm_impl(int do_verification,
BetaDataType
,
YDataType
,
AccDataType
,
AccDataType
,
PassThrough
,
Rank
,
NumReduceDim
>
;
ReferenceInstance
ref
;
auto
ref_argument
=
ref
.
MakeArgument
(
x
,
gamma
,
beta
,
host_y
,
PassThrough
{},
{
M
,
N
},
{
1
},
1e-4
);
auto
ref_invoker
=
ref
.
MakeInvoker
();
auto
ref_argument
=
ref
.
MakeArgument
(
x
,
gamma
,
beta
,
host_y
,
host_save_mean
,
host_save_inv_std
,
PassThrough
{},
{
M
,
N
},
{
1
},
1e-4
);
auto
ref_invoker
=
ref
.
MakeInvoker
();
ref_invoker
.
Run
(
ref_argument
);
}
...
...
profiler/include/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp
View file @
f30e5975
...
...
@@ -66,12 +66,15 @@ void host_gemm_layernorm(Tensor<HDataType>& h_m_n,
BetaDataType
,
HDataType
,
AccDataType
,
AccDataType
,
HElementOp
,
2
,
1
>
;
Tensor
<
EMeanVarDataType
>
e_m_n
(
HostTensorDescriptor
{
M
,
N
});
Tensor
<
AccDataType
>
c_m_n
(
HostTensorDescriptor
{
M
,
N
});
Tensor
<
AccDataType
>
save_mean
({
M
});
Tensor
<
AccDataType
>
save_inv_std
({
M
});
auto
ref_gemm
=
ReferenceGemm
{};
auto
ref_gemm_invoker
=
ref_gemm
.
MakeInvoker
();
...
...
@@ -97,7 +100,7 @@ void host_gemm_layernorm(Tensor<HDataType>& h_m_n,
auto
ref_layernorm_invoker
=
ref_layernorm
.
MakeInvoker
();
auto
ref_layernorm_argument
=
ref_layernorm
.
MakeArgument
(
e_m_n
,
gamma_n
,
beta_n
,
h_m_n
,
h_element_op
,
{
M
,
N
},
{
1
},
epsilon
);
e_m_n
,
gamma_n
,
beta_n
,
h_m_n
,
save_mean
,
save_inv_std
,
h_element_op
,
{
M
,
N
},
{
1
},
epsilon
);
ref_layernorm_invoker
.
Run
(
ref_layernorm_argument
);
}
...
...
profiler/include/profiler/profile_gemm_splitk_impl.hpp
View file @
f30e5975
...
...
@@ -30,7 +30,8 @@ template <typename ADataType,
typename
CDataType
,
typename
ALayout
,
typename
BLayout
,
typename
CLayout
>
typename
CLayout
,
typename
ComputeType
=
CDataType
>
bool
profile_gemm_splitk_impl
(
int
do_verification
,
int
init_method
,
bool
do_log
,
...
...
@@ -103,7 +104,8 @@ bool profile_gemm_splitk_impl(int do_verification,
CDataType
,
AElementOp
,
BElementOp
,
CElementOp
>
;
CElementOp
,
ComputeType
>
;
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
...
...
@@ -120,7 +122,8 @@ bool profile_gemm_splitk_impl(int do_verification,
AccDataType
,
AElementOp
,
BElementOp
,
CElementOp
>
;
CElementOp
,
ComputeType
>
;
auto
ref_gemm
=
ReferenceGemmInstance
{};
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
...
...
profiler/include/profiler/profile_groupnorm_impl.hpp
View file @
f30e5975
...
...
@@ -21,8 +21,10 @@ namespace profiler {
template
<
typename
XDataType
,
typename
GammaDataType
,
typename
BetaDataType
,
typename
AccDataType
,
typename
YDataType
>
typename
ComputeDataType
,
typename
YDataType
,
typename
SaveMeanInvStdDataType
,
bool
SaveMeanInvStd
>
bool
profile_groupnorm_impl
(
int
do_verification
,
int
init_method
,
bool
do_log
,
...
...
@@ -34,6 +36,7 @@ bool profile_groupnorm_impl(int do_verification,
if
(
length
.
size
()
!=
5
)
return
false
;
index_t
N
=
length
[
0
];
index_t
G
=
length
[
3
];
index_t
C
=
length
[
4
];
...
...
@@ -45,7 +48,14 @@ bool profile_groupnorm_impl(int do_verification,
Tensor
<
GammaDataType
>
gamma
(
gammaBetaLength
);
Tensor
<
BetaDataType
>
beta
(
gammaBetaLength
);
Tensor
<
YDataType
>
y
(
length
);
Tensor
<
SaveMeanInvStdDataType
>
save_mean
({
N
,
G
});
Tensor
<
SaveMeanInvStdDataType
>
save_inv_std
({
N
,
G
});
Tensor
<
YDataType
>
host_y
(
length
);
Tensor
<
SaveMeanInvStdDataType
>
host_save_mean
({
N
,
G
});
Tensor
<
SaveMeanInvStdDataType
>
host_save_inv_std
({
N
,
G
});
std
::
vector
<
index_t
>
strideSaveMeanInvStd
=
{
1
};
switch
(
init_method
)
{
...
...
@@ -69,6 +79,9 @@ bool profile_groupnorm_impl(int do_verification,
DeviceMem
gamma_dev
(
sizeof
(
GammaDataType
)
*
gamma
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
beta_dev
(
sizeof
(
BetaDataType
)
*
beta
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
y_dev
(
sizeof
(
YDataType
)
*
y
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
save_mean_dev
(
sizeof
(
SaveMeanInvStdDataType
)
*
save_mean
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
save_inv_std_dev
(
sizeof
(
SaveMeanInvStdDataType
)
*
save_inv_std
.
mDesc
.
GetElementSpaceSize
());
x_dev
.
ToDevice
(
x
.
mData
.
data
());
gamma_dev
.
ToDevice
(
gamma
.
mData
.
data
());
...
...
@@ -78,8 +91,8 @@ bool profile_groupnorm_impl(int do_verification,
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceNormalization
<
XDataType
,
GammaDataType
,
BetaDataType
,
AccDataType
,
YDataType
,
SaveMeanInvStdDataType
,
PassThrough
,
5
,
3
>
;
...
...
@@ -97,38 +110,70 @@ bool profile_groupnorm_impl(int do_verification,
if
(
do_verification
)
{
using
ReferenceInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGroupnorm
<
XDataType
,
GammaDataType
,
BetaDataType
,
YDataType
,
AccDataType
,
PassThrough
>
;
using
ReferenceInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGroupnorm
<
XDataType
,
GammaDataType
,
BetaDataType
,
YDataType
,
SaveMeanInvStdDataType
,
ComputeDataType
,
PassThrough
>
;
ReferenceInstance
ref
;
auto
ref_argument
=
ref
.
MakeArgument
(
x
,
gamma
,
beta
,
host_y
,
PassThrough
{},
length
,
1e-6
);
auto
ref_invoker
=
ref
.
MakeInvoker
();
auto
ref_argument
=
ref
.
MakeArgument
(
x
,
gamma
,
beta
,
host_y
,
host_save_mean
,
host_save_inv_std
,
PassThrough
{},
length
,
1e-6
);
auto
ref_invoker
=
ref
.
MakeInvoker
();
ref_invoker
.
Run
(
ref_argument
);
}
int
num_kernel
=
0
;
auto
f_get_argument
=
[
&
](
auto
&
inst_ptr
)
{
if
constexpr
(
SaveMeanInvStd
)
return
inst_ptr
->
MakeArgumentPointer
(
length
,
std
::
vector
<
ck
::
index_t
>
{
x
.
mDesc
.
GetStrides
().
begin
(),
x
.
mDesc
.
GetStrides
().
end
()},
gammaBetaStride
,
gammaBetaStride
,
std
::
vector
<
ck
::
index_t
>
{
y
.
mDesc
.
GetStrides
().
begin
(),
y
.
mDesc
.
GetStrides
().
end
()},
std
::
vector
<
ck
::
index_t
>
{
save_mean
.
mDesc
.
GetStrides
().
begin
(),
save_mean
.
mDesc
.
GetStrides
().
end
()},
std
::
vector
<
ck
::
index_t
>
{
save_inv_std
.
mDesc
.
GetStrides
().
begin
(),
save_inv_std
.
mDesc
.
GetStrides
().
end
()},
reduce_dim
,
1e-6
,
x_dev
.
GetDeviceBuffer
(),
gamma_dev
.
GetDeviceBuffer
(),
beta_dev
.
GetDeviceBuffer
(),
y_dev
.
GetDeviceBuffer
(),
save_mean_dev
.
GetDeviceBuffer
(),
save_inv_std_dev
.
GetDeviceBuffer
(),
PassThrough
{});
else
return
inst_ptr
->
MakeArgumentPointer
(
length
,
std
::
vector
<
ck
::
index_t
>
{
x
.
mDesc
.
GetStrides
().
begin
(),
x
.
mDesc
.
GetStrides
().
end
()},
gammaBetaStride
,
gammaBetaStride
,
std
::
vector
<
ck
::
index_t
>
{
y
.
mDesc
.
GetStrides
().
begin
(),
y
.
mDesc
.
GetStrides
().
end
()},
std
::
vector
<
ck
::
index_t
>
{
save_mean
.
mDesc
.
GetStrides
().
begin
(),
save_mean
.
mDesc
.
GetStrides
().
end
()},
std
::
vector
<
ck
::
index_t
>
{
save_inv_std
.
mDesc
.
GetStrides
().
begin
(),
save_inv_std
.
mDesc
.
GetStrides
().
end
()},
reduce_dim
,
1e-6
,
x_dev
.
GetDeviceBuffer
(),
gamma_dev
.
GetDeviceBuffer
(),
beta_dev
.
GetDeviceBuffer
(),
y_dev
.
GetDeviceBuffer
(),
nullptr
,
nullptr
,
PassThrough
{});
};
for
(
auto
&
inst_ptr
:
instance_ptrs
)
{
auto
argument_ptr
=
inst_ptr
->
MakeArgumentPointer
(
length
,
std
::
vector
<
ck
::
index_t
>
{
x
.
mDesc
.
GetStrides
().
begin
(),
x
.
mDesc
.
GetStrides
().
end
()},
gammaBetaStride
,
gammaBetaStride
,
std
::
vector
<
ck
::
index_t
>
{
y
.
mDesc
.
GetStrides
().
begin
(),
y
.
mDesc
.
GetStrides
().
end
()},
reduce_dim
,
1e-6
,
x_dev
.
GetDeviceBuffer
(),
gamma_dev
.
GetDeviceBuffer
(),
beta_dev
.
GetDeviceBuffer
(),
y_dev
.
GetDeviceBuffer
(),
nullptr
,
nullptr
,
PassThrough
{});
auto
argument_ptr
=
f_get_argument
(
inst_ptr
);
if
(
inst_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
...
...
@@ -152,6 +197,10 @@ bool profile_groupnorm_impl(int do_verification,
beta
.
mDesc
.
GetElementSize
()
*
sizeof
(
BetaDataType
)
+
y
.
mDesc
.
GetElementSize
()
*
sizeof
(
YDataType
);
if
constexpr
(
SaveMeanInvStd
)
num_bytes
+=
save_mean
.
mDesc
.
GetElementSpaceSize
()
*
sizeof
(
SaveMeanInvStdDataType
)
+
save_inv_std
.
mDesc
.
GetElementSpaceSize
()
*
sizeof
(
SaveMeanInvStdDataType
);
float
gb_per_sec
=
num_bytes
/
1.E6
/
avg_time
;
if
(
time_kernel
)
...
...
@@ -168,9 +217,22 @@ bool profile_groupnorm_impl(int do_verification,
if
(
do_verification
)
{
y_dev
.
FromDevice
(
y
.
mData
.
data
());
bool
pass
=
ck
::
utils
::
check_err
(
y
,
host_y
,
"Error: Incorrect results"
,
1e-3
,
1e-3
);
if
constexpr
(
SaveMeanInvStd
)
{
save_mean_dev
.
FromDevice
(
save_mean
.
mData
.
data
());
pass
&=
ck
::
utils
::
check_err
(
save_mean
.
mData
,
host_save_mean
.
mData
,
"Error: Incorrect results"
,
1e-3
,
1e-3
);
save_inv_std_dev
.
FromDevice
(
save_inv_std
.
mData
.
data
());
pass
&=
ck
::
utils
::
check_err
(
save_inv_std
.
mData
,
host_save_inv_std
.
mData
,
"Error: Incorrect results"
,
1e-3
,
1e-3
);
}
if
(
do_log
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"x : "
,
x
.
mData
,
","
)
<<
std
::
endl
;
...
...
Prev
1
…
5
6
7
8
9
10
11
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment