gaoqiong / composable_kernel / Commits / 4be21df8

Commit 4be21df8, authored Nov 26, 2022 by Po-Yen, Chen

    Merge branch 'batchnorm_fwd_fix' into feature/restruct-ckprofiler

Parents: 02ff2522, 3ebb4208

Showing 6 changed files with 31 additions and 237 deletions (+31 −237; whitespace-only changes hidden).
  +0  −13   library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp
  +0  −1    library/src/tensor_operation_instance/gpu/batchnorm/CMakeLists.txt
  +0  −145  library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_i8_instance.cpp
  +30 −58   profiler/include/profiler/profile_batchnorm_forward_impl.hpp
  +1  −19   profiler/src/profile_batchnorm_fwd.cpp
  +0  −1    test/batchnorm_fwd/batchnorm_fwd_rank_4.cpp
library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp  (view file @ 4be21df8)

@@ -31,10 +31,6 @@ void add_device_batchnorm_forward_rank_4_3_bf16_instances(
     std::vector<
         std::unique_ptr<DeviceBatchNormFwd<BF16, BF16, F32, BF16, BF16, F32, PassThrough, 4, 3>>>&);
 
-// Int8
-void add_device_batchnorm_forward_rank_4_3_i8_instances(
-    std::vector<
-        std::unique_ptr<DeviceBatchNormFwd<I8, I8, F32, I8, I8, F32, PassThrough, 4, 3>>>&);
 // FP64
 void add_device_batchnorm_forward_rank_4_3_f64_instances(
     std::vector<

@@ -101,15 +97,6 @@ struct DeviceOperationInstanceFactory<
             add_device_batchnorm_forward_rank_4_3_bf16_instances(op_ptrs);
         }
     }
-    else if constexpr(is_same_v<XDataType, I8> && is_same_v<YDataType, I8> &&
-                      is_same_v<AccDataType, F32> && is_same_v<ScaleDataType, I8> &&
-                      is_same_v<BiasDataType, I8> && is_same_v<MeanVarDataType, F32>)
-    {
-        if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v<YElementwiseOp, PassThrough>)
-        {
-            add_device_batchnorm_forward_rank_4_3_i8_instances(op_ptrs);
-        }
-    }
     else if constexpr(is_same_v<XDataType, F64> && is_same_v<YDataType, F64> &&
                       is_same_v<AccDataType, F64> && is_same_v<ScaleDataType, F64> &&
                       is_same_v<BiasDataType, F64> && is_same_v<MeanVarDataType, F64>)
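For context on the hunk above: client code does not call the add_* functions directly; it asks the DeviceOperationInstanceFactory named in the hunk header for instances. A minimal usage sketch, assuming the library's usual GetInstances() factory interface (namespaces and the include path are taken from this header; nothing below is part of the commit):

// Hedged sketch, not part of this commit.
#include "ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp"

using ck::tensor_operation::device::DeviceBatchNormFwd;
using ck::tensor_operation::element_wise::PassThrough;

// Rank-4 NHWC batchnorm reducing 3 dims, bf16 in/out, fp32 accumulation.
using DeviceOp = DeviceBatchNormFwd<ck::bhalf_t, ck::bhalf_t, float,
                                    ck::bhalf_t, ck::bhalf_t, float,
                                    PassThrough, 4, 3>;

// Returns every registered instance for this configuration. After this
// commit, an int8 specialization would match no branch in the factory,
// so the returned vector would simply be empty.
const auto op_ptrs = ck::tensor_operation::device::instance::
    DeviceOperationInstanceFactory<DeviceOp>::GetInstances();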
library/src/tensor_operation_instance/gpu/batchnorm/CMakeLists.txt  (view file @ 4be21df8)

@@ -2,6 +2,5 @@ add_instance_library(device_batchnorm_instance
     device_batchnorm_forward_f16_instance.cpp
     device_batchnorm_forward_f32_instance.cpp
     device_batchnorm_forward_bf16_instance.cpp
-    device_batchnorm_forward_i8_instance.cpp
     device_batchnorm_forward_f64_instance.cpp
 )
library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_i8_instance.cpp  (deleted, 100644 → 0; view file @ 02ff2522)

The entire file is deleted:

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp"
#include "ck/utility/data_type.hpp"

#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using I8  = int8_t;
using F32 = float;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;

// clang-format off
template <index_t Rank, index_t NumReduceDim, typename YElementwiseOp>
using device_batchnorm_forward_i8_blockwise_instances = std::tuple<
    // XDataType, YDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, YElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XSrcYDstVectorDim, XSrcVectorSize, YDstVectorSize, ScaleSrcVectorSize, BiasSrcVectorSize, MeanVarSrcDstVectorSize
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 128, 2, 2, 2, 0, 2, 2, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 128, 2, 2, 2, 0, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 128, 2, 2, 2, 0, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 128, 2, 2, 2, 0, 2, 2, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 128, 2, 2, 2, 1, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 128, 2, 2, 2, 1, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 64, 4, 2, 2, 0, 2, 2, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 64, 4, 2, 2, 0, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 64, 4, 2, 2, 0, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 64, 4, 2, 2, 0, 2, 2, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 64, 4, 2, 2, 1, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 64, 4, 2, 2, 1, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 32, 8, 2, 2, 0, 2, 2, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 32, 8, 2, 2, 0, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 32, 8, 2, 2, 0, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 32, 8, 2, 2, 0, 2, 2, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 32, 8, 2, 2, 1, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 32, 8, 2, 2, 1, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 16, 16, 2, 2, 0, 2, 2, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 16, 16, 2, 2, 0, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 16, 16, 2, 2, 0, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 16, 16, 2, 2, 0, 2, 2, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 16, 16, 2, 2, 1, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 16, 16, 2, 2, 1, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 8, 32, 2, 2, 0, 2, 2, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 8, 32, 2, 2, 0, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 8, 32, 2, 2, 0, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 8, 32, 2, 2, 0, 2, 2, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 8, 32, 2, 2, 1, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 8, 32, 2, 2, 1, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 4, 64, 2, 2, 0, 2, 2, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 4, 64, 2, 2, 0, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 4, 64, 2, 2, 0, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 4, 64, 2, 2, 0, 2, 2, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 4, 64, 2, 2, 1, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 4, 64, 2, 2, 1, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 2, 128, 2, 2, 0, 2, 2, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 2, 128, 2, 2, 0, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 2, 128, 2, 2, 0, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 2, 128, 2, 2, 0, 2, 2, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 2, 128, 2, 2, 1, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 2, 128, 2, 2, 1, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 1, 256, 2, 2, 0, 2, 2, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 1, 256, 2, 2, 0, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 1, 256, 2, 2, 0, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 1, 256, 2, 2, 0, 2, 2, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 1, 256, 2, 2, 1, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, false, 256, 1, 256, 2, 2, 1, 1, 1, 1, 1, 1>>;
// clang-format on

// clang-format off
template <index_t Rank, index_t NumReduceDim, typename YElementwiseOp>
using device_batchnorm_forward_i8_multiblock_instances = std::tuple<
    // XDataType, YDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, YElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XSrcYDstVectorDim, XSrcVectorSize, YDstVectorSize, ScaleSrcVectorSize, BiasSrcVectorSize, MeanVarSrcDstVectorSize
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 128, 2, 2, 2, 0, 2, 2, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 128, 2, 2, 2, 0, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 128, 2, 2, 2, 0, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 128, 2, 2, 2, 0, 2, 2, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 128, 2, 2, 2, 1, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 128, 2, 2, 2, 1, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 64, 4, 2, 2, 0, 2, 2, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 64, 4, 2, 2, 0, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 64, 4, 2, 2, 0, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 64, 4, 2, 2, 0, 2, 2, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 64, 4, 2, 2, 1, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 64, 4, 2, 2, 1, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 32, 8, 2, 2, 0, 2, 2, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 32, 8, 2, 2, 0, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 32, 8, 2, 2, 0, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 32, 8, 2, 2, 0, 2, 2, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 32, 8, 2, 2, 1, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 32, 8, 2, 2, 1, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 16, 16, 2, 2, 0, 2, 2, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 16, 16, 2, 2, 0, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 16, 16, 2, 2, 0, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 16, 16, 2, 2, 0, 2, 2, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 16, 16, 2, 2, 1, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 16, 16, 2, 2, 1, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 8, 32, 2, 2, 0, 2, 2, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 8, 32, 2, 2, 0, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 8, 32, 2, 2, 0, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 8, 32, 2, 2, 0, 2, 2, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 8, 32, 2, 2, 1, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 8, 32, 2, 2, 1, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 4, 64, 2, 2, 0, 2, 2, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 4, 64, 2, 2, 0, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 4, 64, 2, 2, 0, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 4, 64, 2, 2, 0, 2, 2, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 4, 64, 2, 2, 1, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 4, 64, 2, 2, 1, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 2, 128, 2, 2, 0, 2, 2, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 2, 128, 2, 2, 0, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 2, 128, 2, 2, 0, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 2, 128, 2, 2, 0, 2, 2, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 2, 128, 2, 2, 1, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 2, 128, 2, 2, 1, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 1, 256, 2, 2, 0, 2, 2, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 1, 256, 2, 2, 0, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 1, 256, 2, 2, 0, 1, 1, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 1, 256, 2, 2, 0, 2, 2, 1, 1, 1>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 1, 256, 2, 2, 1, 1, 1, 2, 2, 2>,
    DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, YElementwiseOp, Rank, NumReduceDim, true, 256, 1, 256, 2, 2, 1, 1, 1, 1, 1, 1>>;
// clang-format on

void add_device_batchnorm_forward_rank_4_3_i8_instances(
    std::vector<std::unique_ptr<
        DeviceBatchNormFwd<I8, I8, F32, I8, I8, F32, PassThrough, 4, 3>>>& instances)
{
    add_device_operation_instances(
        instances, device_batchnorm_forward_i8_blockwise_instances<4, 3, PassThrough>{});
    add_device_operation_instances(
        instances, device_batchnorm_forward_i8_multiblock_instances<4, 3, PassThrough>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
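Each tuple entry above is one concrete tuning configuration; the two lists differ only in UseMultiBlockInK (single-workgroup vs. multi-workgroup reduction of the K dimension). For readability, here is the first blockwise entry re-wrapped with one parameter per line against the file's own column comment. The annotation is added for illustration only; the values are copied verbatim from the deleted file:

// Illustration only, not part of the commit. 4, 3, PassThrough stand in
// for the Rank/NumReduceDim/YElementwiseOp template arguments.
using DecodedExample = DeviceBatchNormFwdImpl<
    I8, I8, F32, I8, I8, F32, // X, Y, Acc, Scale, Bias, MeanVar data types
    PassThrough,              // YElementwiseOp
    4,                        // Rank
    3,                        // NumReduceDim
    false,                    // UseMultiBlockInK (false = blockwise reduction)
    256,                      // BlockSize
    128,                      // MThreadClusterSize
    2,                        // KThreadClusterSize (128 x 2 = 256 threads)
    2,                        // MThreadSliceSize
    2,                        // KThreadSliceSize
    0,                        // XSrcYDstVectorDim
    2,                        // XSrcVectorSize
    2,                        // YDstVectorSize
    2,                        // ScaleSrcVectorSize
    2,                        // BiasSrcVectorSize
    2>;                       // MeanVarSrcDstVectorSize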
profiler/include/profiler/profile_batchnorm_forward_impl.hpp  (view file @ 4be21df8)

@@ -85,39 +85,22 @@ bool profile_batchnorm_forward_impl(int do_verification,
     if(updateMovingAverage)
     {
-        if constexpr(ck::is_same_v<XDataType, int8_t>)
-        {
-            x.GenerateTensorValue(GeneratorTensor_2<XDataType>{-5, 5}, num_thread);
-
-            const float x_mean       = 0.0f;
-            const float x_stddev     = 2.5f;
-            const float noise_stddev = 0.04f;
-
-            // initialize the runningMean to be values with tiny variation to the mean of the x
-            // values
-            resultRunningMean_ref.GenerateTensorValue(
-                GeneratorTensor_4<MeanVarDataType>{x_mean, noise_stddev}, num_thread);
-
-            // initialize the runningVariance to be values with tiny variation to the variance of
-            // the x values
-            resultRunningVariance_ref.GenerateTensorValue(
-                GeneratorTensor_4<MeanVarDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);
-        }
-        else
-        {
         const float x_mean       = 0.0f;
         const float x_stddev     = 1.0f;
         const float noise_stddev = 0.04f;

         // input data in normal distribution
         x.GenerateTensorValue(GeneratorTensor_4<XDataType>{x_mean, x_stddev}, num_thread);

         // initialize the runningMean to be values with tiny variation to the mean of the x
         // values
         resultRunningMean_ref.GenerateTensorValue(
             GeneratorTensor_4<MeanVarDataType>{x_mean, noise_stddev}, num_thread);

         // initialize the runningVariance to be values with tiny variation to the variance of
         // the x values
         resultRunningVariance_ref.GenerateTensorValue(
             GeneratorTensor_4<MeanVarDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);
-        };
     }
     else
     {

@@ -129,35 +112,24 @@ bool profile_batchnorm_forward_impl(int do_verification,
     if(do_verification)
     {
-        if constexpr(ck::is_same_v<ScaleDataType, int8_t> && ck::is_same_v<BiasDataType, int8_t>)
-        {
-            bnScale.GenerateTensorValue(GeneratorTensor_2<ScaleDataType>{-5, 5}, num_thread);
-            bnBias.GenerateTensorValue(GeneratorTensor_2<BiasDataType>{-5, 5}, num_thread);
-        }
-        else
-        {
         switch(init_method)
         {
         case 0:
             bnScale.GenerateTensorValue(GeneratorTensor_0<ScaleDataType>{}, num_thread);
             bnBias.GenerateTensorValue(GeneratorTensor_0<BiasDataType>{}, num_thread);
             break;
         case 1:
             bnScale.GenerateTensorValue(GeneratorTensor_1<ScaleDataType>{1}, num_thread);
             bnBias.GenerateTensorValue(GeneratorTensor_1<BiasDataType>{0}, num_thread);
             break;
         case 2:
             bnScale.GenerateTensorValue(GeneratorTensor_2<ScaleDataType>{-5, 5}, num_thread);
             bnBias.GenerateTensorValue(GeneratorTensor_2<BiasDataType>{-5, 5}, num_thread);
             break;
         default:
             bnScale.GenerateTensorValue(GeneratorTensor_3<ScaleDataType>{-1.0f, 1.0f}, num_thread);
             bnBias.GenerateTensorValue(GeneratorTensor_3<BiasDataType>{-1.0f, 1.0f}, num_thread);
         }
-        };
     };

     // these buffers are usually provided by the user application
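For orientation, this helper is what ckProfiler invokes once per data type. A sketch of one surviving (fp32) invocation follows; the argument order is copied from the removed int8 call site visible in profile_batchnorm_fwd.cpp below, while the literal values are illustrative only and not from this commit:

// Hedged sketch: F32 is the profiler's alias for float, and init_method 3
// means decimal values per the help text below.
bool pass = profile_batchnorm_forward_impl<F32, F32, F32, F32, F32, F32, 4, 3>(
    1,                    // do_verification
    3,                    // init_method
    0,                    // do_dumpout
    1,                    // time_kernel
    {256, 14, 14, 1024},  // inOutLengths (NHWC)
    {0, 1, 2},            // reduceDims: reduce N, H, W
    true,                 // updateMovingAverage
    true,                 // saveMeanAndInvVariance
    1e-5,                 // epsilon
    0.1);                 // averageFactor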
profiler/src/profile_batchnorm_fwd.cpp  (view file @ 4be21df8)

@@ -49,7 +49,7 @@ class BatchnormFwdArgParser
         std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension lengths, must have 4 integers for nhwc" << std::endl;
         std::cout << "--reduceDims or -R, comma separated list of dimensions to reduce on" << std::endl;
         std::cout << "--verify or -v, 1/0 to indicate whether to verify the result by comparing with the host-based batch-normalization" << std::endl;
-        std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)" << std::endl;
+        std::cout << "Arg1: data type (0: fp16, 1: fp32, 5: bp16, 6: fp64)" << std::endl;
         std::cout << "Arg2: 1/0 to indicate whether to update the moving average and variance (0=no, 1=yes)" << std::endl;
         std::cout << "Arg3: 1/0 to indicate whether to save the calculated mean and invVariance (0=no, 1=yes)" << std::endl;
         std::cout << "Arg4: init method used for bnScale and bnBias (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)" << std::endl;

@@ -142,7 +142,6 @@ int profile_batchnorm_forward(int argc, char* argv[])
     using F16  = ck::half_t;
     using F32  = float;
     using BF16 = ck::bhalf_t;
-    using I8   = int8_t;
     using F64  = double;

     if(arg_parser.data_type == 0)

@@ -179,23 +178,6 @@ int profile_batchnorm_forward(int argc, char* argv[])
                                                                    averageFactor);
         };
     }
-    else if(arg_parser.data_type == 3)
-    {
-        if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3)
-        {
-            profile_batchnorm_forward_impl<I8, I8, F32, I8, I8, F32, 4, 3>(
-                arg_parser.do_verification,
-                arg_parser.init_method,
-                arg_parser.do_dumpout,
-                arg_parser.time_kernel,
-                arg_parser.inLengths,
-                arg_parser.reduceDims,
-                arg_parser.updateMovingAverage,
-                arg_parser.saveMeanAndInvVariance,
-                epsilon,
-                averageFactor);
-        };
-    }
     else if(arg_parser.data_type == 5)
     {
         if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3)
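After this removal, the profiler's data_type dispatch covers four paths. The summary below is inferred from the help text above and the type tuples in the test file that follows; treat it as a sketch of the dispatch, not a quote of the source:

// Inferred dispatch table (sketch):
// data_type 0: F16  -> profile_batchnorm_forward_impl<F16, F16, F32, F16, F16, F32, 4, 3>
// data_type 1: F32  -> profile_batchnorm_forward_impl<F32, F32, F32, F32, F32, F32, 4, 3>
// data_type 5: BF16 -> profile_batchnorm_forward_impl<BF16, BF16, F32, BF16, BF16, F32, 4, 3>
// data_type 6: F64  -> profile_batchnorm_forward_impl<F64, F64, F64, F64, F64, F64, 4, 3>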
test/batchnorm_fwd/batchnorm_fwd_rank_4.cpp  (view file @ 4be21df8)

@@ -90,7 +90,6 @@ class TestBatchNormFwdRank4 : public ::testing::Test
 using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, F16, F16, F32>,
                                      std::tuple<F32, F32, F32, F32, F32, F32>,
                                      std::tuple<BF16, BF16, F32, BF16, BF16, F32>,
-                                     std::tuple<I8, I8, F32, I8, I8, F32>,
                                      std::tuple<F64, F64, F64, F64, F64, F64>>;

 TYPED_TEST_SUITE(TestBatchNormFwdRank4, KernelTypes);
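For readers unfamiliar with GoogleTest typed suites: each tuple above instantiates the fixture once per type combination, so dropping the I8 tuple removes the int8 test cases without touching the others. A minimal sketch of the consuming side (the fixture body is assumed; only the names visible in the diff, plus standard gtest macros, are real):

// Hedged sketch of the fixture shape; not a quote of the test file.
template <typename Tuple>
class TestBatchNormFwdRank4 : public ::testing::Test
{
    protected:
    // Hypothetical unpacking, one slot per data type in the tuple:
    using XDataType = std::tuple_element_t<0, Tuple>;
    // ... remaining slots: Y, Acc, Scale, Bias, MeanVar ...
};

TYPED_TEST_SUITE(TestBatchNormFwdRank4, KernelTypes);

TYPED_TEST(TestBatchNormFwdRank4, RunsAllInstances)
{
    // Body assumed: run the rank-4, 3-reduce-dim device instances for
    // TypeParam's types and compare against a host reference.
}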