Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
05ee41c3
Commit
05ee41c3
authored
Nov 30, 2022
by
Rosty Geyyer
Browse files
Merge branch 'develop' into lwpck-471
parents
37116c98
ad541ad6
Changes
436
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1128 additions
and
58 deletions
+1128
-58
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
...softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
...on_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
+40
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
...u/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
...u/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
...u/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
...u/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
...u/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
...u/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
...u/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
+27
-0
library/src/utility/convolution_parameter.cpp
library/src/utility/convolution_parameter.cpp
+4
-8
profiler/CMakeLists.txt
profiler/CMakeLists.txt
+9
-4
profiler/include/profile_batched_gemm_add_relu_gemm_add_impl.hpp
...r/include/profile_batched_gemm_add_relu_gemm_add_impl.hpp
+6
-6
profiler/include/profile_batched_gemm_gemm_impl.hpp
profiler/include/profile_batched_gemm_gemm_impl.hpp
+6
-6
profiler/include/profile_batched_gemm_impl.hpp
profiler/include/profile_batched_gemm_impl.hpp
+6
-6
profiler/include/profile_batched_gemm_reduce_impl.hpp
profiler/include/profile_batched_gemm_reduce_impl.hpp
+12
-18
profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp
profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp
+6
-6
profiler/include/profile_batched_gemm_softmax_gemm_permute_impl.hpp
...nclude/profile_batched_gemm_softmax_gemm_permute_impl.hpp
+20
-2
profiler/include/profile_batchnorm_backward_impl.hpp
profiler/include/profile_batchnorm_backward_impl.hpp
+390
-0
profiler/include/profile_batchnorm_forward_impl.hpp
profiler/include/profile_batchnorm_forward_impl.hpp
+412
-0
profiler/include/profile_conv_bwd_data_impl.hpp
profiler/include/profile_conv_bwd_data_impl.hpp
+1
-2
No files found.
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp"
namespace ck::tensor_operation::device::instance {

// Rank used both in the element type of `instances` and in the instance list
// selected below.
static constexpr index_t RANK = 4;

/// @brief Adds the `device_softmax_f32_f32_instances<RANK, 4>` instance list to
///        `instances`.
///
/// A default-constructed `device_softmax_f32_f32_instances<RANK, 4>` is handed
/// to `add_device_operation_instances` together with `instances`.
///
/// @param instances Vector of `DeviceSoftmaxPtr<F32, F32, F32, PassThrough,
///                  PassThrough, RANK>` that is forwarded to
///                  `add_device_operation_instances`.
///
/// NOTE(review): per the function name the second template argument (4) is
/// presumably the number of reduced dimensions - confirm against
/// device_softmax_f32_f32_instance_type.hpp.
void add_device_softmax_f32_f32_rank4_reduce4_instances(
    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
{
    using InstanceList = device_softmax_f32_f32_instances<RANK, 4>;

    add_device_operation_instances(instances, InstanceList{});
}

} // namespace ck::tensor_operation::device::instance
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
namespace ck::tensor_operation::device::instance {

/// @brief Collects every rank-3 i8 softmax instance group into `instances`.
///
/// Delegates, in this order, to the reduce1, reduce2 and reduce3 registration
/// functions declared in the headers included by this file.
///
/// @param instances Vector of `DeviceSoftmaxPtr<I8, F32, I8, PassThrough,
///                  PassThrough, 3>` passed on to each registration function.
void add_device_softmax_i8_i8_rank3_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances)
{
    // The call order is kept as-is; it determines the order in which the
    // callees see `instances`.
    add_device_softmax_i8_i8_rank3_reduce1_instances(instances);
    add_device_softmax_i8_i8_rank3_reduce2_instances(instances);
    add_device_softmax_i8_i8_rank3_reduce3_instances(instances);
}

/// @brief Collects every rank-4 i8 softmax instance group into `instances`.
///
/// Delegates, in this order, to the reduce1, reduce2, reduce3 and reduce4
/// registration functions declared in the headers included by this file.
///
/// @param instances Vector of `DeviceSoftmaxPtr<I8, F32, I8, PassThrough,
///                  PassThrough, 4>` passed on to each registration function.
void add_device_softmax_i8_i8_rank4_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances)
{
    // The call order is kept as-is; it determines the order in which the
    // callees see `instances`.
    add_device_softmax_i8_i8_rank4_reduce1_instances(instances);
    add_device_softmax_i8_i8_rank4_reduce2_instances(instances);
    add_device_softmax_i8_i8_rank4_reduce3_instances(instances);
    add_device_softmax_i8_i8_rank4_reduce4_instances(instances);
}

} // namespace ck::tensor_operation::device::instance
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace ck::tensor_operation::device::instance {

// Rank used both in the element type of `instances` and in the instance list
// selected below.
static constexpr index_t RANK = 3;

/// @brief Adds the `device_softmax_i8_i8_instances<RANK, 1>` instance list to
///        `instances`.
///
/// A default-constructed `device_softmax_i8_i8_instances<RANK, 1>` is handed
/// to `add_device_operation_instances` together with `instances`.
///
/// @param instances Vector of `DeviceSoftmaxPtr<I8, F32, I8, PassThrough,
///                  PassThrough, RANK>` that is forwarded to
///                  `add_device_operation_instances`.
///
/// NOTE(review): per the function name the second template argument (1) is
/// presumably the number of reduced dimensions - confirm against
/// device_softmax_i8_i8_instance_type.hpp.
void add_device_softmax_i8_i8_rank3_reduce1_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
{
    using InstanceList = device_softmax_i8_i8_instances<RANK, 1>;

    add_device_operation_instances(instances, InstanceList{});
}

} // namespace ck::tensor_operation::device::instance
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace ck::tensor_operation::device::instance {

// Rank used both in the element type of `instances` and in the instance list
// selected below.
static constexpr index_t RANK = 3;

/// @brief Adds the `device_softmax_i8_i8_instances<RANK, 2>` instance list to
///        `instances`.
///
/// A default-constructed `device_softmax_i8_i8_instances<RANK, 2>` is handed
/// to `add_device_operation_instances` together with `instances`.
///
/// @param instances Vector of `DeviceSoftmaxPtr<I8, F32, I8, PassThrough,
///                  PassThrough, RANK>` that is forwarded to
///                  `add_device_operation_instances`.
///
/// NOTE(review): per the function name the second template argument (2) is
/// presumably the number of reduced dimensions - confirm against
/// device_softmax_i8_i8_instance_type.hpp.
void add_device_softmax_i8_i8_rank3_reduce2_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
{
    using InstanceList = device_softmax_i8_i8_instances<RANK, 2>;

    add_device_operation_instances(instances, InstanceList{});
}

} // namespace ck::tensor_operation::device::instance
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace ck::tensor_operation::device::instance {

// Rank used both in the element type of `instances` and in the instance list
// selected below.
static constexpr index_t RANK = 3;

/// @brief Adds the `device_softmax_i8_i8_instances<RANK, 3>` instance list to
///        `instances`.
///
/// A default-constructed `device_softmax_i8_i8_instances<RANK, 3>` is handed
/// to `add_device_operation_instances` together with `instances`.
///
/// @param instances Vector of `DeviceSoftmaxPtr<I8, F32, I8, PassThrough,
///                  PassThrough, RANK>` that is forwarded to
///                  `add_device_operation_instances`.
///
/// NOTE(review): per the function name the second template argument (3) is
/// presumably the number of reduced dimensions - confirm against
/// device_softmax_i8_i8_instance_type.hpp.
void add_device_softmax_i8_i8_rank3_reduce3_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
{
    using InstanceList = device_softmax_i8_i8_instances<RANK, 3>;

    add_device_operation_instances(instances, InstanceList{});
}

} // namespace ck::tensor_operation::device::instance
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace ck::tensor_operation::device::instance {

// Rank used both in the element type of `instances` and in the instance list
// selected below.
static constexpr index_t RANK = 4;

/// @brief Adds the `device_softmax_i8_i8_instances<RANK, 1>` instance list to
///        `instances`.
///
/// A default-constructed `device_softmax_i8_i8_instances<RANK, 1>` is handed
/// to `add_device_operation_instances` together with `instances`.
///
/// @param instances Vector of `DeviceSoftmaxPtr<I8, F32, I8, PassThrough,
///                  PassThrough, RANK>` that is forwarded to
///                  `add_device_operation_instances`.
///
/// NOTE(review): per the function name the second template argument (1) is
/// presumably the number of reduced dimensions - confirm against
/// device_softmax_i8_i8_instance_type.hpp.
void add_device_softmax_i8_i8_rank4_reduce1_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
{
    using InstanceList = device_softmax_i8_i8_instances<RANK, 1>;

    add_device_operation_instances(instances, InstanceList{});
}

} // namespace ck::tensor_operation::device::instance
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace ck::tensor_operation::device::instance {

// Rank used both in the element type of `instances` and in the instance list
// selected below.
static constexpr index_t RANK = 4;

/// @brief Adds the `device_softmax_i8_i8_instances<RANK, 2>` instance list to
///        `instances`.
///
/// A default-constructed `device_softmax_i8_i8_instances<RANK, 2>` is handed
/// to `add_device_operation_instances` together with `instances`.
///
/// @param instances Vector of `DeviceSoftmaxPtr<I8, F32, I8, PassThrough,
///                  PassThrough, RANK>` that is forwarded to
///                  `add_device_operation_instances`.
///
/// NOTE(review): per the function name the second template argument (2) is
/// presumably the number of reduced dimensions - confirm against
/// device_softmax_i8_i8_instance_type.hpp.
void add_device_softmax_i8_i8_rank4_reduce2_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
{
    using InstanceList = device_softmax_i8_i8_instances<RANK, 2>;

    add_device_operation_instances(instances, InstanceList{});
}

} // namespace ck::tensor_operation::device::instance
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace ck::tensor_operation::device::instance {

// Rank used both in the element type of `instances` and in the instance list
// selected below.
static constexpr index_t RANK = 4;

/// @brief Adds the `device_softmax_i8_i8_instances<RANK, 3>` instance list to
///        `instances`.
///
/// A default-constructed `device_softmax_i8_i8_instances<RANK, 3>` is handed
/// to `add_device_operation_instances` together with `instances`.
///
/// @param instances Vector of `DeviceSoftmaxPtr<I8, F32, I8, PassThrough,
///                  PassThrough, RANK>` that is forwarded to
///                  `add_device_operation_instances`.
///
/// NOTE(review): per the function name the second template argument (3) is
/// presumably the number of reduced dimensions - confirm against
/// device_softmax_i8_i8_instance_type.hpp.
void add_device_softmax_i8_i8_rank4_reduce3_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
{
    using InstanceList = device_softmax_i8_i8_instances<RANK, 3>;

    add_device_operation_instances(instances, InstanceList{});
}

} // namespace ck::tensor_operation::device::instance
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace ck::tensor_operation::device::instance {

// Rank used both in the element type of `instances` and in the instance list
// selected below.
static constexpr index_t RANK = 4;

/// @brief Adds the `device_softmax_i8_i8_instances<RANK, 4>` instance list to
///        `instances`.
///
/// A default-constructed `device_softmax_i8_i8_instances<RANK, 4>` is handed
/// to `add_device_operation_instances` together with `instances`.
///
/// @param instances Vector of `DeviceSoftmaxPtr<I8, F32, I8, PassThrough,
///                  PassThrough, RANK>` that is forwarded to
///                  `add_device_operation_instances`.
///
/// NOTE(review): per the function name the second template argument (4) is
/// presumably the number of reduced dimensions - confirm against
/// device_softmax_i8_i8_instance_type.hpp.
void add_device_softmax_i8_i8_rank4_reduce4_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
{
    using InstanceList = device_softmax_i8_i8_instances<RANK, 4>;

    add_device_operation_instances(instances, InstanceList{});
}

} // namespace ck::tensor_operation::device::instance
library/src/utility/convolution_parameter.cpp
View file @
05ee41c3
...
...
@@ -72,14 +72,10 @@ std::size_t ConvParam::GetFlops() const
{
// 2 * G * N * K * C * <output spatial lengths product> * <filter spatial lengths product>
return
static_cast
<
std
::
size_t
>
(
2
)
*
G_
*
N_
*
K_
*
C_
*
std
::
accumulate
(
std
::
begin
(
output_spatial_lengths_
),
std
::
begin
(
output_spatial_lengths_
)
+
num_dim_spatial_
,
static_cast
<
std
::
size_t
>
(
1
),
std
::
multiplies
<
std
::
size_t
>
())
*
std
::
accumulate
(
std
::
begin
(
filter_spatial_lengths_
),
std
::
begin
(
filter_spatial_lengths_
)
+
num_dim_spatial_
,
static_cast
<
std
::
size_t
>
(
1
),
std
::
multiplies
<
std
::
size_t
>
());
ck
::
accumulate_n
<
std
::
size_t
>
(
std
::
begin
(
output_spatial_lengths_
),
num_dim_spatial_
,
1
,
std
::
multiplies
<>
())
*
ck
::
accumulate_n
<
std
::
size_t
>
(
std
::
begin
(
filter_spatial_lengths_
),
num_dim_spatial_
,
1
,
std
::
multiplies
<>
());
}
std
::
string
get_conv_param_parser_helper_msg
()
...
...
profiler/CMakeLists.txt
View file @
05ee41c3
...
...
@@ -20,12 +20,14 @@ set(PROFILER_SOURCE
src/profile_conv_fwd_bias_relu.cpp
src/profile_conv_fwd_bias_relu_add.cpp
src/profile_conv_bwd_data.cpp
src/profile_conv_bwd_weight.cpp
src/profile_grouped_conv_fwd.cpp
src/profile_grouped_conv_bwd_weight.cpp
src/profile_reduce.cpp
src/profile_groupnorm.cpp
src/profile_layernorm.cpp
src/profile_softmax.cpp
src/profile_batchnorm_fwd.cpp
src/profile_batchnorm_bwd.cpp
)
add_executable
(
ckProfiler
${
PROFILER_SOURCE
}
)
...
...
@@ -49,11 +51,14 @@ target_link_libraries(ckProfiler PRIVATE device_grouped_conv3d_fwd_instance)
target_link_libraries
(
ckProfiler PRIVATE device_conv1d_bwd_data_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_conv2d_bwd_data_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_conv3d_bwd_data_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_conv1d_bwd_weight_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_conv2d_bwd_weight_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_conv3d_bwd_weight_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_
grouped_
conv1d_bwd_weight_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_
grouped_
conv2d_bwd_weight_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_
grouped_
conv3d_bwd_weight_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_normalization_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_softmax_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_reduce_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_batchnorm_instance
)
rocm_install
(
TARGETS ckProfiler COMPONENT profiler
)
profiler/include/profile_batched_gemm_add_relu_gemm_add_impl.hpp
View file @
05ee41c3
...
...
@@ -14,6 +14,7 @@
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
namespace
ck
{
...
...
@@ -111,15 +112,15 @@ bool profile_batched_gemm_add_relu_gemm_add_impl(bool do_verification,
std
::
size_t
stride
,
std
::
size_t
batch_stride
,
auto
layout
)
{
using
namespace
ck
::
literals
;
if
(
std
::
is_same
<
decltype
(
layout
),
Row
>::
value
)
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
batch_count
,
row
,
col
}),
std
::
vector
<
std
::
size_t
>
({
batch_stride
,
stride
,
1
}));
return
HostTensorDescriptor
({
batch_count
,
row
,
col
},
{
batch_stride
,
stride
,
1
_uz
});
}
else
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
batch_count
,
row
,
col
}),
std
::
vector
<
std
::
size_t
>
({
batch_stride
,
1
,
stride
}));
return
HostTensorDescriptor
({
batch_count
,
row
,
col
},
{
batch_stride
,
1
_uz
,
stride
});
}
};
...
...
@@ -330,8 +331,7 @@ bool profile_batched_gemm_add_relu_gemm_add_impl(bool do_verification,
{
e1_g_m_o_device_buf
.
FromDevice
(
e1_g_m_o_device_result
.
mData
.
data
());
pass
=
pass
&
ck
::
utils
::
check_err
(
e1_g_m_o_device_result
.
mData
,
e1_g_m_o_host_result
.
mData
);
pass
=
pass
&
ck
::
utils
::
check_err
(
e1_g_m_o_device_result
,
e1_g_m_o_host_result
);
if
(
do_log
)
{
...
...
profiler/include/profile_batched_gemm_gemm_impl.hpp
View file @
05ee41c3
...
...
@@ -16,6 +16,7 @@
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
namespace
ck
{
...
...
@@ -105,15 +106,15 @@ bool profile_batched_gemm_gemm_impl(bool do_verification,
std
::
size_t
stride
,
std
::
size_t
batch_stride
,
auto
layout
)
{
using
namespace
ck
::
literals
;
if
(
std
::
is_same
<
decltype
(
layout
),
Row
>::
value
)
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
batch_count
,
row
,
col
}),
std
::
vector
<
std
::
size_t
>
({
batch_stride
,
stride
,
1
}));
return
HostTensorDescriptor
({
batch_count
,
row
,
col
},
{
batch_stride
,
stride
,
1
_uz
});
}
else
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
batch_count
,
row
,
col
}),
std
::
vector
<
std
::
size_t
>
({
batch_stride
,
1
,
stride
}));
return
HostTensorDescriptor
({
batch_count
,
row
,
col
},
{
batch_stride
,
1
_uz
,
stride
});
}
};
...
...
@@ -283,8 +284,7 @@ bool profile_batched_gemm_gemm_impl(bool do_verification,
{
c_g_m_o_device_buf
.
FromDevice
(
c_g_m_o_device_result
.
mData
.
data
());
pass
=
pass
&
ck
::
utils
::
check_err
(
c_g_m_o_device_result
.
mData
,
c_g_m_o_host_result
.
mData
);
pass
=
pass
&
ck
::
utils
::
check_err
(
c_g_m_o_device_result
,
c_g_m_o_host_result
);
if
(
do_log
)
{
...
...
profiler/include/profile_batched_gemm_impl.hpp
View file @
05ee41c3
...
...
@@ -16,6 +16,7 @@
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
namespace
ck
{
...
...
@@ -50,15 +51,15 @@ bool profile_batched_gemm_impl(int do_verification,
std
::
size_t
stride
,
std
::
size_t
batch_stride
,
auto
layout
)
{
using
namespace
ck
::
literals
;
if
(
is_same
<
decltype
(
layout
),
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
batch_count
,
row
,
col
}),
std
::
vector
<
std
::
size_t
>
({
batch_stride
,
stride
,
1
}));
return
HostTensorDescriptor
({
batch_count
,
row
,
col
},
{
batch_stride
,
stride
,
1
_uz
});
}
else
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
batch_count
,
row
,
col
}),
std
::
vector
<
std
::
size_t
>
({
batch_stride
,
1
,
stride
}));
return
HostTensorDescriptor
({
batch_count
,
row
,
col
},
{
batch_stride
,
1
_uz
,
stride
});
}
};
...
...
@@ -202,8 +203,7 @@ bool profile_batched_gemm_impl(int do_verification,
{
c_device_buf
.
FromDevice
(
c_g_m_n_device_result
.
mData
.
data
());
pass
=
pass
&
ck
::
utils
::
check_err
(
c_g_m_n_device_result
.
mData
,
c_g_m_n_host_result
.
mData
);
pass
=
pass
&
ck
::
utils
::
check_err
(
c_g_m_n_device_result
,
c_g_m_n_host_result
);
if
(
do_log
)
{
...
...
profiler/include/profile_batched_gemm_reduce_impl.hpp
View file @
05ee41c3
...
...
@@ -14,6 +14,7 @@
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
namespace
ck
{
...
...
@@ -78,15 +79,15 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
using
namespace
ck
::
literals
;
if
(
std
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
batch_count
,
row
,
col
}),
std
::
vector
<
std
::
size_t
>
({
row
*
stride
,
stride
,
1
}));
return
HostTensorDescriptor
({
batch_count
,
row
,
col
},
{
row
*
stride
,
stride
,
1
_uz
});
}
else
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
batch_count
,
row
,
col
}),
std
::
vector
<
std
::
size_t
>
({
col
*
stride
,
1
,
stride
}));
return
HostTensorDescriptor
({
batch_count
,
row
,
col
},
{
col
*
stride
,
1
_uz
,
stride
});
}
};
...
...
@@ -95,17 +96,13 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
Tensor
<
CDataType
>
c_g_m_n_host_result
(
f_host_tensor_descriptor
(
BatchCount
,
M
,
N
,
StrideC
,
CLayout
{}));
Tensor
<
ReduceDataType
>
d0_g_m_host_result
(
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
(
{
static_cast
<
std
::
size_t
>
(
BatchCount
),
static_cast
<
std
::
size_t
>
(
M
)})));
Tensor
<
ReduceDataType
>
d1_g_m_host_result
(
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
(
{
static_cast
<
std
::
size_t
>
(
BatchCount
),
static_cast
<
std
::
size_t
>
(
M
)})));
Tensor
<
ReduceDataType
>
d0_g_m_host_result
({
BatchCount
,
M
});
Tensor
<
ReduceDataType
>
d1_g_m_host_result
({
BatchCount
,
M
});
Tensor
<
CDataType
>
c_g_m_n_device_result
(
f_host_tensor_descriptor
(
BatchCount
,
M
,
N
,
StrideC
,
CLayout
{}));
Tensor
<
ReduceDataType
>
d0_g_m_device_result
(
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
(
{
static_cast
<
std
::
size_t
>
(
BatchCount
),
static_cast
<
std
::
size_t
>
(
M
)})));
Tensor
<
ReduceDataType
>
d1_g_m_device_result
(
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
(
{
static_cast
<
std
::
size_t
>
(
BatchCount
),
static_cast
<
std
::
size_t
>
(
M
)})));
Tensor
<
ReduceDataType
>
d0_g_m_device_result
({
BatchCount
,
M
});
Tensor
<
ReduceDataType
>
d1_g_m_device_result
({
BatchCount
,
M
});
std
::
cout
<<
"a_g_m_k: "
<<
a_g_m_k
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"b_g_k_n: "
<<
b_g_k_n
.
mDesc
<<
std
::
endl
;
...
...
@@ -319,12 +316,9 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
reduce0_device_buf
.
FromDevice
(
d0_g_m_device_result
.
mData
.
data
());
reduce1_device_buf
.
FromDevice
(
d1_g_m_device_result
.
mData
.
data
());
bool
c_error
=
ck
::
utils
::
check_err
(
c_g_m_n_device_result
.
mData
,
c_g_m_n_host_result
.
mData
);
bool
d0_error
=
ck
::
utils
::
check_err
(
d0_g_m_device_result
.
mData
,
d0_g_m_host_result
.
mData
);
bool
d1_error
=
ck
::
utils
::
check_err
(
d1_g_m_device_result
.
mData
,
d1_g_m_host_result
.
mData
);
bool
c_error
=
ck
::
utils
::
check_err
(
c_g_m_n_device_result
,
c_g_m_n_host_result
);
bool
d0_error
=
ck
::
utils
::
check_err
(
d0_g_m_device_result
,
d0_g_m_host_result
);
bool
d1_error
=
ck
::
utils
::
check_err
(
d1_g_m_device_result
,
d1_g_m_host_result
);
pass
=
pass
&&
(
c_error
==
true
);
pass
=
pass
&&
(
d0_error
==
true
);
...
...
profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp
View file @
05ee41c3
...
...
@@ -16,6 +16,7 @@
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
...
...
@@ -113,15 +114,15 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
std
::
size_t
stride
,
std
::
size_t
batch_stride
,
auto
layout
)
{
using
namespace
ck
::
literals
;
if
(
std
::
is_same
<
decltype
(
layout
),
Row
>::
value
)
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
batch_count
,
row
,
col
}),
std
::
vector
<
std
::
size_t
>
({
batch_stride
,
stride
,
1
}));
return
HostTensorDescriptor
({
batch_count
,
row
,
col
},
{
batch_stride
,
stride
,
1
_uz
});
}
else
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
batch_count
,
row
,
col
}),
std
::
vector
<
std
::
size_t
>
({
batch_stride
,
1
,
stride
}));
return
HostTensorDescriptor
({
batch_count
,
row
,
col
},
{
batch_stride
,
1
_uz
,
stride
});
}
};
...
...
@@ -307,8 +308,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
{
c_g_m_o_device_buf
.
FromDevice
(
c_g_m_o_device_result
.
mData
.
data
());
pass
=
pass
&
ck
::
utils
::
check_err
(
c_g_m_o_device_result
.
mData
,
c_g_m_o_host_result
.
mData
);
pass
=
pass
&
ck
::
utils
::
check_err
(
c_g_m_o_device_result
,
c_g_m_o_host_result
);
if
(
do_log
)
{
...
...
profiler/include/profile_batched_gemm_softmax_gemm_permute_impl.hpp
View file @
05ee41c3
...
...
@@ -16,6 +16,7 @@
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
...
...
@@ -308,8 +309,25 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
{
c_device_buf
.
FromDevice
(
c_gs_ms_os_device_result
.
mData
.
data
());
pass
=
pass
&
ck
::
utils
::
check_err
(
c_gs_ms_os_device_result
.
mData
,
c_gs_ms_os_host_result
.
mData
);
// default absolute error and relative error is 0.001
double
rtol
=
1e-3
;
double
atol
=
1e-3
;
// when BF16 is taken, set absolute error and relative error to 0.01
if
(
std
::
is_same_v
<
ADataType
,
ck
::
bhalf_t
>
&&
std
::
is_same_v
<
B0DataType
,
ck
::
bhalf_t
>
&&
std
::
is_same_v
<
B1DataType
,
ck
::
bhalf_t
>
&&
std
::
is_same_v
<
CDataType
,
ck
::
bhalf_t
>
)
{
rtol
=
1e-2
;
atol
=
1e-2
;
}
pass
=
pass
&
ck
::
utils
::
check_err
(
c_gs_ms_os_device_result
,
c_gs_ms_os_host_result
,
"Error: Incorrect results!"
,
rtol
,
atol
);
if
(
do_log
)
{
...
...
profiler/include/profile_batchnorm_backward_impl.hpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include <stdexcept>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp"
namespace
ck
{
namespace
profiler
{
template
<
typename
XDataType
,
typename
DxDataType
,
typename
DyDataType
,
typename
AccDataType
,
typename
ScaleDataType
,
typename
DscaleDbiasDataType
,
typename
MeanVarDataType
,
index_t
Rank
,
index_t
NumBatchNormReduceDim
>
bool
profile_batchnorm_backward_impl
(
bool
do_verification
,
int
init_method
,
bool
do_dumpout
,
bool
time_kernel
,
const
std
::
vector
<
size_t
>
inOutLengths
,
const
std
::
vector
<
int
>
reduceDims
,
bool
haveSavedMeanInvVar
,
double
epsilon
)
{
if
(
inOutLengths
.
size
()
!=
Rank
||
reduceDims
.
size
()
!=
NumBatchNormReduceDim
)
{
throw
std
::
runtime_error
(
"Invalid tensor lengths or number of reduce dimensions!"
);
};
std
::
vector
<
size_t
>
scaleBiasMeanVarLengths
;
// used for calculating the effective transferred bytes by each operation
size_t
total_length
;
size_t
invariant_length
=
1
;
total_length
=
std
::
accumulate
(
inOutLengths
.
begin
(),
inOutLengths
.
end
(),
1
,
std
::
multiplies
<
size_t
>
{});
if
(
std
::
any_of
(
reduceDims
.
begin
(),
reduceDims
.
end
(),
[](
int
d
)
{
return
d
<
0
||
d
>=
Rank
;
}))
throw
std
::
runtime_error
(
"Invalid reduce dimensions!"
);
for
(
int
dim
=
0
;
dim
<
Rank
;
dim
++
)
{
if
(
std
::
none_of
(
reduceDims
.
begin
(),
reduceDims
.
end
(),
[
&
](
int
d
)
{
return
dim
==
d
;
}))
{
scaleBiasMeanVarLengths
.
push_back
(
inOutLengths
[
dim
]);
invariant_length
*=
inOutLengths
[
dim
];
};
}
// input data of the batchnorm backward algorithm
Tensor
<
XDataType
>
x
(
inOutLengths
);
Tensor
<
DyDataType
>
dy
(
inOutLengths
);
Tensor
<
ScaleDataType
>
bnScale
(
scaleBiasMeanVarLengths
);
Tensor
<
MeanVarDataType
>
savedMean
(
scaleBiasMeanVarLengths
);
Tensor
<
MeanVarDataType
>
savedInvVar
(
scaleBiasMeanVarLengths
);
// savedVariance is only used for initializing savedInvVar
Tensor
<
MeanVarDataType
>
savedVariance
(
scaleBiasMeanVarLengths
);
// output data of the batchnorm backward algorithm
Tensor
<
DxDataType
>
dx_ref
(
inOutLengths
);
Tensor
<
DxDataType
>
dx
(
inOutLengths
);
Tensor
<
DscaleDbiasDataType
>
dscale
(
scaleBiasMeanVarLengths
);
Tensor
<
DscaleDbiasDataType
>
dbias
(
scaleBiasMeanVarLengths
);
Tensor
<
DscaleDbiasDataType
>
dscale_ref
(
scaleBiasMeanVarLengths
);
Tensor
<
DscaleDbiasDataType
>
dbias_ref
(
scaleBiasMeanVarLengths
);
auto
inOutStrides
=
x
.
mDesc
.
GetStrides
();
auto
scaleBiasMeanVarStrides
=
bnScale
.
mDesc
.
GetStrides
();
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
();
if
(
haveSavedMeanInvVar
)
{
const
float
x_mean
=
0.0
f
;
const
float
x_stddev
=
1.0
f
;
const
float
noise_stddev
=
0.0001
f
;
// input data in normal distribution
x
.
GenerateTensorValue
(
GeneratorTensor_4
<
XDataType
>
{
x_mean
,
x_stddev
},
num_thread
);
// initialize the savedMean to be values with tiny variation to the mean of the x values
savedMean
.
GenerateTensorValue
(
GeneratorTensor_4
<
MeanVarDataType
>
{
x_mean
,
noise_stddev
},
num_thread
);
// initialize the variance to be values with tiny variation to the variance of the x values
savedVariance
.
GenerateTensorValue
(
GeneratorTensor_4
<
MeanVarDataType
>
{
x_stddev
*
x_stddev
,
noise_stddev
},
num_thread
);
auto
it_src
=
savedVariance
.
mData
.
begin
();
auto
it_dst
=
savedInvVar
.
mData
.
begin
();
float
tmp_epsilon
=
std
::
numeric_limits
<
float
>::
epsilon
();
while
(
it_src
!=
savedVariance
.
mData
.
end
())
{
*
it_dst
=
type_convert
<
AccDataType
>
(
1.0
f
/
std
::
sqrtf
(
type_convert
<
float
>
(
*
it_src
)
+
tmp_epsilon
));
it_src
++
;
it_dst
++
;
};
}
else
{
const
float
x_mean
=
0.0
f
;
const
float
x_stddev
=
1.0
f
;
// input data in normal distribution
x
.
GenerateTensorValue
(
GeneratorTensor_4
<
XDataType
>
{
x_mean
,
x_stddev
},
num_thread
);
};
if
(
do_verification
)
{
switch
(
init_method
)
{
case
0
:
dy
.
GenerateTensorValue
(
GeneratorTensor_0
<
DyDataType
>
{},
num_thread
);
bnScale
.
GenerateTensorValue
(
GeneratorTensor_0
<
ScaleDataType
>
{},
num_thread
);
break
;
case
1
:
dy
.
GenerateTensorValue
(
GeneratorTensor_1
<
DyDataType
>
{
1
},
num_thread
);
bnScale
.
GenerateTensorValue
(
GeneratorTensor_1
<
ScaleDataType
>
{
1
},
num_thread
);
break
;
case
2
:
dy
.
GenerateTensorValue
(
GeneratorTensor_2
<
DyDataType
>
{
-
2
,
2
},
num_thread
);
bnScale
.
GenerateTensorValue
(
GeneratorTensor_2
<
ScaleDataType
>
{
-
5
,
5
},
num_thread
);
break
;
default:
dy
.
GenerateTensorValue
(
GeneratorTensor_3
<
DyDataType
>
{
-
0.2
f
,
0.2
f
},
num_thread
);
bnScale
.
GenerateTensorValue
(
GeneratorTensor_3
<
ScaleDataType
>
{
-
0.5
f
,
0.5
f
},
num_thread
);
}
};
// input data of the batchnorm backward algorithm
DeviceMem
x_dev
(
sizeof
(
XDataType
)
*
x
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
dy_dev
(
sizeof
(
DyDataType
)
*
dy
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
bnScale_dev
(
sizeof
(
ScaleDataType
)
*
bnScale
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
savedMean_dev
(
sizeof
(
MeanVarDataType
)
*
savedMean
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
savedInvVar_dev
(
sizeof
(
MeanVarDataType
)
*
savedInvVar
.
mDesc
.
GetElementSpaceSize
());
// output data of the batchnorm backward algorithm
DeviceMem
dx_dev
(
sizeof
(
DxDataType
)
*
dx
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
dscale_dev
(
sizeof
(
DscaleDbiasDataType
)
*
dscale
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
dbias_dev
(
sizeof
(
DscaleDbiasDataType
)
*
dbias
.
mDesc
.
GetElementSpaceSize
());
x_dev
.
ToDevice
(
x
.
mData
.
data
());
dy_dev
.
ToDevice
(
dy
.
mData
.
data
());
bnScale_dev
.
ToDevice
(
bnScale
.
mData
.
data
());
if
(
haveSavedMeanInvVar
)
{
savedMean_dev
.
ToDevice
(
savedMean
.
mData
.
data
());
savedInvVar_dev
.
ToDevice
(
savedInvVar
.
mData
.
data
());
};
std
::
array
<
index_t
,
Rank
>
arrInOutLengths
;
std
::
array
<
index_t
,
Rank
>
arrInOutStrides
;
std
::
array
<
index_t
,
Rank
-
NumBatchNormReduceDim
>
arrScaleBiasMeanVarLengths
;
std
::
array
<
index_t
,
Rank
-
NumBatchNormReduceDim
>
arrScaleBiasMeanVarStrides
;
std
::
array
<
int
,
NumBatchNormReduceDim
>
arrReduceDims
;
std
::
copy
(
inOutLengths
.
begin
(),
inOutLengths
.
end
(),
arrInOutLengths
.
begin
());
std
::
copy
(
inOutStrides
.
begin
(),
inOutStrides
.
end
(),
arrInOutStrides
.
begin
());
std
::
copy
(
scaleBiasMeanVarLengths
.
begin
(),
scaleBiasMeanVarLengths
.
end
(),
arrScaleBiasMeanVarLengths
.
begin
());
std
::
copy
(
scaleBiasMeanVarStrides
.
begin
(),
scaleBiasMeanVarStrides
.
end
(),
arrScaleBiasMeanVarStrides
.
begin
());
std
::
copy
(
reduceDims
.
begin
(),
reduceDims
.
end
(),
arrReduceDims
.
begin
());
using
PassThroughOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
// add device batchnorm-backward instances
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceBatchNormBwd
<
XDataType
,
DxDataType
,
DxDataType
,
AccDataType
,
ScaleDataType
,
DscaleDbiasDataType
,
MeanVarDataType
,
PassThroughOp
,
Rank
,
NumBatchNormReduceDim
>
;
// get device op instances
const
auto
instance_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
instance_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
std
::
string
best_instance_name
;
float
best_avg_time
=
std
::
numeric_limits
<
float
>::
max
();
float
best_gb_per_sec
=
0
;
if
(
do_verification
)
{
using
ReferenceBatchNormBwdInstance
=
ck
::
tensor_operation
::
host
::
ReferenceBatchNormBwd
<
XDataType
,
DxDataType
,
DyDataType
,
AccDataType
,
ScaleDataType
,
DscaleDbiasDataType
,
MeanVarDataType
,
PassThroughOp
,
Rank
,
NumBatchNormReduceDim
>
;
auto
batchNormBwd_ref
=
ReferenceBatchNormBwdInstance
{};
auto
argument_ptr_ref
=
batchNormBwd_ref
.
MakeArgumentPointer
(
arrInOutLengths
,
arrInOutStrides
,
arrInOutStrides
,
arrInOutStrides
,
arrReduceDims
,
arrScaleBiasMeanVarLengths
,
arrScaleBiasMeanVarStrides
,
arrScaleBiasMeanVarStrides
,
arrScaleBiasMeanVarStrides
,
x
.
mData
.
data
(),
dy
.
mData
.
data
(),
bnScale
.
mData
.
data
(),
haveSavedMeanInvVar
?
savedMean
.
mData
.
data
()
:
nullptr
,
haveSavedMeanInvVar
?
savedInvVar
.
mData
.
data
()
:
nullptr
,
epsilon
,
PassThroughOp
{},
dx_ref
.
mData
.
data
(),
dscale_ref
.
mData
.
data
(),
dbias_ref
.
mData
.
data
());
if
(
!
batchNormBwd_ref
.
IsSupportedArgument
(
argument_ptr_ref
.
get
()))
{
std
::
cout
<<
"The runtime parameters not supported by the reference instance, exiting!"
<<
std
::
endl
;
return
(
false
);
};
auto
invoker_ptr_ref
=
batchNormBwd_ref
.
MakeInvokerPointer
();
(
void
)
invoker_ptr_ref
->
Run
(
argument_ptr_ref
.
get
());
}
int
num_kernel
=
0
;
bool
pass
=
true
;
for
(
auto
&
inst_ptr
:
instance_ptrs
)
{
auto
argument_ptr
=
inst_ptr
->
MakeArgumentPointer
(
arrInOutLengths
,
arrInOutStrides
,
arrInOutStrides
,
arrInOutStrides
,
arrReduceDims
,
arrScaleBiasMeanVarLengths
,
arrScaleBiasMeanVarStrides
,
arrScaleBiasMeanVarStrides
,
arrScaleBiasMeanVarStrides
,
x_dev
.
GetDeviceBuffer
(),
dy_dev
.
GetDeviceBuffer
(),
bnScale_dev
.
GetDeviceBuffer
(),
haveSavedMeanInvVar
?
savedMean_dev
.
GetDeviceBuffer
()
:
nullptr
,
haveSavedMeanInvVar
?
savedInvVar_dev
.
GetDeviceBuffer
()
:
nullptr
,
epsilon
,
PassThroughOp
{},
dx_dev
.
GetDeviceBuffer
(),
dscale_dev
.
GetDeviceBuffer
(),
dbias_dev
.
GetDeviceBuffer
());
if
(
inst_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
num_kernel
++
;
}
else
{
if
(
time_kernel
)
{
std
::
cout
<<
inst_ptr
->
GetTypeString
()
<<
" skipped due to unsupported argument: "
<<
std
::
endl
;
}
continue
;
};
size_t
workspace_sz
=
inst_ptr
->
GetWorkSpaceSize
(
argument_ptr
.
get
());
DeviceMem
workspace_dev
(
workspace_sz
);
inst_ptr
->
SetWorkSpacePointer
(
argument_ptr
.
get
(),
workspace_dev
.
GetDeviceBuffer
());
auto
invoker_ptr
=
inst_ptr
->
MakeInvokerPointer
();
float
avg_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
size_t
num_bytes
=
0
;
// inputing of x, dy, scale, outputing of dx, dscale, dbias
num_bytes
+=
total_length
*
(
sizeof
(
XDataType
)
+
sizeof
(
DyDataType
)
+
sizeof
(
DxDataType
))
+
invariant_length
*
sizeof
(
DscaleDbiasDataType
)
*
2
;
// inputting of savedMean, savedInvVariance
if
(
haveSavedMeanInvVar
)
num_bytes
+=
invariant_length
*
sizeof
(
MeanVarDataType
)
*
2
;
float
gb_per_sec
=
num_bytes
/
1.E6
/
avg_time
;
if
(
time_kernel
)
std
::
cout
<<
"Perf: "
<<
avg_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
inst_ptr
->
GetTypeString
()
<<
std
::
endl
;
if
(
avg_time
<
best_avg_time
)
{
best_instance_name
=
inst_ptr
->
GetTypeString
();
best_avg_time
=
avg_time
;
best_gb_per_sec
=
gb_per_sec
;
}
if
(
do_verification
)
{
using
ck
::
utils
::
check_err
;
bool
single_pass
=
true
;
dx_dev
.
FromDevice
(
dx
.
mData
.
data
());
dscale_dev
.
FromDevice
(
dscale
.
data
());
dbias_dev
.
FromDevice
(
dbias
.
data
());
// clang-format off
single_pass
=
single_pass
&&
ck
::
utils
::
check_err
(
dx
.
mData
,
dx_ref
.
mData
,
"dx result:"
,
5e-4
,
5e-4
);
single_pass
=
single_pass
&&
ck
::
utils
::
check_err
(
dscale
.
mData
,
dscale_ref
.
mData
,
"dScale result:"
,
3e-3
,
3e-3
);
single_pass
=
single_pass
&&
ck
::
utils
::
check_err
(
dbias
.
mData
,
dbias_ref
.
mData
,
"dBias result:"
,
3e-3
,
3e-3
);
// clang-format on
pass
=
pass
&&
single_pass
;
};
if
(
do_dumpout
)
{
using
ck
::
host_common
::
dumpBufferToFile
;
// clang-format off
dumpBufferToFile
(
"dump_x.bin"
,
x
.
mData
.
data
(),
x
.
mDesc
.
GetElementSize
());
dumpBufferToFile
(
"dump_dy.bin"
,
dy
.
mData
.
data
(),
dy
.
mDesc
.
GetElementSize
());
dumpBufferToFile
(
"dump_dx.bin"
,
dx
.
mData
.
data
(),
dx
.
mDesc
.
GetElementSize
());
dumpBufferToFile
(
"dump_dx_ref.bin"
,
dx_ref
.
mData
.
data
(),
dx_ref
.
mDesc
.
GetElementSize
());
dumpBufferToFile
(
"dump_dscale.bin"
,
dscale
.
mData
.
data
(),
dscale
.
mDesc
.
GetElementSize
());
dumpBufferToFile
(
"dump_dscale_ref.bin"
,
dscale_ref
.
mData
.
data
(),
dscale_ref
.
mDesc
.
GetElementSize
());
// clang-format on
};
}
if
(
time_kernel
)
{
std
::
cout
<<
"best perf = "
<<
best_avg_time
<<
" ms, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_instance_name
<<
std
::
endl
;
}
if
(
num_kernel
==
0
)
{
std
::
cout
<<
"Error: No kernel is applicable"
<<
std
::
endl
;
return
false
;
}
return
pass
;
}
}
// namespace profiler
}
// namespace ck
profiler/include/profile_batchnorm_forward_impl.hpp
0 → 100644
View file @
05ee41c3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include <stdexcept>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp"
namespace
ck
{
namespace
profiler
{
/// @brief Runs every registered device batchnorm-forward instance for the given problem,
///        optionally verifies each one against the host reference and reports the fastest.
///
/// @tparam XDataType             element type of the input tensor x
/// @tparam YDataType             element type of the output tensor y
/// @tparam AccDataType           accumulation type forwarded to the device/reference operators
/// @tparam ScaleDataType         element type of bnScale
/// @tparam BiasDataType          element type of bnBias
/// @tparam MeanVarDataType       element type of the saved / running mean and variance tensors
/// @tparam Rank                  number of dimensions of x and y
/// @tparam NumBatchNormReduceDim number of reduced dimensions
///
/// @param do_verification        when non-zero, bnScale/bnBias are initialized according to
///                               init_method, the host reference is executed once and every
///                               device instance result is compared against it with check_err
/// @param init_method            selects the generator used for bnScale/bnBias:
///                               0 -> GeneratorTensor_0, 1 -> GeneratorTensor_1 (scale 1, bias 0),
///                               2 -> GeneratorTensor_2{-5, 5}, otherwise GeneratorTensor_3{-1, 1}
/// @param do_dumpout             when true, host buffers are written to dump_*.bin files after
///                               each instance run
/// @param time_kernel            forwarded to StreamConfig; also enables the performance printout
/// @param inOutLengths           lengths of x and y; must contain exactly Rank entries
/// @param reduceDims             indices of the reduced dimensions; must contain exactly
///                               NumBatchNormReduceDim distinct values in [0, Rank)
/// @param updateMovingAverage    when true, running mean/variance buffers are passed to the
///                               operators (nullptr otherwise)
/// @param saveMeanAndInvVariance when true, saved mean/inv-variance buffers are passed to the
///                               operators (nullptr otherwise)
/// @param averageFactor          forwarded unchanged to the operators
/// @param epsilon                forwarded unchanged to the operators
///
/// @return false when the reference rejects the arguments or no device instance supports them;
///         otherwise the conjunction of all per-instance verification results (true when
///         verification is disabled).
/// @throws std::runtime_error    on inconsistent tensor lengths or reduce dimensions
template <typename XDataType,
          typename YDataType,
          typename AccDataType,
          typename ScaleDataType,
          typename BiasDataType,
          typename MeanVarDataType,
          index_t Rank,
          index_t NumBatchNormReduceDim>
bool profile_batchnorm_forward_impl(int do_verification,
                                    int init_method,
                                    bool do_dumpout,
                                    bool time_kernel,
                                    const std::vector<size_t> inOutLengths,
                                    const std::vector<int> reduceDims,
                                    bool updateMovingAverage,
                                    bool saveMeanAndInvVariance,
                                    double averageFactor,
                                    double epsilon)
{
    if(inOutLengths.size() != Rank || reduceDims.size() != NumBatchNormReduceDim)
    {
        throw std::runtime_error("Invalid tensor lengths or number of reduce dimensions!");
    }

    std::vector<size_t> scaleBiasMeanVarLengths;

    // used for calculating the effective transferred bytes by each operation
    size_t invariant_length = 1;

    // The init value fixes the accumulator type of std::accumulate; a plain int literal would
    // make the product be accumulated in int and overflow for large tensors.
    const size_t total_length = std::accumulate(
        inOutLengths.begin(), inOutLengths.end(), size_t{1}, std::multiplies<size_t>{});

    if(std::any_of(reduceDims.begin(), reduceDims.end(), [](int d) { return d < 0 || d >= Rank; }))
        throw std::runtime_error("Invalid reduce dimensions!");

    // collect the lengths of the dimensions which are not reduced
    for(int dim = 0; dim < Rank; dim++)
    {
        if(std::none_of(reduceDims.begin(), reduceDims.end(), [&](int d) { return dim == d; }))
        {
            scaleBiasMeanVarLengths.push_back(inOutLengths[dim]);
            invariant_length *= inOutLengths[dim];
        }
    }

    // Duplicated reduce dimensions leave more invariant dimensions than the fixed-size arrays
    // below can hold; reject them instead of writing past the end of those arrays.
    if(scaleBiasMeanVarLengths.size() != static_cast<size_t>(Rank - NumBatchNormReduceDim))
        throw std::runtime_error("Invalid reduce dimensions!");

    // input data of the batchnorm forward algorithm
    Tensor<XDataType> x(inOutLengths);
    Tensor<ScaleDataType> bnScale(scaleBiasMeanVarLengths);
    Tensor<BiasDataType> bnBias(scaleBiasMeanVarLengths);

    // output data of the batchnorm forward algorithm
    Tensor<YDataType> y_ref(inOutLengths);
    Tensor<YDataType> y(inOutLengths);

    Tensor<MeanVarDataType> resultSaveMean_ref(scaleBiasMeanVarLengths);
    Tensor<MeanVarDataType> resultSaveInvVariance_ref(scaleBiasMeanVarLengths);

    Tensor<MeanVarDataType> resultRunningMean_ref(scaleBiasMeanVarLengths);
    Tensor<MeanVarDataType> resultRunningVariance_ref(scaleBiasMeanVarLengths);

    auto inOutStrides            = x.mDesc.GetStrides();
    auto scaleBiasMeanVarStrides = bnScale.mDesc.GetStrides();

    // hardware_concurrency() is allowed to return 0 when the value cannot be determined;
    // always use at least one host thread for the tensor initialization
    std::size_t num_thread = std::thread::hardware_concurrency();
    if(num_thread == 0)
        num_thread = 1;

    if(updateMovingAverage)
    {
        const float x_mean       = 0.0f;
        const float x_stddev     = 1.0f;
        const float noise_stddev = 0.04f;

        // input data in normal distribution
        x.GenerateTensorValue(GeneratorTensor_4<XDataType>{x_mean, x_stddev}, num_thread);

        // initialize the runningMean to be values with tiny variation to the mean of the x
        // values
        resultRunningMean_ref.GenerateTensorValue(
            GeneratorTensor_4<MeanVarDataType>{x_mean, noise_stddev}, num_thread);

        // initialize the runningVariance to be values with tiny variation to the variance of
        // the x values
        resultRunningVariance_ref.GenerateTensorValue(
            GeneratorTensor_4<MeanVarDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);
    }
    else
    {
        if constexpr(ck::is_same_v<XDataType, int8_t>)
            x.GenerateTensorValue(GeneratorTensor_2<XDataType>{-5, 5}, num_thread);
        else
            x.GenerateTensorValue(GeneratorTensor_3<XDataType>{-1.0f, 1.0f}, num_thread);
    }

    // scale and bias are only initialized when the results are going to be verified
    if(do_verification)
    {
        switch(init_method)
        {
        case 0:
            bnScale.GenerateTensorValue(GeneratorTensor_0<ScaleDataType>{}, num_thread);
            bnBias.GenerateTensorValue(GeneratorTensor_0<BiasDataType>{}, num_thread);
            break;
        case 1:
            bnScale.GenerateTensorValue(GeneratorTensor_1<ScaleDataType>{1}, num_thread);
            bnBias.GenerateTensorValue(GeneratorTensor_1<BiasDataType>{0}, num_thread);
            break;
        case 2:
            bnScale.GenerateTensorValue(GeneratorTensor_2<ScaleDataType>{-5, 5}, num_thread);
            bnBias.GenerateTensorValue(GeneratorTensor_2<BiasDataType>{-5, 5}, num_thread);
            break;
        default:
            bnScale.GenerateTensorValue(GeneratorTensor_3<ScaleDataType>{-1.0f, 1.0f}, num_thread);
            bnBias.GenerateTensorValue(GeneratorTensor_3<BiasDataType>{-1.0f, 1.0f}, num_thread);
        }
    }

    // these buffers are usually provided by the user application
    DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
    // y holds YDataType elements, so its device buffer must be sized with sizeof(YDataType)
    DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
    DeviceMem bnScale_dev(sizeof(ScaleDataType) * bnScale.mDesc.GetElementSpaceSize());
    DeviceMem bnBias_dev(sizeof(BiasDataType) * bnBias.mDesc.GetElementSpaceSize());

    // mean_dev or resultSaveMean_dev
    DeviceMem resultSaveMean_dev(sizeof(MeanVarDataType) *
                                 resultSaveMean_ref.mDesc.GetElementSpaceSize());
    // meansquare_dev or resultSaveInvVariance_dev
    DeviceMem resultSaveInvVariance_dev(sizeof(MeanVarDataType) *
                                        resultSaveInvVariance_ref.mDesc.GetElementSpaceSize());
    // resultRunningMean_dev
    DeviceMem resultRunningMean_dev(sizeof(MeanVarDataType) *
                                    resultRunningMean_ref.mDesc.GetElementSpaceSize());
    // resultRunningVariance_dev
    DeviceMem resultRunningVariance_dev(sizeof(MeanVarDataType) *
                                        resultRunningVariance_ref.mDesc.GetElementSpaceSize());

    x_dev.ToDevice(x.mData.data());
    bnScale_dev.ToDevice(bnScale.mData.data());
    bnBias_dev.ToDevice(bnBias.mData.data());

    // NOTE(review): the running mean/variance are uploaded only once here, while the same device
    // buffers are handed to every instance in the loop below (and the host copies are later
    // passed to the reference) -- confirm whether they need to be re-uploaded per instance.
    if(updateMovingAverage)
    {
        resultRunningMean_dev.ToDevice(resultRunningMean_ref.mData.data());
        resultRunningVariance_dev.ToDevice(resultRunningVariance_ref.mData.data());
    }

    // used for storing the device result for verification when updateMovingAverage is enabled
    Tensor<MeanVarDataType> resultRunningMean(scaleBiasMeanVarLengths);
    Tensor<MeanVarDataType> resultRunningVariance(scaleBiasMeanVarLengths);

    // used for storing the device result for verification when saveMeanAndInvVariance is enabled
    Tensor<MeanVarDataType> resultSaveMean(scaleBiasMeanVarLengths);
    Tensor<MeanVarDataType> resultSaveInvVariance(scaleBiasMeanVarLengths);

    // the operator interfaces take fixed-size arrays; sizes were validated above
    std::array<index_t, Rank> arrInOutLengths;
    std::array<index_t, Rank> arrInOutStrides;
    std::array<index_t, Rank - NumBatchNormReduceDim> arrScaleBiasMeanVarLengths;
    std::array<index_t, Rank - NumBatchNormReduceDim> arrScaleBiasMeanVarStrides;
    std::array<int, NumBatchNormReduceDim> arrReduceDims;

    std::copy(inOutLengths.begin(), inOutLengths.end(), arrInOutLengths.begin());
    std::copy(inOutStrides.begin(), inOutStrides.end(), arrInOutStrides.begin());
    std::copy(scaleBiasMeanVarLengths.begin(),
              scaleBiasMeanVarLengths.end(),
              arrScaleBiasMeanVarLengths.begin());
    std::copy(scaleBiasMeanVarStrides.begin(),
              scaleBiasMeanVarStrides.end(),
              arrScaleBiasMeanVarStrides.begin());

    std::copy(reduceDims.begin(), reduceDims.end(), arrReduceDims.begin());

    using PassThroughOp = ck::tensor_operation::element_wise::PassThrough;

    // add device batchnorm-forward instances
    using DeviceOp = ck::tensor_operation::device::DeviceBatchNormFwd<XDataType,
                                                                      YDataType,
                                                                      AccDataType,
                                                                      ScaleDataType,
                                                                      BiasDataType,
                                                                      MeanVarDataType,
                                                                      PassThroughOp,
                                                                      Rank,
                                                                      NumBatchNormReduceDim>;

    // get device op instances
    const auto instance_ptrs =
        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
            DeviceOp>::GetInstances();

    std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;

    std::string best_instance_name;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;

    if(do_verification)
    {
        using ReferenceBatchNormFwdInstance =
            ck::tensor_operation::host::ReferenceBatchNormFwd<XDataType,
                                                              YDataType,
                                                              AccDataType,
                                                              ScaleDataType,
                                                              BiasDataType,
                                                              MeanVarDataType,
                                                              PassThroughOp,
                                                              Rank,
                                                              NumBatchNormReduceDim>;

        auto batchNormFwd_ref = ReferenceBatchNormFwdInstance{};

        auto argument_ptr_ref = batchNormFwd_ref.MakeArgumentPointer(
            arrInOutLengths,
            arrInOutStrides,
            arrInOutStrides,
            arrReduceDims,
            arrScaleBiasMeanVarLengths,
            arrScaleBiasMeanVarStrides,
            arrScaleBiasMeanVarStrides,
            arrScaleBiasMeanVarStrides,
            x.mData.data(),
            bnScale.mData.data(),
            bnBias.mData.data(),
            epsilon,
            PassThroughOp{},
            y_ref.mData.data(),
            saveMeanAndInvVariance ? resultSaveMean_ref.mData.data() : nullptr,
            saveMeanAndInvVariance ? resultSaveInvVariance_ref.mData.data() : nullptr,
            averageFactor,
            updateMovingAverage ? resultRunningMean_ref.mData.data() : nullptr,
            updateMovingAverage ? resultRunningVariance_ref.mData.data() : nullptr);

        if(!batchNormFwd_ref.IsSupportedArgument(argument_ptr_ref.get()))
        {
            std::cout << "The runtime parameters not supported by the reference instance, exiting!"
                      << std::endl;
            return (false);
        }

        auto invoker_ptr_ref = batchNormFwd_ref.MakeInvokerPointer();

        (void)invoker_ptr_ref->Run(argument_ptr_ref.get());
    }

    int num_kernel = 0;
    bool pass      = true;

    for(auto& inst_ptr : instance_ptrs)
    {
        auto argument_ptr = inst_ptr->MakeArgumentPointer(
            arrInOutLengths,
            arrInOutStrides,
            arrInOutStrides,
            arrReduceDims,
            arrScaleBiasMeanVarLengths,
            arrScaleBiasMeanVarStrides,
            arrScaleBiasMeanVarStrides,
            arrScaleBiasMeanVarStrides,
            x_dev.GetDeviceBuffer(),
            bnScale_dev.GetDeviceBuffer(),
            bnBias_dev.GetDeviceBuffer(),
            epsilon,
            PassThroughOp{},
            y_dev.GetDeviceBuffer(),
            saveMeanAndInvVariance ? resultSaveMean_dev.GetDeviceBuffer() : nullptr,
            saveMeanAndInvVariance ? resultSaveInvVariance_dev.GetDeviceBuffer() : nullptr,
            averageFactor,
            updateMovingAverage ? resultRunningMean_dev.GetDeviceBuffer() : nullptr,
            updateMovingAverage ? resultRunningVariance_dev.GetDeviceBuffer() : nullptr);

        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            num_kernel++;
        }
        else
        {
            if(time_kernel)
            {
                std::cout << inst_ptr->GetTypeString()
                          << " skipped due to unsupported argument: " << std::endl;
            }

            continue;
        }

        // the workspace buffer lives until the end of this loop iteration
        size_t workspace_sz = inst_ptr->GetWorkSpaceSize(argument_ptr.get());

        DeviceMem workspace_dev(workspace_sz);

        inst_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());

        auto invoker_ptr = inst_ptr->MakeInvokerPointer();

        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

        size_t num_bytes = 0;

        // reading of x, scale, bias, writing of y
        num_bytes += total_length * (sizeof(XDataType) + sizeof(YDataType)) +
                     invariant_length * (sizeof(ScaleDataType) + sizeof(BiasDataType));

        // writing of mean, inv-variance
        num_bytes += saveMeanAndInvVariance ? invariant_length * sizeof(MeanVarDataType) * 2 : 0;

        // updating of moving mean, variance
        num_bytes += updateMovingAverage ? invariant_length * sizeof(MeanVarDataType) * 4 : 0;

        float gb_per_sec = num_bytes / 1.E6 / avg_time;

        if(time_kernel)
            std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, "
                      << inst_ptr->GetTypeString() << std::endl;

        if(avg_time < best_avg_time)
        {
            best_instance_name = inst_ptr->GetTypeString();
            best_avg_time      = avg_time;
            best_gb_per_sec    = gb_per_sec;
        }

        if(do_verification)
        {
            using ck::utils::check_err;
            bool single_pass;

            y_dev.FromDevice(y.mData.data());

            // a looser tolerance is used when the output type is bhalf_t
            if constexpr(ck::is_same_v<YDataType, ck::bhalf_t>)
                single_pass = check_err(y.mData, y_ref.mData, "y results", 1e-2, 1e-2);
            else
                single_pass = check_err(y.mData, y_ref.mData, "y results", 4e-3, 4e-3);

            if(updateMovingAverage)
            {
                resultRunningMean_dev.FromDevice(resultRunningMean.mData.data());
                resultRunningVariance_dev.FromDevice(resultRunningVariance.mData.data());

                // clang-format off
                single_pass = single_pass && check_err(resultRunningMean.mData, resultRunningMean_ref.mData, "average mean results", 1.5e-5, 1.5e-5);
                single_pass = single_pass && check_err(resultRunningVariance.mData, resultRunningVariance_ref.mData, "average variance results", 1e-5, 1e-5);
                // clang-format on
            }

            if(saveMeanAndInvVariance)
            {
                resultSaveMean_dev.FromDevice(resultSaveMean.mData.data());
                resultSaveInvVariance_dev.FromDevice(resultSaveInvVariance.mData.data());

                // clang-format off
                single_pass = single_pass && check_err(resultSaveMean.mData, resultSaveMean_ref.mData, "mean results", 3e-5, 3e-5);
                single_pass = single_pass && check_err(resultSaveInvVariance.mData, resultSaveInvVariance_ref.mData, "inv-variance results", 7e-5, 7e-5);
                // clang-format on
            }

            pass = pass && single_pass;
        }

        // NOTE(review): the host copies of the device results (y, resultSaveMean, ...) are only
        // refreshed inside the do_verification branch above -- confirm that dumping without
        // verification is not expected to contain device output.
        if(do_dumpout)
        {
            using ck::host_common::dumpBufferToFile;

            // clang-format off
            dumpBufferToFile("dump_x.bin", x.mData.data(), x.mDesc.GetElementSize());
            dumpBufferToFile("dump_y.bin", y.mData.data(), y.mDesc.GetElementSize());
            dumpBufferToFile("dump_y_ref.bin", y_ref.mData.data(), y_ref.mDesc.GetElementSize());
            // clang-format on

            if(saveMeanAndInvVariance)
            {
                // clang-format off
                dumpBufferToFile("dump_mean.bin", resultSaveMean.mData.data(), resultSaveMean.mDesc.GetElementSize());
                dumpBufferToFile("dump_mean_ref.bin", resultSaveMean_ref.mData.data(), resultSaveMean_ref.mDesc.GetElementSize());
                dumpBufferToFile("dump_invvar.bin", resultSaveInvVariance.mData.data(), resultSaveInvVariance.mDesc.GetElementSize());
                dumpBufferToFile("dump_invvar_ref.bin", resultSaveInvVariance_ref.mData.data(), resultSaveInvVariance_ref.mDesc.GetElementSize());
                // clang-format on
            }
        }
    }

    if(time_kernel)
    {
        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
                  << best_instance_name << std::endl;
    }

    if(num_kernel == 0)
    {
        std::cout << "Error: No kernel is applicable" << std::endl;
        return false;
    }

    return pass;
}
}
// namespace profiler
}
// namespace ck
profiler/include/profile_conv_bwd_data_impl.hpp
View file @
05ee41c3
...
...
@@ -209,8 +209,7 @@ bool profile_conv_bwd_data_impl(int do_verification,
{
in_device_buf
.
FromDevice
(
input_device_result
.
mData
.
data
());
pass
=
pass
&
ck
::
utils
::
check_err
(
input_device_result
.
mData
,
input_host_result
.
mData
);
pass
=
pass
&
ck
::
utils
::
check_err
(
input_device_result
,
input_host_result
);
if
(
do_log
)
{
...
...
Prev
1
…
15
16
17
18
19
20
21
22
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment