gaoqiong / composable_kernel / Commits / 289f15de

Commit 289f15de, authored Dec 09, 2022 by aska-0096

    Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into wmma_gemm

Parents: 9bd44685, d58b7f51
Changes: 371 files in total. Showing 20 changed files with 913 additions and 151 deletions (+913, -151).
profiler/CMakeLists.txt (+2, -56)
profiler/include/profiler/data_type_enum.hpp (+0, -0)
profiler/include/profiler/data_type_enum_helper.hpp (+1, -1)
profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp (+6, -6)
profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp (+6, -6)
profiler/include/profiler/profile_batched_gemm_impl.hpp (+6, -6)
profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp (+12, -18)
profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp (+11, -7)
profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp (+25, -3)
profiler/include/profiler/profile_batchnorm_backward_impl.hpp (+390, -0)
profiler/include/profiler/profile_batchnorm_forward_impl.hpp (+412, -0)
profiler/include/profiler/profile_conv_bwd_data_impl.hpp (+1, -2)
profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp (+7, -8)
profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp (+7, -8)
profiler/include/profiler/profile_conv_fwd_impl.hpp (+1, -1)
profiler/include/profiler/profile_convnd_bwd_data_impl.hpp (+1, -1)
profiler/include/profiler/profile_convnd_bwd_weight_impl.hpp (+1, -1)
profiler/include/profiler/profile_elementwise_layernorm_impl.hpp (+4, -2)
profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp (+7, -8)
profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp (+13, -17)
profiler/CMakeLists.txt (View file @ 289f15de)

-include_directories(BEFORE ${PROJECT_SOURCE_DIR}/${CMAKE_CURRENT_LIST_DIR}/include)
+include_directories(BEFORE ${CMAKE_CURRENT_LIST_DIR}/include)

 # ck_profiler
 set(PROFILER_SOURCE
     src/profiler.cpp
     src/profile_gemm.cpp
     src/profile_gemm_splitk.cpp
     src/profile_gemm_bilinear.cpp
     src/profile_gemm_bias_add_reduce.cpp
     src/profile_gemm_add_add_fastgelu.cpp
     src/profile_gemm_reduce.cpp
     src/profile_batched_gemm.cpp
     src/profile_batched_gemm_gemm.cpp
     src/profile_batched_gemm_add_relu_gemm_add.cpp
     src/profile_batched_gemm_reduce.cpp
     src/profile_grouped_gemm.cpp
     src/profile_conv_fwd.cpp
     src/profile_conv_fwd_bias_relu.cpp
     src/profile_conv_fwd_bias_relu_add.cpp
     src/profile_conv_bwd_data.cpp
     src/profile_conv_bwd_weight.cpp
     src/profile_grouped_conv_fwd.cpp
     src/profile_reduce.cpp
     src/profile_groupnorm.cpp
     src/profile_layernorm.cpp
     src/profile_softmax.cpp
 )

 add_executable(ckProfiler ${PROFILER_SOURCE})

 target_link_libraries(ckProfiler PRIVATE utility)
 target_link_libraries(ckProfiler PRIVATE device_gemm_instance)
 target_link_libraries(ckProfiler PRIVATE device_gemm_splitk_instance)
 target_link_libraries(ckProfiler PRIVATE device_gemm_bilinear_instance)
 target_link_libraries(ckProfiler PRIVATE device_gemm_add_add_fastgelu_instance)
 target_link_libraries(ckProfiler PRIVATE device_gemm_reduce_instance)
 target_link_libraries(ckProfiler PRIVATE device_gemm_bias_add_reduce_instance)
 target_link_libraries(ckProfiler PRIVATE device_batched_gemm_instance)
 target_link_libraries(ckProfiler PRIVATE device_batched_gemm_gemm_instance)
 target_link_libraries(ckProfiler PRIVATE device_batched_gemm_add_relu_gemm_add_instance)
 target_link_libraries(ckProfiler PRIVATE device_batched_gemm_reduce_instance)
 target_link_libraries(ckProfiler PRIVATE device_grouped_gemm_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_instance)
 target_link_libraries(ckProfiler PRIVATE device_grouped_conv1d_fwd_instance)
 target_link_libraries(ckProfiler PRIVATE device_grouped_conv2d_fwd_instance)
 target_link_libraries(ckProfiler PRIVATE device_grouped_conv3d_fwd_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv1d_bwd_data_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_data_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv3d_bwd_data_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv1d_bwd_weight_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_weight_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv3d_bwd_weight_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
 target_link_libraries(ckProfiler PRIVATE device_normalization_instance)
 target_link_libraries(ckProfiler PRIVATE device_softmax_instance)
 target_link_libraries(ckProfiler PRIVATE device_reduce_instance)

 add_subdirectory(src)
profiler/include/data_type_enum.hpp → profiler/include/profiler/data_type_enum.hpp (View file @ 289f15de)

File moved.
profiler/include/data_type_enum_helper.hpp → profiler/include/profiler/data_type_enum_helper.hpp (View file @ 289f15de)

@@ -4,7 +4,7 @@
 #pragma once

 #include "ck/utility/data_type.hpp"
-#include "profiler/include/data_type_enum.hpp"
+#include "profiler/data_type_enum.hpp"

 namespace ck {
profiler/include/profile_batched_gemm_add_relu_gemm_add_impl.hpp → profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp (View file @ 289f15de)

@@ -14,6 +14,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"

 namespace ck {
@@ -111,15 +112,15 @@ bool profile_batched_gemm_add_relu_gemm_add_impl(bool do_verification,
                                           std::size_t stride,
                                           std::size_t batch_stride,
                                           auto layout) {
+        using namespace ck::literals;
+
         if(std::is_same<decltype(layout), Row>::value)
         {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({batch_stride, stride, 1}));
+            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
         }
         else
         {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({batch_stride, 1, stride}));
+            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
         }
     };
@@ -330,8 +331,7 @@ bool profile_batched_gemm_add_relu_gemm_add_impl(bool do_verification,
     {
         e1_g_m_o_device_buf.FromDevice(e1_g_m_o_device_result.mData.data());

-        pass = pass & ck::utils::check_err(e1_g_m_o_device_result.mData, e1_g_m_o_host_result.mData);
+        pass = pass & ck::utils::check_err(e1_g_m_o_device_result, e1_g_m_o_host_result);

         if(do_log)
         {
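A note on the 1_uz suffix introduced in the hunk above: it comes from the newly included ck/library/utility/literals.hpp and keeps every element of the braced stride list a std::size_t, so the two-initializer-list form of HostTensorDescriptor deduces a single element type. A minimal sketch of such a literal (an assumption about the header's contents, not a copy of it):

    #include <cstddef>

    // Sketch: a user-defined literal turning an integer literal into std::size_t,
    // so that a braced list like {batch_stride, stride, 1_uz} deduces uniformly.
    constexpr std::size_t operator""_uz(unsigned long long v)
    {
        return static_cast<std::size_t>(v);
    }

    static_assert(1_uz == std::size_t{1}, "1_uz yields std::size_t");

Without the suffix, a bare 1 in the list is an int, and mixing element types can leave a templated constructor unable to deduce the initializer-list type.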
profiler/include/profile_batched_gemm_gemm_impl.hpp → profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp (View file @ 289f15de)

@@ -16,6 +16,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"

 namespace ck {
@@ -105,15 +106,15 @@ bool profile_batched_gemm_gemm_impl(bool do_verification,
                                           std::size_t stride,
                                           std::size_t batch_stride,
                                           auto layout) {
+        using namespace ck::literals;
+
         if(std::is_same<decltype(layout), Row>::value)
         {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({batch_stride, stride, 1}));
+            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
         }
         else
         {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({batch_stride, 1, stride}));
+            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
        }
     };
@@ -283,8 +284,7 @@ bool profile_batched_gemm_gemm_impl(bool do_verification,
     {
         c_g_m_o_device_buf.FromDevice(c_g_m_o_device_result.mData.data());

-        pass = pass & ck::utils::check_err(c_g_m_o_device_result.mData, c_g_m_o_host_result.mData);
+        pass = pass & ck::utils::check_err(c_g_m_o_device_result, c_g_m_o_host_result);

         if(do_log)
         {
profiler/include/profile_batched_gemm_impl.hpp → profiler/include/profiler/profile_batched_gemm_impl.hpp (View file @ 289f15de)

@@ -16,6 +16,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"

 namespace ck {
@@ -50,15 +51,15 @@ bool profile_batched_gemm_impl(int do_verification,
                                           std::size_t stride,
                                           std::size_t batch_stride,
                                           auto layout) {
+        using namespace ck::literals;
+
         if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
         {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({batch_stride, stride, 1}));
+            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
        }
        else
        {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({batch_stride, 1, stride}));
+            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
        }
     };
@@ -202,8 +203,7 @@ bool profile_batched_gemm_impl(int do_verification,
     {
         c_device_buf.FromDevice(c_g_m_n_device_result.mData.data());

-        pass = pass & ck::utils::check_err(c_g_m_n_device_result.mData, c_g_m_n_host_result.mData);
+        pass = pass & ck::utils::check_err(c_g_m_n_device_result, c_g_m_n_host_result);

         if(do_log)
         {
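Across these hunks, check_err is now called on the Tensor objects themselves rather than on their .mData vectors, which implies an overload of ck::utils::check_err accepting Tensor arguments. A hedged sketch of what such an overload could look like; the real overload set lives in ck/library/utility/check_err.hpp, and the names, defaults, and error metric here are illustrative only:

    #include <cmath>
    #include <cstddef>
    #include <iostream>
    #include <string>
    #include <vector>

    // Stand-in for the library's host Tensor; only the member used here.
    template <typename T>
    struct TensorSketch
    {
        std::vector<T> mData;
    };

    // Element-wise comparison with mixed relative/absolute tolerance.
    template <typename T>
    bool check_err(const std::vector<T>& out,
                   const std::vector<T>& ref,
                   const std::string& msg = "Error: Incorrect results!",
                   double rtol            = 1e-3,
                   double atol            = 1e-3)
    {
        if(out.size() != ref.size())
        {
            std::cout << msg << " (size mismatch)" << std::endl;
            return false;
        }
        for(std::size_t i = 0; i < out.size(); ++i)
        {
            const double o = static_cast<double>(out[i]);
            const double r = static_cast<double>(ref[i]);
            if(std::abs(o - r) > atol + rtol * std::abs(r))
            {
                std::cout << msg << " (first mismatch at index " << i << ")" << std::endl;
                return false;
            }
        }
        return true;
    }

    // The Tensor overload just forwards to the vector version; this is what
    // lets the call sites above drop the explicit ".mData".
    template <typename T>
    bool check_err(const TensorSketch<T>& out,
                   const TensorSketch<T>& ref,
                   const std::string& msg = "Error: Incorrect results!",
                   double rtol            = 1e-3,
                   double atol            = 1e-3)
    {
        return check_err(out.mData, ref.mData, msg, rtol, atol);
    }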
profiler/include/profile_batched_gemm_reduce_impl.hpp → profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp (View file @ 289f15de)

@@ -14,6 +14,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"

 namespace ck {
@@ -78,15 +79,15 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
                                           std::size_t col,
                                           std::size_t stride,
                                           auto layout) {
+        using namespace ck::literals;
+
         if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
         {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({row * stride, stride, 1}));
+            return HostTensorDescriptor({batch_count, row, col}, {row * stride, stride, 1_uz});
        }
        else
        {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({col * stride, 1, stride}));
+            return HostTensorDescriptor({batch_count, row, col}, {col * stride, 1_uz, stride});
        }
     };
@@ -95,17 +96,13 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
     Tensor<CDataType> c_g_m_n_host_result(
         f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> d0_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
-        {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> d1_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
-        {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
+    Tensor<ReduceDataType> d0_g_m_host_result({BatchCount, M});
+    Tensor<ReduceDataType> d1_g_m_host_result({BatchCount, M});

     Tensor<CDataType> c_g_m_n_device_result(
         f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> d0_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
-        {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> d1_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
-        {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
+    Tensor<ReduceDataType> d0_g_m_device_result({BatchCount, M});
+    Tensor<ReduceDataType> d1_g_m_device_result({BatchCount, M});

     std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
     std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
@@ -319,12 +316,9 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
         reduce0_device_buf.FromDevice(d0_g_m_device_result.mData.data());
         reduce1_device_buf.FromDevice(d1_g_m_device_result.mData.data());

-        bool c_error  = ck::utils::check_err(c_g_m_n_device_result.mData, c_g_m_n_host_result.mData);
-        bool d0_error = ck::utils::check_err(d0_g_m_device_result.mData, d0_g_m_host_result.mData);
-        bool d1_error = ck::utils::check_err(d1_g_m_device_result.mData, d1_g_m_host_result.mData);
+        bool c_error  = ck::utils::check_err(c_g_m_n_device_result, c_g_m_n_host_result);
+        bool d0_error = ck::utils::check_err(d0_g_m_device_result, d0_g_m_host_result);
+        bool d1_error = ck::utils::check_err(d1_g_m_device_result, d1_g_m_host_result);

         pass = pass && (c_error == true);
         pass = pass && (d0_error == true);
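The simplification above, Tensor<ReduceDataType>({BatchCount, M}) in place of an explicit HostTensorDescriptor built from std::vector<std::size_t> with static_casts, suggests a lengths-only constructor that accepts integral initializer lists and derives packed strides itself. A sketch under that assumption; the actual constructors are in ck/library/utility/host_tensor.hpp and may differ:

    #include <cstddef>
    #include <initializer_list>
    #include <vector>

    // Sketch of the constructor shape that lets int lengths like {BatchCount, M}
    // be passed without casts: a template over the list element type, with
    // packed row-major strides derived from the lengths.
    struct DescriptorSketch
    {
        std::vector<std::size_t> lengths;
        std::vector<std::size_t> strides;

        template <typename X>
        DescriptorSketch(std::initializer_list<X> lens)
            : lengths(lens.begin(), lens.end()), strides(lengths.size())
        {
            std::size_t s = 1;
            for(std::size_t i = lengths.size(); i-- > 0;)
            {
                strides[i] = s; // innermost dimension is contiguous
                s *= lengths[i];
            }
        }
    };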
profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp → profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp (View file @ 289f15de)

@@ -16,6 +16,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
@@ -48,7 +49,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
                                             int BatchStrideB0 = -1,
                                             int BatchStrideB1 = -1,
                                             int BatchStrideC  = -1,
-                                            float alpha       = 1.f)
+                                            float alpha       = -1.f)
 {
@@ -113,15 +114,15 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
                                           std::size_t stride,
                                           std::size_t batch_stride,
                                           auto layout) {
+        using namespace ck::literals;
+
         if(std::is_same<decltype(layout), Row>::value)
         {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({batch_stride, stride, 1}));
+            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
        }
        else
        {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({batch_stride, 1, stride}));
+            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
        }
     };
@@ -186,6 +187,10 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
     b0_g_k_n_device_buf.ToDevice(b0_g_k_n.mData.data());
     b1_g_n_o_device_buf.ToDevice(b1_g_n_o.mData.data());

+    if(alpha < 0)
+    {
+        alpha = 1.f / std::sqrt(K); // usually 1 / sqrt(head_dim)
+    }

     auto a_element_op    = AElementOp{};
     auto b0_element_op   = B0ElementOp{};
     auto acc0_element_op = Acc0ElementOp{alpha};
@@ -307,8 +312,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
     {
         c_g_m_o_device_buf.FromDevice(c_g_m_o_device_result.mData.data());

-        pass = pass & ck::utils::check_err(c_g_m_o_device_result.mData, c_g_m_o_host_result.mData);
+        pass = pass & ck::utils::check_err(c_g_m_o_device_result, c_g_m_o_host_result);

         if(do_log)
         {
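The two changes above work together: alpha now defaults to the sentinel -1.f, and any negative value is replaced at run time by 1.f / std::sqrt(K), the conventional softmax scale in attention where K plays the role of head_dim (for K = 64 this gives 0.125). The pattern in isolation:

    #include <cmath>

    // Sentinel-default pattern from the hunk above: a negative alpha means
    // "not specified", so substitute the conventional attention scale 1/sqrt(K).
    inline float resolve_softmax_scale(float alpha, int K)
    {
        return (alpha < 0) ? 1.f / std::sqrt(static_cast<float>(K)) : alpha;
    }

Using a sentinel rather than overloading keeps the long default-argument tail of the profiler entry point intact while still letting callers pass an explicit scale.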
profiler/include/profile_batched_gemm_softmax_gemm_permute_impl.hpp → profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp (View file @ 289f15de)

@@ -16,6 +16,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
@@ -44,7 +45,7 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
                                                     int O,
                                                     int G0,
                                                     int G1,
-                                                    float alpha = 1.f)
+                                                    float alpha = -1.f)
 {
@@ -153,6 +154,10 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
     b0_device_buf.ToDevice(b0_gs_ns_ks.mData.data());
     b1_device_buf.ToDevice(b1_gs_os_ns.mData.data());

+    if(alpha < 0)
+    {
+        alpha = 1.f / std::sqrt(K); // usually 1 / sqrt(head_dim)
+    }

     auto a_element_op    = AElementOp{};
     auto b0_element_op   = B0ElementOp{};
     auto acc0_element_op = Acc0ElementOp{alpha};
@@ -308,8 +313,25 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
     {
         c_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data());

-        pass = pass & ck::utils::check_err(c_gs_ms_os_device_result.mData, c_gs_ms_os_host_result.mData);
+        // default absolute error and relative error is 0.001
+        double rtol = 1e-3;
+        double atol = 1e-3;
+
+        // when BF16 is taken, set absolute error and relative error to 0.01
+        if(std::is_same_v<ADataType, ck::bhalf_t> && std::is_same_v<B0DataType, ck::bhalf_t> &&
+           std::is_same_v<B1DataType, ck::bhalf_t> && std::is_same_v<CDataType, ck::bhalf_t>)
+        {
+            rtol = 1e-2;
+            atol = 1e-2;
+        }
+
+        pass = pass & ck::utils::check_err(c_gs_ms_os_device_result,
+                                           c_gs_ms_os_host_result,
+                                           "Error: Incorrect results!",
+                                           rtol,
+                                           atol);

         if(do_log)
         {
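The relaxed tolerance for all-BF16 runs is consistent with the format's precision: bfloat16 keeps 8 significand bits (7 stored), so the spacing between adjacent representable values near 1.0 is 2^-7 = 0.0078125, already larger than the default 1e-3 tolerance. A one-line sanity check:

    // bf16 ulp near 1.0 is 2^-7; a 1e-3 tolerance would flag mere rounding
    // noise, while 1e-2 leaves a little over one ulp of slack.
    constexpr double bf16_ulp_at_one = 1.0 / 128.0;
    static_assert(bf16_ulp_at_one > 1e-3 && bf16_ulp_at_one < 1e-2, "");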
profiler/include/profiler/profile_batchnorm_backward_impl.hpp (new file, 0 → 100644) (View file @ 289f15de)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iomanip>
#include <stdexcept>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp"

namespace ck {
namespace profiler {

template <typename XDataType,
          typename DxDataType,
          typename DyDataType,
          typename AccDataType,
          typename ScaleDataType,
          typename DscaleDbiasDataType,
          typename MeanVarDataType,
          index_t Rank,
          index_t NumBatchNormReduceDim>
bool profile_batchnorm_backward_impl(bool do_verification,
                                     int init_method,
                                     bool do_dumpout,
                                     bool time_kernel,
                                     const std::vector<size_t> inOutLengths,
                                     const std::vector<int> reduceDims,
                                     bool haveSavedMeanInvVar,
                                     double epsilon)
{
    if(inOutLengths.size() != Rank || reduceDims.size() != NumBatchNormReduceDim)
    {
        throw std::runtime_error("Invalid tensor lengths or number of reduce dimensions!");
    };

    std::vector<size_t> scaleBiasMeanVarLengths;

    // used for calculating the effective transferred bytes by each operation
    size_t total_length;
    size_t invariant_length = 1;

    total_length =
        std::accumulate(inOutLengths.begin(), inOutLengths.end(), 1, std::multiplies<size_t>{});

    if(std::any_of(reduceDims.begin(), reduceDims.end(), [](int d) { return d < 0 || d >= Rank; }))
        throw std::runtime_error("Invalid reduce dimensions!");

    for(int dim = 0; dim < Rank; dim++)
    {
        if(std::none_of(reduceDims.begin(), reduceDims.end(), [&](int d) { return dim == d; }))
        {
            scaleBiasMeanVarLengths.push_back(inOutLengths[dim]);
            invariant_length *= inOutLengths[dim];
        };
    }

    // input data of the batchnorm backward algorithm
    Tensor<XDataType> x(inOutLengths);
    Tensor<DyDataType> dy(inOutLengths);
    Tensor<ScaleDataType> bnScale(scaleBiasMeanVarLengths);
    Tensor<MeanVarDataType> savedMean(scaleBiasMeanVarLengths);
    Tensor<MeanVarDataType> savedInvVar(scaleBiasMeanVarLengths);
    // savedVariance is only used for initializing savedInvVar
    Tensor<MeanVarDataType> savedVariance(scaleBiasMeanVarLengths);

    // output data of the batchnorm backward algorithm
    Tensor<DxDataType> dx_ref(inOutLengths);
    Tensor<DxDataType> dx(inOutLengths);
    Tensor<DscaleDbiasDataType> dscale(scaleBiasMeanVarLengths);
    Tensor<DscaleDbiasDataType> dbias(scaleBiasMeanVarLengths);
    Tensor<DscaleDbiasDataType> dscale_ref(scaleBiasMeanVarLengths);
    Tensor<DscaleDbiasDataType> dbias_ref(scaleBiasMeanVarLengths);

    auto inOutStrides            = x.mDesc.GetStrides();
    auto scaleBiasMeanVarStrides = bnScale.mDesc.GetStrides();

    std::size_t num_thread = std::thread::hardware_concurrency();

    if(haveSavedMeanInvVar)
    {
        const float x_mean       = 0.0f;
        const float x_stddev     = 1.0f;
        const float noise_stddev = 0.0001f;

        // input data in normal distribution
        x.GenerateTensorValue(GeneratorTensor_4<XDataType>{x_mean, x_stddev}, num_thread);

        // initialize the savedMean to be values with tiny variation to the mean of the x values
        savedMean.GenerateTensorValue(GeneratorTensor_4<MeanVarDataType>{x_mean, noise_stddev},
                                      num_thread);

        // initialize the variance to be values with tiny variation to the variance of the x values
        savedVariance.GenerateTensorValue(
            GeneratorTensor_4<MeanVarDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);

        auto it_src       = savedVariance.mData.begin();
        auto it_dst       = savedInvVar.mData.begin();
        float tmp_epsilon = std::numeric_limits<float>::epsilon();

        while(it_src != savedVariance.mData.end())
        {
            *it_dst = type_convert<AccDataType>(
                1.0f / std::sqrtf(type_convert<float>(*it_src) + tmp_epsilon));
            it_src++;
            it_dst++;
        };
    }
    else
    {
        const float x_mean   = 0.0f;
        const float x_stddev = 1.0f;

        // input data in normal distribution
        x.GenerateTensorValue(GeneratorTensor_4<XDataType>{x_mean, x_stddev}, num_thread);
    };

    if(do_verification)
    {
        switch(init_method)
        {
        case 0:
            dy.GenerateTensorValue(GeneratorTensor_0<DyDataType>{}, num_thread);
            bnScale.GenerateTensorValue(GeneratorTensor_0<ScaleDataType>{}, num_thread);
            break;
        case 1:
            dy.GenerateTensorValue(GeneratorTensor_1<DyDataType>{1}, num_thread);
            bnScale.GenerateTensorValue(GeneratorTensor_1<ScaleDataType>{1}, num_thread);
            break;
        case 2:
            dy.GenerateTensorValue(GeneratorTensor_2<DyDataType>{-2, 2}, num_thread);
            bnScale.GenerateTensorValue(GeneratorTensor_2<ScaleDataType>{-5, 5}, num_thread);
            break;
        default:
            dy.GenerateTensorValue(GeneratorTensor_3<DyDataType>{-0.2f, 0.2f}, num_thread);
            bnScale.GenerateTensorValue(GeneratorTensor_3<ScaleDataType>{-0.5f, 0.5f}, num_thread);
        }
    };

    // input data of the batchnorm backward algorithm
    DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
    DeviceMem dy_dev(sizeof(DyDataType) * dy.mDesc.GetElementSpaceSize());
    DeviceMem bnScale_dev(sizeof(ScaleDataType) * bnScale.mDesc.GetElementSpaceSize());
    DeviceMem savedMean_dev(sizeof(MeanVarDataType) * savedMean.mDesc.GetElementSpaceSize());
    DeviceMem savedInvVar_dev(sizeof(MeanVarDataType) * savedInvVar.mDesc.GetElementSpaceSize());

    // output data of the batchnorm backward algorithm
    DeviceMem dx_dev(sizeof(DxDataType) * dx.mDesc.GetElementSpaceSize());
    DeviceMem dscale_dev(sizeof(DscaleDbiasDataType) * dscale.mDesc.GetElementSpaceSize());
    DeviceMem dbias_dev(sizeof(DscaleDbiasDataType) * dbias.mDesc.GetElementSpaceSize());

    x_dev.ToDevice(x.mData.data());
    dy_dev.ToDevice(dy.mData.data());
    bnScale_dev.ToDevice(bnScale.mData.data());

    if(haveSavedMeanInvVar)
    {
        savedMean_dev.ToDevice(savedMean.mData.data());
        savedInvVar_dev.ToDevice(savedInvVar.mData.data());
    };

    std::array<index_t, Rank> arrInOutLengths;
    std::array<index_t, Rank> arrInOutStrides;
    std::array<index_t, Rank - NumBatchNormReduceDim> arrScaleBiasMeanVarLengths;
    std::array<index_t, Rank - NumBatchNormReduceDim> arrScaleBiasMeanVarStrides;
    std::array<int, NumBatchNormReduceDim> arrReduceDims;

    std::copy(inOutLengths.begin(), inOutLengths.end(), arrInOutLengths.begin());
    std::copy(inOutStrides.begin(), inOutStrides.end(), arrInOutStrides.begin());
    std::copy(scaleBiasMeanVarLengths.begin(),
              scaleBiasMeanVarLengths.end(),
              arrScaleBiasMeanVarLengths.begin());
    std::copy(scaleBiasMeanVarStrides.begin(),
              scaleBiasMeanVarStrides.end(),
              arrScaleBiasMeanVarStrides.begin());
    std::copy(reduceDims.begin(), reduceDims.end(), arrReduceDims.begin());

    using PassThroughOp = ck::tensor_operation::element_wise::PassThrough;

    // add device batchnorm-backward instances
    using DeviceOp = ck::tensor_operation::device::DeviceBatchNormBwd<XDataType,
                                                                      DxDataType,
                                                                      DxDataType,
                                                                      AccDataType,
                                                                      ScaleDataType,
                                                                      DscaleDbiasDataType,
                                                                      MeanVarDataType,
                                                                      PassThroughOp,
                                                                      Rank,
                                                                      NumBatchNormReduceDim>;

    // get device op instances
    const auto instance_ptrs =
        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
            DeviceOp>::GetInstances();

    std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;

    std::string best_instance_name;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;

    if(do_verification)
    {
        using ReferenceBatchNormBwdInstance =
            ck::tensor_operation::host::ReferenceBatchNormBwd<XDataType,
                                                              DxDataType,
                                                              DyDataType,
                                                              AccDataType,
                                                              ScaleDataType,
                                                              DscaleDbiasDataType,
                                                              MeanVarDataType,
                                                              PassThroughOp,
                                                              Rank,
                                                              NumBatchNormReduceDim>;

        auto batchNormBwd_ref = ReferenceBatchNormBwdInstance{};

        auto argument_ptr_ref = batchNormBwd_ref.MakeArgumentPointer(
            arrInOutLengths,
            arrInOutStrides,
            arrInOutStrides,
            arrInOutStrides,
            arrReduceDims,
            arrScaleBiasMeanVarLengths,
            arrScaleBiasMeanVarStrides,
            arrScaleBiasMeanVarStrides,
            arrScaleBiasMeanVarStrides,
            x.mData.data(),
            dy.mData.data(),
            bnScale.mData.data(),
            haveSavedMeanInvVar ? savedMean.mData.data() : nullptr,
            haveSavedMeanInvVar ? savedInvVar.mData.data() : nullptr,
            epsilon,
            PassThroughOp{},
            dx_ref.mData.data(),
            dscale_ref.mData.data(),
            dbias_ref.mData.data());

        if(!batchNormBwd_ref.IsSupportedArgument(argument_ptr_ref.get()))
        {
            std::cout << "The runtime parameters not supported by the reference instance, exiting!"
                      << std::endl;
            return (false);
        };

        auto invoker_ptr_ref = batchNormBwd_ref.MakeInvokerPointer();

        (void)invoker_ptr_ref->Run(argument_ptr_ref.get());
    }

    int num_kernel = 0;
    bool pass      = true;

    for(auto& inst_ptr : instance_ptrs)
    {
        auto argument_ptr = inst_ptr->MakeArgumentPointer(
            arrInOutLengths,
            arrInOutStrides,
            arrInOutStrides,
            arrInOutStrides,
            arrReduceDims,
            arrScaleBiasMeanVarLengths,
            arrScaleBiasMeanVarStrides,
            arrScaleBiasMeanVarStrides,
            arrScaleBiasMeanVarStrides,
            x_dev.GetDeviceBuffer(),
            dy_dev.GetDeviceBuffer(),
            bnScale_dev.GetDeviceBuffer(),
            haveSavedMeanInvVar ? savedMean_dev.GetDeviceBuffer() : nullptr,
            haveSavedMeanInvVar ? savedInvVar_dev.GetDeviceBuffer() : nullptr,
            epsilon,
            PassThroughOp{},
            dx_dev.GetDeviceBuffer(),
            dscale_dev.GetDeviceBuffer(),
            dbias_dev.GetDeviceBuffer());

        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            num_kernel++;
        }
        else
        {
            if(time_kernel)
            {
                std::cout << inst_ptr->GetTypeString()
                          << " skipped due to unsupported argument: " << std::endl;
            }

            continue;
        };

        size_t workspace_sz = inst_ptr->GetWorkSpaceSize(argument_ptr.get());

        DeviceMem workspace_dev(workspace_sz);

        inst_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());

        auto invoker_ptr = inst_ptr->MakeInvokerPointer();

        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

        size_t num_bytes = 0;

        // inputing of x, dy, scale, outputing of dx, dscale, dbias
        num_bytes += total_length * (sizeof(XDataType) + sizeof(DyDataType) + sizeof(DxDataType)) +
                     invariant_length * sizeof(DscaleDbiasDataType) * 2;

        // inputting of savedMean, savedInvVariance
        if(haveSavedMeanInvVar)
            num_bytes += invariant_length * sizeof(MeanVarDataType) * 2;

        float gb_per_sec = num_bytes / 1.E6 / avg_time;

        if(time_kernel)
            std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, "
                      << inst_ptr->GetTypeString() << std::endl;

        if(avg_time < best_avg_time)
        {
            best_instance_name = inst_ptr->GetTypeString();
            best_avg_time      = avg_time;
            best_gb_per_sec    = gb_per_sec;
        }

        if(do_verification)
        {
            using ck::utils::check_err;

            bool single_pass = true;

            dx_dev.FromDevice(dx.mData.data());
            dscale_dev.FromDevice(dscale.data());
            dbias_dev.FromDevice(dbias.data());

            // clang-format off
            single_pass = single_pass && ck::utils::check_err(dx.mData, dx_ref.mData, "dx result:", 5e-4, 5e-4);
            single_pass = single_pass && ck::utils::check_err(dscale.mData, dscale_ref.mData, "dScale result:", 3e-3, 3e-3);
            single_pass = single_pass && ck::utils::check_err(dbias.mData, dbias_ref.mData, "dBias result:", 3e-3, 3e-3);
            // clang-format on

            pass = pass && single_pass;
        };

        if(do_dumpout)
        {
            using ck::host_common::dumpBufferToFile;
            // clang-format off
            dumpBufferToFile("dump_x.bin", x.mData.data(), x.mDesc.GetElementSize());
            dumpBufferToFile("dump_dy.bin", dy.mData.data(), dy.mDesc.GetElementSize());
            dumpBufferToFile("dump_dx.bin", dx.mData.data(), dx.mDesc.GetElementSize());
            dumpBufferToFile("dump_dx_ref.bin", dx_ref.mData.data(), dx_ref.mDesc.GetElementSize());
            dumpBufferToFile("dump_dscale.bin", dscale.mData.data(), dscale.mDesc.GetElementSize());
            dumpBufferToFile("dump_dscale_ref.bin", dscale_ref.mData.data(), dscale_ref.mDesc.GetElementSize());
            // clang-format off
        };
    }

    if(time_kernel)
    {
        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
                  << best_instance_name << std::endl;
    }

    if(num_kernel == 0)
    {
        std::cout << "Error: No kernel is applicable" << std::endl;
        return false;
    }

    return pass;
}

} // namespace profiler
} // namespace ck
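For orientation, here is a hypothetical invocation of the new entry point for a rank-4 NHWC problem reducing over N, H, W. The lengths, init_method, and all-float type choices are illustrative and are not taken from the commit:

    // Hypothetical call: rank-4 NHWC batchnorm backward, reducing dims {0, 1, 2}.
    bool ok = ck::profiler::profile_batchnorm_backward_impl<float,  // XDataType
                                                            float,  // DxDataType
                                                            float,  // DyDataType
                                                            float,  // AccDataType
                                                            float,  // ScaleDataType
                                                            float,  // DscaleDbiasDataType
                                                            float,  // MeanVarDataType
                                                            4,      // Rank
                                                            3>      // NumBatchNormReduceDim
        (true,                // do_verification
         2,                   // init_method
         false,               // do_dumpout
         true,                // time_kernel
         {128, 16, 16, 1024}, // inOutLengths (N, H, W, C)
         {0, 1, 2},           // reduceDims
         true,                // haveSavedMeanInvVar
         1e-5);               // epsilon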
profiler/include/profiler/profile_batchnorm_forward_impl.hpp (new file, 0 → 100644) (View file @ 289f15de)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iomanip>
#include <stdexcept>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp"

namespace ck {
namespace profiler {

template <typename XDataType,
          typename YDataType,
          typename AccDataType,
          typename ScaleDataType,
          typename BiasDataType,
          typename MeanVarDataType,
          index_t Rank,
          index_t NumBatchNormReduceDim>
bool profile_batchnorm_forward_impl(int do_verification,
                                    int init_method,
                                    bool do_dumpout,
                                    bool time_kernel,
                                    const std::vector<size_t> inOutLengths,
                                    const std::vector<int> reduceDims,
                                    bool updateMovingAverage,
                                    bool saveMeanAndInvVariance,
                                    double averageFactor,
                                    double epsilon)
{
    if(inOutLengths.size() != Rank || reduceDims.size() != NumBatchNormReduceDim)
    {
        throw std::runtime_error("Invalid tensor lengths or number of reduce dimensions!");
    };

    std::vector<size_t> scaleBiasMeanVarLengths;

    // used for calculating the effective transferred bytes by each operation
    size_t total_length;
    size_t invariant_length = 1;

    total_length =
        std::accumulate(inOutLengths.begin(), inOutLengths.end(), 1, std::multiplies<size_t>{});

    if(std::any_of(reduceDims.begin(), reduceDims.end(), [](int d) { return d < 0 || d >= Rank; }))
        throw std::runtime_error("Invalid reduce dimensions!");

    for(int dim = 0; dim < Rank; dim++)
    {
        if(std::none_of(reduceDims.begin(), reduceDims.end(), [&](int d) { return dim == d; }))
        {
            scaleBiasMeanVarLengths.push_back(inOutLengths[dim]);
            invariant_length *= inOutLengths[dim];
        };
    }

    // input data of the batchnorm forward algorithm
    Tensor<XDataType> x(inOutLengths);
    Tensor<ScaleDataType> bnScale(scaleBiasMeanVarLengths);
    Tensor<BiasDataType> bnBias(scaleBiasMeanVarLengths);

    // output data of the batchnorm forward algorithm
    Tensor<YDataType> y_ref(inOutLengths);
    Tensor<YDataType> y(inOutLengths);
    Tensor<MeanVarDataType> resultSaveMean_ref(scaleBiasMeanVarLengths);
    Tensor<MeanVarDataType> resultSaveInvVariance_ref(scaleBiasMeanVarLengths);
    Tensor<MeanVarDataType> resultRunningMean_ref(scaleBiasMeanVarLengths);
    Tensor<MeanVarDataType> resultRunningVariance_ref(scaleBiasMeanVarLengths);

    auto inOutStrides            = x.mDesc.GetStrides();
    auto scaleBiasMeanVarStrides = bnScale.mDesc.GetStrides();

    std::size_t num_thread = std::thread::hardware_concurrency();

    if(updateMovingAverage)
    {
        const float x_mean       = 0.0f;
        const float x_stddev     = 1.0f;
        const float noise_stddev = 0.04f;

        // input data in normal distribution
        x.GenerateTensorValue(GeneratorTensor_4<XDataType>{x_mean, x_stddev}, num_thread);

        // initialize the runningMean to be values with tiny variation to the mean of the x
        // values
        resultRunningMean_ref.GenerateTensorValue(
            GeneratorTensor_4<MeanVarDataType>{x_mean, noise_stddev}, num_thread);

        // initialize the runningVariance to be values with tiny variation to the variance of
        // the x values
        resultRunningVariance_ref.GenerateTensorValue(
            GeneratorTensor_4<MeanVarDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);
    }
    else
    {
        if constexpr(ck::is_same_v<XDataType, int8_t>)
            x.GenerateTensorValue(GeneratorTensor_2<XDataType>{-5, 5}, num_thread);
        else
            x.GenerateTensorValue(GeneratorTensor_3<XDataType>{-1.0f, 1.0f}, num_thread);
    };

    if(do_verification)
    {
        switch(init_method)
        {
        case 0:
            bnScale.GenerateTensorValue(GeneratorTensor_0<ScaleDataType>{}, num_thread);
            bnBias.GenerateTensorValue(GeneratorTensor_0<BiasDataType>{}, num_thread);
            break;
        case 1:
            bnScale.GenerateTensorValue(GeneratorTensor_1<ScaleDataType>{1}, num_thread);
            bnBias.GenerateTensorValue(GeneratorTensor_1<BiasDataType>{0}, num_thread);
            break;
        case 2:
            bnScale.GenerateTensorValue(GeneratorTensor_2<ScaleDataType>{-5, 5}, num_thread);
            bnBias.GenerateTensorValue(GeneratorTensor_2<BiasDataType>{-5, 5}, num_thread);
            break;
        default:
            bnScale.GenerateTensorValue(GeneratorTensor_3<ScaleDataType>{-1.0f, 1.0f}, num_thread);
            bnBias.GenerateTensorValue(GeneratorTensor_3<BiasDataType>{-1.0f, 1.0f}, num_thread);
        }
    };

    // these buffers are usually provided by the user application
    DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
    DeviceMem y_dev(sizeof(XDataType) * y.mDesc.GetElementSpaceSize());
    DeviceMem bnScale_dev(sizeof(ScaleDataType) * bnScale.mDesc.GetElementSpaceSize());
    DeviceMem bnBias_dev(sizeof(BiasDataType) * bnBias.mDesc.GetElementSpaceSize());

    // mean_dev or resultSaveMean_dev
    DeviceMem resultSaveMean_dev(sizeof(MeanVarDataType) *
                                 resultSaveMean_ref.mDesc.GetElementSpaceSize());
    // meansquare_dev or resultSaveInvVariance_dev
    DeviceMem resultSaveInvVariance_dev(sizeof(MeanVarDataType) *
                                        resultSaveInvVariance_ref.mDesc.GetElementSpaceSize());
    // resultRunningMean_dev
    DeviceMem resultRunningMean_dev(sizeof(MeanVarDataType) *
                                    resultRunningMean_ref.mDesc.GetElementSpaceSize());
    // resultRunningVariance_dev
    DeviceMem resultRunningVariance_dev(sizeof(MeanVarDataType) *
                                        resultRunningVariance_ref.mDesc.GetElementSpaceSize());

    x_dev.ToDevice(x.mData.data());
    bnScale_dev.ToDevice(bnScale.mData.data());
    bnBias_dev.ToDevice(bnBias.mData.data());

    if(updateMovingAverage)
    {
        resultRunningMean_dev.ToDevice(resultRunningMean_ref.mData.data());
        resultRunningVariance_dev.ToDevice(resultRunningVariance_ref.mData.data());
    };

    // used for storing the device result for verification when updateMovingAverage is enabled
    Tensor<MeanVarDataType> resultRunningMean(scaleBiasMeanVarLengths);
    Tensor<MeanVarDataType> resultRunningVariance(scaleBiasMeanVarLengths);

    // used for storing the device result for verification when saveMeanAndInvVariance is enabled
    Tensor<MeanVarDataType> resultSaveMean(scaleBiasMeanVarLengths);
    Tensor<MeanVarDataType> resultSaveInvVariance(scaleBiasMeanVarLengths);

    std::array<index_t, Rank> arrInOutLengths;
    std::array<index_t, Rank> arrInOutStrides;
    std::array<index_t, Rank - NumBatchNormReduceDim> arrScaleBiasMeanVarLengths;
    std::array<index_t, Rank - NumBatchNormReduceDim> arrScaleBiasMeanVarStrides;
    std::array<int, NumBatchNormReduceDim> arrReduceDims;

    std::copy(inOutLengths.begin(), inOutLengths.end(), arrInOutLengths.begin());
    std::copy(inOutStrides.begin(), inOutStrides.end(), arrInOutStrides.begin());
    std::copy(scaleBiasMeanVarLengths.begin(),
              scaleBiasMeanVarLengths.end(),
              arrScaleBiasMeanVarLengths.begin());
    std::copy(scaleBiasMeanVarStrides.begin(),
              scaleBiasMeanVarStrides.end(),
              arrScaleBiasMeanVarStrides.begin());
    std::copy(reduceDims.begin(), reduceDims.end(), arrReduceDims.begin());

    using PassThroughOp = ck::tensor_operation::element_wise::PassThrough;

    // add device batchnorm-forward instances
    using DeviceOp = ck::tensor_operation::device::DeviceBatchNormFwd<XDataType,
                                                                      YDataType,
                                                                      AccDataType,
                                                                      ScaleDataType,
                                                                      BiasDataType,
                                                                      MeanVarDataType,
                                                                      PassThroughOp,
                                                                      Rank,
                                                                      NumBatchNormReduceDim>;

    // get device op instances
    const auto instance_ptrs =
        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
            DeviceOp>::GetInstances();

    std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;

    std::string best_instance_name;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;

    if(do_verification)
    {
        using ReferenceBatchNormFwdInstance =
            ck::tensor_operation::host::ReferenceBatchNormFwd<XDataType,
                                                              YDataType,
                                                              AccDataType,
                                                              ScaleDataType,
                                                              BiasDataType,
                                                              MeanVarDataType,
                                                              PassThroughOp,
                                                              Rank,
                                                              NumBatchNormReduceDim>;

        auto batchNormFwd_ref = ReferenceBatchNormFwdInstance{};

        auto argument_ptr_ref = batchNormFwd_ref.MakeArgumentPointer(
            arrInOutLengths,
            arrInOutStrides,
            arrInOutStrides,
            arrReduceDims,
            arrScaleBiasMeanVarLengths,
            arrScaleBiasMeanVarStrides,
            arrScaleBiasMeanVarStrides,
            arrScaleBiasMeanVarStrides,
            x.mData.data(),
            bnScale.mData.data(),
            bnBias.mData.data(),
            epsilon,
            PassThroughOp{},
            y_ref.mData.data(),
            saveMeanAndInvVariance ? resultSaveMean_ref.mData.data() : nullptr,
            saveMeanAndInvVariance ? resultSaveInvVariance_ref.mData.data() : nullptr,
            averageFactor,
            updateMovingAverage ? resultRunningMean_ref.mData.data() : nullptr,
            updateMovingAverage ? resultRunningVariance_ref.mData.data() : nullptr);

        if(!batchNormFwd_ref.IsSupportedArgument(argument_ptr_ref.get()))
        {
            std::cout << "The runtime parameters not supported by the reference instance, exiting!"
                      << std::endl;
            return (false);
        };

        auto invoker_ptr_ref = batchNormFwd_ref.MakeInvokerPointer();

        (void)invoker_ptr_ref->Run(argument_ptr_ref.get());
    }

    int num_kernel = 0;
    bool pass      = true;

    for(auto& inst_ptr : instance_ptrs)
    {
        auto argument_ptr = inst_ptr->MakeArgumentPointer(
            arrInOutLengths,
            arrInOutStrides,
            arrInOutStrides,
            arrReduceDims,
            arrScaleBiasMeanVarLengths,
            arrScaleBiasMeanVarStrides,
            arrScaleBiasMeanVarStrides,
            arrScaleBiasMeanVarStrides,
            x_dev.GetDeviceBuffer(),
            bnScale_dev.GetDeviceBuffer(),
            bnBias_dev.GetDeviceBuffer(),
            epsilon,
            PassThroughOp{},
            y_dev.GetDeviceBuffer(),
            saveMeanAndInvVariance ? resultSaveMean_dev.GetDeviceBuffer() : nullptr,
            saveMeanAndInvVariance ? resultSaveInvVariance_dev.GetDeviceBuffer() : nullptr,
            averageFactor,
            updateMovingAverage ? resultRunningMean_dev.GetDeviceBuffer() : nullptr,
            updateMovingAverage ? resultRunningVariance_dev.GetDeviceBuffer() : nullptr);

        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            num_kernel++;
        }
        else
        {
            if(time_kernel)
            {
                std::cout << inst_ptr->GetTypeString()
                          << " skipped due to unsupported argument: " << std::endl;
            }

            continue;
        };

        size_t workspace_sz = inst_ptr->GetWorkSpaceSize(argument_ptr.get());

        DeviceMem workspace_dev(workspace_sz);

        inst_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());

        auto invoker_ptr = inst_ptr->MakeInvokerPointer();

        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

        size_t num_bytes = 0;

        // inputing of x, scale, bias, outputing of y
        num_bytes += total_length * (sizeof(XDataType) + sizeof(YDataType)) +
                     invariant_length * (sizeof(ScaleDataType) + sizeof(BiasDataType));

        // outputing of mean, inv-variance
        num_bytes += saveMeanAndInvVariance ? invariant_length * sizeof(MeanVarDataType) * 2 : 0;

        // updating of moving mean, variance
        num_bytes += updateMovingAverage ? invariant_length * sizeof(MeanVarDataType) * 4 : 0;

        float gb_per_sec = num_bytes / 1.E6 / avg_time;

        if(time_kernel)
            std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, "
                      << inst_ptr->GetTypeString() << std::endl;

        if(avg_time < best_avg_time)
        {
            best_instance_name = inst_ptr->GetTypeString();
            best_avg_time      = avg_time;
            best_gb_per_sec    = gb_per_sec;
        }

        if(do_verification)
        {
            using ck::utils::check_err;

            bool single_pass;

            y_dev.FromDevice(y.mData.data());

            if constexpr(ck::is_same_v<YDataType, ck::bhalf_t>)
                single_pass = check_err(y.mData, y_ref.mData, "y results", 1e-2, 1e-2);
            else
                single_pass = check_err(y.mData, y_ref.mData, "y results", 4e-3, 4e-3);

            if(updateMovingAverage)
            {
                resultRunningMean_dev.FromDevice(resultRunningMean.mData.data());
                resultRunningVariance_dev.FromDevice(resultRunningVariance.mData.data());

                // clang-format off
                single_pass = single_pass && check_err(resultRunningMean.mData, resultRunningMean_ref.mData, "average mean results", 1.5e-5, 1.5e-5);
                single_pass = single_pass && check_err(resultRunningVariance.mData, resultRunningVariance_ref.mData, "average variance results", 1e-5, 1e-5);
                // clang-format on
            };

            if(saveMeanAndInvVariance)
            {
                resultSaveMean_dev.FromDevice(resultSaveMean.mData.data());
                resultSaveInvVariance_dev.FromDevice(resultSaveInvVariance.mData.data());

                // clang-format off
                single_pass = single_pass && check_err(resultSaveMean.mData, resultSaveMean_ref.mData, "mean results", 3e-5, 3e-5);
                single_pass = single_pass && check_err(resultSaveInvVariance.mData, resultSaveInvVariance_ref.mData, "inv-variance results", 7e-5, 7e-5);
                // clang-format on
            };

            pass = pass && single_pass;
        };

        if(do_dumpout)
        {
            using ck::host_common::dumpBufferToFile;
            // clang-format off
            dumpBufferToFile("dump_x.bin", x.mData.data(), x.mDesc.GetElementSize());
            dumpBufferToFile("dump_y.bin", y.mData.data(), y.mDesc.GetElementSize());
            dumpBufferToFile("dump_y_ref.bin", y_ref.mData.data(), y_ref.mDesc.GetElementSize());
            // clang-format off
            if(saveMeanAndInvVariance)
            {
                // clang-format off
                dumpBufferToFile("dump_mean.bin", resultSaveMean.mData.data(), resultSaveMean.mDesc.GetElementSize());
                dumpBufferToFile("dump_mean_ref.bin", resultSaveMean_ref.mData.data(), resultSaveMean_ref.mDesc.GetElementSize());
                dumpBufferToFile("dump_invvar.bin", resultSaveInvVariance.mData.data(), resultSaveInvVariance.mDesc.GetElementSize());
                dumpBufferToFile("dump_invvar_ref.bin", resultSaveInvVariance_ref.mData.data(), resultSaveInvVariance_ref.mDesc.GetElementSize());
                // clang-format on
            };
        };
    }

    if(time_kernel)
    {
        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
                  << best_instance_name << std::endl;
    }

    if(num_kernel == 0)
    {
        std::cout << "Error: No kernel is applicable" << std::endl;
        return false;
    }

    return pass;
}

} // namespace profiler
} // namespace ck
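Both new profilers report effective bandwidth as num_bytes / 1.E6 / avg_time. Since Run returns avg_time in milliseconds, bytes / 1e6 / ms equals (bytes / 1e9) / (ms / 1e3), i.e. gigabytes per second; moving 3.2e9 bytes in 2.5 ms, for example, comes out at 1280 GB/s. As a standalone helper:

    #include <cstddef>

    // GB/s from bytes moved and kernel time in milliseconds, matching the
    // num_bytes / 1.E6 / avg_time expression used by both batchnorm profilers.
    inline float effective_gb_per_sec(std::size_t num_bytes, float avg_time_ms)
    {
        return num_bytes / 1.E6 / avg_time_ms;
    }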
profiler/include/profile_conv_bwd_data_impl.hpp → profiler/include/profiler/profile_conv_bwd_data_impl.hpp (View file @ 289f15de)

@@ -209,8 +209,7 @@ bool profile_conv_bwd_data_impl(int do_verification,
     {
         in_device_buf.FromDevice(input_device_result.mData.data());

-        pass = pass & ck::utils::check_err(input_device_result.mData, input_host_result.mData);
+        pass = pass & ck::utils::check_err(input_device_result, input_host_result);

         if(do_log)
         {
profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp → profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp (View file @ 289f15de)

@@ -12,6 +12,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp"

 namespace ck {
@@ -68,19 +69,19 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
     auto f_host_tensor_descriptor =
         [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
+            using namespace ck::literals;
+
             if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
                          is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
                          is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                            std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
+                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
             }
             else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
                               is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
                               is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                            std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
+                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
             }
         };
@@ -92,8 +93,7 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
         f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
     // bias: assume contiguous 1d vector
-    Tensor<OutDataType> bias_k(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
+    Tensor<OutDataType> bias_k({K});
     // residual: assume same layout as output tensor
     Tensor<OutDataType> resi_n_k_ho_wo(f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
@@ -251,8 +251,7 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
     {
         out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());

-        ck::utils::check_err(out_n_k_ho_wo_device_result.mData, out_n_k_ho_wo_host_result.mData);
+        ck::utils::check_err(out_n_k_ho_wo_device_result, out_n_k_ho_wo_host_result);

         if(do_log)
         {
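The convolution lambdas above keep the logical index order (N, C, H, W) fixed and encode the layout entirely in the strides: for NHWC the element (n, c, h, w) sits at offset n*C*H*W + h*W*C + w*C + c, which is exactly the {C*H*W, 1, W*C, C} tuple. A compile-time spot check, with arbitrary dimension values:

    #include <cstddef>

    // NHWC stride check: dims ordered (N, C, H, W), strides {C*H*W, 1, W*C, C}.
    constexpr std::size_t N = 2, C = 3, H = 4, W = 5;

    constexpr std::size_t
    offset_nhwc(std::size_t n, std::size_t c, std::size_t h, std::size_t w)
    {
        return n * (C * H * W) + c * 1 + h * (W * C) + w * C;
    }

    // Stepping w by one jumps past the C channels of one pixel; stepping c by
    // one moves a single element, i.e. channels are innermost in memory.
    static_assert(offset_nhwc(0, 0, 0, 1) - offset_nhwc(0, 0, 0, 0) == C, "");
    static_assert(offset_nhwc(0, 1, 0, 0) - offset_nhwc(0, 0, 0, 0) == 1, "");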
profiler/include/profile_conv_fwd_bias_relu_impl.hpp → profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp (View file @ 289f15de)

@@ -12,6 +12,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp"

 namespace ck {
@@ -68,19 +69,19 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
     auto f_host_tensor_descriptor =
         [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
+            using namespace ck::literals;
+
             if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
                          is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
                          is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                            std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
+                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
             }
             else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
                               is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
                               is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                            std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
+                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
             }
         };
@@ -92,8 +93,7 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
         f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
     // bias: assume contiguous 1d vector
-    Tensor<OutDataType> bias_k(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
+    Tensor<OutDataType> bias_k({K});

     std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
     std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
@@ -239,8 +239,7 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
     {
         out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());

-        ck::utils::check_err(out_n_k_ho_wo_device_result.mData, out_n_k_ho_wo_host_result.mData);
+        ck::utils::check_err(out_n_k_ho_wo_device_result, out_n_k_ho_wo_host_result);

         if(do_log)
         {
profiler/include/profile_conv_fwd_impl.hpp → profiler/include/profiler/profile_conv_fwd_impl.hpp (View file @ 289f15de)

@@ -191,7 +191,7 @@ bool profile_conv_fwd_impl(int do_verification,
     {
         out_device_buf.FromDevice(device_output.mData.data());

-        pass = pass & ck::utils::check_err(device_output.mData, host_output.mData);
+        pass = pass & ck::utils::check_err(device_output, host_output);

         if(do_log)
         {
profiler/include/profile_convnd_bwd_data_impl.hpp → profiler/include/profiler/profile_convnd_bwd_data_impl.hpp (View file @ 289f15de)

@@ -453,7 +453,7 @@ bool profile_convnd_bwd_data_impl(int do_verification,
             std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl;
         }

-        success = ck::utils::check_err(input_host_result.mData, input_device_result.mData);
+        success = ck::utils::check_err(input_host_result, input_device_result);

         if(do_log)
         {
profiler/include/profile_convnd_bwd_weight_impl.hpp → profiler/include/profiler/profile_convnd_bwd_weight_impl.hpp (View file @ 289f15de)

@@ -433,7 +433,7 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
     {
         wei_device_buf.FromDevice(weights_device_result.mData.data());

-        success = ck::utils::check_err(weights_host_result.mData, weights_device_result.mData);
+        success = ck::utils::check_err(weights_host_result, weights_device_result);

         if(success == false)
         {
profiler/include/profile_elementwise_layernorm_impl.hpp → profiler/include/profiler/profile_elementwise_layernorm_impl.hpp (View file @ 289f15de)

@@ -13,6 +13,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"

 namespace ck {
@@ -68,8 +69,9 @@ bool profile_elementwise_layernorm_impl(int do_verification,
     std::vector<index_t> gammaBetaStride = {0, 1};

     auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) {
-        return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                    std::vector<std::size_t>({stride, 1}));
+        using namespace ck::literals;
+
+        return HostTensorDescriptor({row, col}, {stride, 1_uz});
     };

     Tensor<ADataType> a(length);
profiler/include/profile_gemm_add_add_fastgelu_impl.hpp → profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp (View file @ 289f15de)

@@ -16,6 +16,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

 namespace ck {
@@ -47,15 +48,15 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
 {
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
             if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
             }
             else
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
             }
         };
@@ -121,8 +122,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
     // run reference
     if(do_verification)
     {
-        Tensor<AccDataType> c_m_n(HostTensorDescriptor(std::vector<std::size_t>{
-            static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
+        Tensor<AccDataType> c_m_n({M, N});

         using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                                 BDataType,
@@ -223,8 +223,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
         {
             e_device_buf.FromDevice(e_m_n_device_result.mData.data());

-            pass = pass && ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
+            pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
         }
     }
     else
profiler/include/profile_gemm_bias_add_reduce_impl.hpp → profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp (View file @ 289f15de)

@@ -14,6 +14,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

 namespace ck {
@@ -75,21 +76,20 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
                                        int StrideD0)
 {
     auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
-        return HostTensorDescriptor(std::vector<std::size_t>({len}),
-                                    std::vector<std::size_t>({stride}));
+        return HostTensorDescriptor({len}, {stride});
     };

     auto f_host_tensor_descriptor2d =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
             if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
            }
        };
@@ -99,16 +99,12 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
     Tensor<BiasDataType> bias_n(f_host_tensor_descriptor1d(N, 1));
     Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_host_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> reduce1_m_host_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+    Tensor<ReduceDataType> reduce0_m_host_result({M});
+    Tensor<ReduceDataType> reduce1_m_host_result({M});

     Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_device_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> reduce1_m_device_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+    Tensor<ReduceDataType> reduce0_m_device_result({M});
+    Tensor<ReduceDataType> reduce1_m_device_result({M});

     std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
     std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
@@ -347,9 +343,9 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
         reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data());
         reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data());

-        ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
-        ck::utils::check_err(reduce0_m_device_result.mData, reduce0_m_host_result.mData);
-        ck::utils::check_err(reduce1_m_device_result.mData, reduce1_m_host_result.mData);
+        ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
+        ck::utils::check_err(reduce0_m_device_result, reduce0_m_host_result);
+        ck::utils::check_err(reduce1_m_device_result, reduce1_m_host_result);

         if(do_log)
         {