gaoqiong / composable_kernel / Commits

Commit e00a943e, authored May 17, 2022 by myamlak
Merge remote-tracking branch 'origin/develop' into myamlak/cgemm
Parents: ffe12e2e, 9f71ff48
Changes: 162
Showing 20 changed files with 316 additions and 128 deletions (+316 −128)
library/src/tensor_operation_instance/gpu/convnd_bwd_data/CMakeLists.txt  +1 −1
library/src/tensor_operation_instance/gpu/device_conv2d.cpp  +201 −0
library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt  +1 −2
library/src/tensor_operation_instance/gpu/gemm_bias2d/CMakeLists.txt  +1 −3
library/src/tensor_operation_instance/gpu/gemm_bias_relu/CMakeLists.txt  +1 −3
library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/CMakeLists.txt  +1 −3
library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt  +1 −1
library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt  +1 −3
library/src/utility/CMakeLists.txt  +8 −8
library/src/utility/conv_util.cpp  +62 −61
profiler/CMakeLists.txt  +1 −1
profiler/include/profile_batched_gemm_impl.hpp  +4 −3
profiler/include/profile_batched_gemm_reduce_impl.hpp  +6 −24
profiler/include/profile_conv_bwd_data_impl.hpp  +3 −2
profiler/include/profile_conv_bwd_weight_impl.hpp  +8 −2
profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp  +3 −2
profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp  +3 −2
profiler/include/profile_conv_fwd_bias_relu_impl.hpp  +3 −2
profiler/include/profile_convnd_bwd_data_impl.hpp  +4 −3
profiler/include/profile_gemm_bias_2d_impl.hpp  +3 −2
library/src/tensor_operation_instance/gpu/convnd_bwd_data/CMakeLists.txt

@@ -14,7 +14,7 @@ set(DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE
     device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp;
 )
-add_library(device_convnd_bwd_data_instance SHARED ${DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE})
+add_library(device_convnd_bwd_data_instance OBJECT ${DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE})
 target_compile_features(device_convnd_bwd_data_instance PUBLIC)
 set_target_properties(device_convnd_bwd_data_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
 install(TARGETS device_convnd_bwd_data_instance LIBRARY DESTINATION lib)
library/src/tensor_operation_instance/gpu/device_conv2d.cpp (new file, 0 → 100644)

#include <stdlib.h>
#include "config.hpp"
#include "device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation.hpp"
#include "device_operation_instance.hpp"
#include "host_interface.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace device_conv2d_fwd_instance {

using PassThrough = ck::tensor_operation::element_wise::PassThrough;

void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(
    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);

void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(
    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);

void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(
    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);

void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(
    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);

void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(
    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);

} // namespace device_conv2d_fwd_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

using PassThrough = ck::tensor_operation::element_wise::PassThrough;

struct DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl
{
    std::unique_ptr<DeviceConvFwdPtr_t::BaseArgument>
    MakeArgumentPointer(void* in_ptr,
                        void* wei_ptr,
                        void* out_ptr,
                        size_t N,
                        size_t K,
                        size_t C,
                        std::vector<ck::index_t> input_spatial_lengths,
                        std::vector<ck::index_t> filter_spatial_lengths,
                        std::vector<ck::index_t> output_spatial_lengths,
                        std::vector<ck::index_t> conv_filter_strides,
                        std::vector<ck::index_t> conv_filter_dilations,
                        std::vector<ck::index_t> input_left_pads,
                        std::vector<ck::index_t> input_right_pads) const
    {
        return el->MakeArgumentPointer(in_ptr,
                                       wei_ptr,
                                       out_ptr,
                                       N,
                                       K,
                                       C,
                                       input_spatial_lengths,
                                       filter_spatial_lengths,
                                       output_spatial_lengths,
                                       conv_filter_strides,
                                       conv_filter_dilations,
                                       input_left_pads,
                                       input_right_pads,
                                       PassThrough{},
                                       PassThrough{},
                                       PassThrough{});
    }

    std::unique_ptr<DeviceConvFwdPtr_t::BaseInvoker> MakeInvokerPointer() const
    {
        return el->MakeInvokerPointer();
    }

    std::string GetTypeString() { return el->GetTypeString(); }

    bool IsSupportedArgument(const DeviceConvFwdPtr_t::BaseArgument* arg)
    {
        return el->IsSupportedArgument(arg);
    }

    ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough> el;
};

DeviceConvFwdPtr_t::DeviceConvFwdPtr_t() : pImpl(nullptr) {}
DeviceConvFwdPtr_t::~DeviceConvFwdPtr_t() = default;
DeviceConvFwdPtr_t::DeviceConvFwdPtr_t(DeviceConvFwdPtr_t&&) = default;
DeviceConvFwdPtr_t::DeviceConvFwdPtr_t(DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl& other)
    : pImpl(std::make_unique<DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl>(std::move(other)))
{
}

std::unique_ptr<DeviceConvFwdPtr_t::BaseArgument>
DeviceConvFwdPtr_t::MakeArgumentPointer(void* in_ptr,
                                        void* wei_ptr,
                                        void* out_ptr,
                                        size_t N,
                                        size_t K,
                                        size_t C,
                                        std::vector<ck::index_t> input_spatial_lengths,
                                        std::vector<ck::index_t> filter_spatial_lengths,
                                        std::vector<ck::index_t> output_spatial_lengths,
                                        std::vector<ck::index_t> conv_filter_strides,
                                        std::vector<ck::index_t> conv_filter_dilations,
                                        std::vector<ck::index_t> input_left_pads,
                                        std::vector<ck::index_t> input_right_pads) const
{
    return pImpl->MakeArgumentPointer(in_ptr,
                                      wei_ptr,
                                      out_ptr,
                                      N,
                                      K,
                                      C,
                                      input_spatial_lengths,
                                      filter_spatial_lengths,
                                      output_spatial_lengths,
                                      conv_filter_strides,
                                      conv_filter_dilations,
                                      input_left_pads,
                                      input_right_pads);
}

std::unique_ptr<DeviceConvFwdPtr_t::BaseInvoker> DeviceConvFwdPtr_t::MakeInvokerPointer() const
{
    return pImpl->MakeInvokerPointer();
}

std::string DeviceConvFwdPtr_t::GetTypeString() { return pImpl->GetTypeString(); }

bool DeviceConvFwdPtr_t::IsSupportedArgument(const DeviceConvFwdPtr_t::BaseArgument* arg_ptr)
{
    return pImpl->IsSupportedArgument(arg_ptr);
}

using namespace ck::tensor_operation::device::device_conv2d_fwd_instance;

void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t(
    std::vector<DeviceConvFwdPtr_t>& instances)
{
    std::vector<ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
        local_instances;
    add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(local_instances);
    for(auto& kinder : local_instances)
    {
        DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
        instances.emplace_back(tmp);
    }
    return;
}

void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t(
    std::vector<DeviceConvFwdPtr_t>& instances)
{
    std::vector<ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
        local_instances;
    add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(local_instances);
    for(auto& kinder : local_instances)
    {
        DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
        instances.emplace_back(tmp); // Perhaps we can do better
    }
    return;
}

void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t(
    std::vector<DeviceConvFwdPtr_t>& instances)
{
    std::vector<ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
        local_instances;
    add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(local_instances);
    for(auto& kinder : local_instances)
    {
        DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
        instances.emplace_back(tmp); // Perhaps we can do better
    }
    return;
}

void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t(
    std::vector<DeviceConvFwdPtr_t>& instances)
{
    std::vector<ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
        local_instances;
    add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(local_instances);
    for(auto& kinder : local_instances)
    {
        DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
        instances.emplace_back(tmp); // Perhaps we can do better
    }
    return;
}

void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(
    std::vector<DeviceConvFwdPtr_t>& instances)
{
    std::vector<ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
        local_instances;
    add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(local_instances);
    for(auto& kinder : local_instances)
    {
        DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
        instances.emplace_back(tmp);
    }
    return;
}
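The new file hides the templated ck::tensor_operation::device::DeviceConvFwdPtr behind the pImpl wrapper DeviceConvFwdPtr_t (declared in host_interface.hpp), so a client can enumerate conv2d forward instances without including the device templates. A minimal consumer sketch follows, using only the functions visible in this diff; the problem sizes and the idea of probing support with null device pointers are illustrative assumptions, not part of the commit:

// Sketch only: hypothetical caller of the wrapper defined above.
#include <cstdio>
#include <vector>
#include "config.hpp"         // for ck::index_t
#include "host_interface.hpp" // assumed to declare DeviceConvFwdPtr_t and the *_instances_t functions

void list_supported_f16_conv2d_instances()
{
    std::vector<DeviceConvFwdPtr_t> instances;
    add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t(instances);

    // Illustrative 2D problem: N=128, K=256, C=192, 71x71 input, 3x3 filter,
    // stride 2, dilation 1, pad 1/1 (matches the ConvParams defaults elsewhere in this commit).
    std::vector<ck::index_t> in_len{71, 71}, filt_len{3, 3}, out_len{36, 36};
    std::vector<ck::index_t> strides{2, 2}, dilations{1, 1}, lpads{1, 1}, rpads{1, 1};

    for(auto& inst : instances)
    {
        // Real device buffers would come from hipMalloc; null pointers are used here
        // only because this sketch never launches the kernel.
        auto arg = inst.MakeArgumentPointer(nullptr, nullptr, nullptr,
                                            128, 256, 192,
                                            in_len, filt_len, out_len,
                                            strides, dilations, lpads, rpads);
        if(inst.IsSupportedArgument(arg.get()))
            std::printf("%s\n", inst.GetTypeString().c_str());
    }
}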
library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt

@@ -35,10 +35,9 @@ set(DEVICE_GEMM_INSTANCE_SOURCE
     device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp;
 )
-add_library(device_gemm_instance SHARED ${DEVICE_GEMM_INSTANCE_SOURCE})
+add_library(device_gemm_instance OBJECT ${DEVICE_GEMM_INSTANCE_SOURCE})
 target_compile_features(device_gemm_instance PUBLIC)
 set_target_properties(device_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
 install(TARGETS device_gemm_instance LIBRARY DESTINATION lib)
 clang_tidy_check(device_gemm_instance)
library/src/tensor_operation_instance/gpu/gemm_bias2d/CMakeLists.txt

@@ -10,9 +10,7 @@ set(DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE
     device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_nk_mn_instance.cpp;
 )
-add_library(device_gemm_bias2d_instance SHARED ${DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE})
-target_compile_features(device_gemm_bias2d_instance PUBLIC)
+add_library(device_gemm_bias2d_instance OBJECT ${DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE})
 set_target_properties(device_gemm_bias2d_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
 install(TARGETS device_gemm_bias2d_instance LIBRARY DESTINATION lib)
 clang_tidy_check(device_gemm_bias2d_instance)
library/src/tensor_operation_instance/gpu/gemm_bias_relu/CMakeLists.txt

@@ -6,9 +6,7 @@ set(DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE
     device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_nk_mn_instance.cpp;
 )
-add_library(device_gemm_bias_relu_instance SHARED ${DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE})
-target_compile_features(device_gemm_bias_relu_instance PUBLIC)
+add_library(device_gemm_bias_relu_instance OBJECT ${DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE})
 set_target_properties(device_gemm_bias_relu_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
 install(TARGETS device_gemm_bias_relu_instance LIBRARY DESTINATION lib)
 clang_tidy_check(device_gemm_bias_relu_instance)
library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/CMakeLists.txt

@@ -6,9 +6,7 @@ set(DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE
     device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_nk_mn_instance.cpp;
 )
-add_library(device_gemm_bias_relu_add_instance SHARED ${DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE})
-target_compile_features(device_gemm_bias_relu_add_instance PUBLIC)
+add_library(device_gemm_bias_relu_add_instance OBJECT ${DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE})
 set_target_properties(device_gemm_bias_relu_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
 install(TARGETS device_gemm_bias_relu_add_instance LIBRARY DESTINATION lib)
 clang_tidy_check(device_gemm_bias_relu_add_instance)
library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt

@@ -6,7 +6,7 @@ set(DEVICE_GROUPED_GEMM_INSTANCE_SOURCE
     device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp;
 )
-add_library(device_grouped_gemm_instance SHARED ${DEVICE_GROUPED_GEMM_INSTANCE_SOURCE})
+add_library(device_grouped_gemm_instance OBJECT ${DEVICE_GROUPED_GEMM_INSTANCE_SOURCE})
 target_compile_features(device_grouped_gemm_instance PUBLIC)
 set_target_properties(device_grouped_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt

@@ -38,9 +38,7 @@ set(DEVICE_REDUCE_INSTANCE_SOURCE
     device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp;
 )
-add_library(device_reduce_instance SHARED ${DEVICE_REDUCE_INSTANCE_SOURCE})
-target_compile_features(device_reduce_instance PUBLIC)
+add_library(device_reduce_instance OBJECT ${DEVICE_REDUCE_INSTANCE_SOURCE})
 set_target_properties(device_reduce_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
 install(TARGETS device_reduce_instance LIBRARY DESTINATION lib)
 clang_tidy_check(device_reduce_instance)
library/src/utility/CMakeLists.txt

@@ -8,14 +8,14 @@ include_directories(BEFORE
     ${PROJECT_SOURCE_DIR}/library/include/ck/library/utility
 )
-set(CONV_FWD_UTIL_SOURCE
-    conv_fwd_util.cpp
-)
-add_library(conv_fwd_util SHARED ${CONV_FWD_UTIL_SOURCE})
-target_link_libraries(conv_fwd_util PRIVATE host_tensor)
-target_compile_features(conv_fwd_util PUBLIC)
-set_target_properties(conv_fwd_util PROPERTIES POSITION_INDEPENDENT_CODE ON)
-target_include_directories(conv_fwd_util SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
+set(CONV_UTIL_SOURCE
+    conv_util.cpp
+)
+add_library(conv_util SHARED ${CONV_UTIL_SOURCE})
+target_link_libraries(conv_util PRIVATE host_tensor)
+target_compile_features(conv_util PUBLIC)
+set_target_properties(conv_util PROPERTIES POSITION_INDEPENDENT_CODE ON)
+target_include_directories(conv_util SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
-clang_tidy_check(conv_fwd_util)
+clang_tidy_check(conv_util)
library/src/utility/conv_fwd_util.cpp → library/src/utility/conv_util.cpp

-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"

 namespace ck {
 namespace utils {
@@ -37,16 +37,16 @@ std::size_t get_flops(ck::index_t N,
 }

 ConvParams::ConvParams()
-    : num_dim_spatial(2),
-      N(128),
-      K(256),
-      C(192),
-      filter_spatial_lengths(2, 3),
-      input_spatial_lengths(2, 71),
-      conv_filter_strides(2, 2),
-      conv_filter_dilations(2, 1),
-      input_left_pads(2, 1),
-      input_right_pads(2, 1)
+    : num_dim_spatial_(2),
+      N_(128),
+      K_(256),
+      C_(192),
+      filter_spatial_lengths_(2, 3),
+      input_spatial_lengths_(2, 71),
+      conv_filter_strides_(2, 2),
+      conv_filter_dilations_(2, 1),
+      input_left_pads_(2, 1),
+      input_right_pads_(2, 1)
 {
 }
@@ -60,23 +60,23 @@ ConvParams::ConvParams(ck::index_t n_dim,
                        const std::vector<ck::index_t>& dilations,
                        const std::vector<ck::index_t>& left_pads,
                        const std::vector<ck::index_t>& right_pads)
-    : num_dim_spatial(n_dim),
-      N(n_batch),
-      K(n_out_channels),
-      C(n_in_channels),
-      filter_spatial_lengths(filters_len),
-      input_spatial_lengths(input_len),
-      conv_filter_strides(strides),
-      conv_filter_dilations(dilations),
-      input_left_pads(left_pads),
-      input_right_pads(right_pads)
+    : num_dim_spatial_(n_dim),
+      N_(n_batch),
+      K_(n_out_channels),
+      C_(n_in_channels),
+      filter_spatial_lengths_(filters_len),
+      input_spatial_lengths_(input_len),
+      conv_filter_strides_(strides),
+      conv_filter_dilations_(dilations),
+      input_left_pads_(left_pads),
+      input_right_pads_(right_pads)
 {
-    if(ck::type_convert<ck::index_t>(filter_spatial_lengths.size()) != num_dim_spatial ||
-       ck::type_convert<ck::index_t>(input_spatial_lengths.size()) != num_dim_spatial ||
-       ck::type_convert<ck::index_t>(conv_filter_strides.size()) != num_dim_spatial ||
-       ck::type_convert<ck::index_t>(conv_filter_dilations.size()) != num_dim_spatial ||
-       ck::type_convert<ck::index_t>(input_left_pads.size()) != num_dim_spatial ||
-       ck::type_convert<ck::index_t>(input_right_pads.size()) != num_dim_spatial)
+    if(ck::type_convert<ck::index_t>(filter_spatial_lengths_.size()) != num_dim_spatial_ ||
+       ck::type_convert<ck::index_t>(input_spatial_lengths_.size()) != num_dim_spatial_ ||
+       ck::type_convert<ck::index_t>(conv_filter_strides_.size()) != num_dim_spatial_ ||
+       ck::type_convert<ck::index_t>(conv_filter_dilations_.size()) != num_dim_spatial_ ||
+       ck::type_convert<ck::index_t>(input_left_pads_.size()) != num_dim_spatial_ ||
+       ck::type_convert<ck::index_t>(input_right_pads_.size()) != num_dim_spatial_)
     {
         throw(std::runtime_error("ConvParams::GetOutputSpatialLengths: "
@@ -86,27 +86,28 @@ ConvParams::ConvParams(ck::index_t n_dim,
 std::vector<ck::index_t> ConvParams::GetOutputSpatialLengths() const
 {
-    if(ck::type_convert<ck::index_t>(filter_spatial_lengths.size()) != num_dim_spatial ||
-       ck::type_convert<ck::index_t>(input_spatial_lengths.size()) != num_dim_spatial ||
-       ck::type_convert<ck::index_t>(conv_filter_strides.size()) != num_dim_spatial ||
-       ck::type_convert<ck::index_t>(conv_filter_dilations.size()) != num_dim_spatial ||
-       ck::type_convert<ck::index_t>(input_left_pads.size()) != num_dim_spatial ||
-       ck::type_convert<ck::index_t>(input_right_pads.size()) != num_dim_spatial)
+    if(ck::type_convert<ck::index_t>(filter_spatial_lengths_.size()) != num_dim_spatial_ ||
+       ck::type_convert<ck::index_t>(input_spatial_lengths_.size()) != num_dim_spatial_ ||
+       ck::type_convert<ck::index_t>(conv_filter_strides_.size()) != num_dim_spatial_ ||
+       ck::type_convert<ck::index_t>(conv_filter_dilations_.size()) != num_dim_spatial_ ||
+       ck::type_convert<ck::index_t>(input_left_pads_.size()) != num_dim_spatial_ ||
+       ck::type_convert<ck::index_t>(input_right_pads_.size()) != num_dim_spatial_)
     {
         throw(std::runtime_error("ConvParams::GetOutputSpatialLengths: "
                                  "parameter size is different from number of declared dimensions!"));
     }

-    std::vector<ck::index_t> out_spatial_len(num_dim_spatial, 0);
-    for(ck::index_t i = 0; i < num_dim_spatial; ++i)
+    std::vector<ck::index_t> out_spatial_len(num_dim_spatial_, 0);
+    for(ck::index_t i = 0; i < num_dim_spatial_; ++i)
     {
         // XEff = (X - 1) * conv_dilation_w + 1;
         // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
-        const ck::index_t idx_eff = (filter_spatial_lengths[i] - 1) * conv_filter_dilations[i] + 1;
-        out_spatial_len[i] =
-            (input_spatial_lengths[i] + input_left_pads[i] + input_right_pads[i] - idx_eff) /
-                conv_filter_strides[i] +
-            1;
+        const ck::index_t idx_eff =
+            (filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1;
+        out_spatial_len[i] =
+            (input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - idx_eff) /
+                conv_filter_strides_[i] +
+            1;
     }

     return out_spatial_len;
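As a quick check of the Wo formula in this hunk, plug in the defaults from ConvParams::ConvParams() above (input 71, filter 3, stride 2, dilation 1, left/right pad 1): XEff = (3 − 1) · 1 + 1 = 3 and Wo = (71 + 1 + 1 − 3) / 2 + 1 = 36, so the default 2D problem yields a 36 × 36 output.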
@@ -116,40 +117,40 @@ ConvParams parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[])
 {
     ck::utils::conv::ConvParams params;

-    params.num_dim_spatial = num_dim_spatial;
-    params.N               = std::stoi(argv[arg_idx++]);
-    params.K               = std::stoi(argv[arg_idx++]);
-    params.C               = std::stoi(argv[arg_idx++]);
+    params.num_dim_spatial_ = num_dim_spatial;
+    params.N_               = std::stoi(argv[arg_idx++]);
+    params.K_               = std::stoi(argv[arg_idx++]);
+    params.C_               = std::stoi(argv[arg_idx++]);

-    params.filter_spatial_lengths.resize(num_dim_spatial);
+    params.filter_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_spatial_lengths.resize(num_dim_spatial);
+    params.input_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_strides.resize(num_dim_spatial);
+    params.conv_filter_strides_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_dilations.resize(num_dim_spatial);
+    params.conv_filter_dilations_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_left_pads.resize(num_dim_spatial);
+    params.input_left_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_right_pads.resize(num_dim_spatial);
+    params.input_right_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
     }

     return params;
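Reading the loops above in order, parse_conv_params consumes the command line as N K C, then num_dim_spatial filter lengths, input lengths, strides, dilations, left pads and right pads, each as num_dim_spatial integers. For the 2D case that would be, with purely illustrative values: 128 256 192  3 3  71 71  2 2  1 1  1 1  1 1.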
@@ -228,12 +229,12 @@ HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector<std::size_t>&
 std::ostream& operator<<(std::ostream& os, const ck::utils::conv::ConvParams& p)
 {
     os << "ConvParams {"
-       << "\n num_dim_spatial: " << p.num_dim_spatial
-       << "\n N: " << p.N
-       << "\n K: " << p.K
-       << "\n C: " << p.C
-       << "\n filter_spatial_lengths: " << p.filter_spatial_lengths
-       << "\n input_spatial_lengths: " << p.input_spatial_lengths
-       << "\n conv_filter_strides: " << p.conv_filter_strides
-       << "\n conv_filter_dilations: " << p.conv_filter_dilations
-       << "\n input_left_pads: " << p.input_left_pads
-       << "\n input_right_pads: " << p.input_right_pads;
+       << "\n num_dim_spatial: " << p.num_dim_spatial_
+       << "\n N: " << p.N_
+       << "\n K: " << p.K_
+       << "\n C: " << p.C_
+       << "\n filter_spatial_lengths: " << p.filter_spatial_lengths_
+       << "\n input_spatial_lengths: " << p.input_spatial_lengths_
+       << "\n conv_filter_strides: " << p.conv_filter_strides_
+       << "\n conv_filter_dilations: " << p.conv_filter_dilations_
+       << "\n input_left_pads: " << p.input_left_pads_
+       << "\n input_right_pads: " << p.input_right_pads_;
     return os;
 }
profiler/CMakeLists.txt

@@ -43,7 +43,7 @@ set(PROFILER_SOURCE
 add_executable(ckProfiler ${PROFILER_SOURCE})

 target_link_libraries(ckProfiler PRIVATE host_tensor)
-target_link_libraries(ckProfiler PRIVATE conv_fwd_util)
+target_link_libraries(ckProfiler PRIVATE conv_util)
 target_link_libraries(ckProfiler PRIVATE device_gemm_reduce_instance)
 target_link_libraries(ckProfiler PRIVATE device_gemm_instance)
 target_link_libraries(ckProfiler PRIVATE device_gemm_bias2d_instance)
profiler/include/profile_batched_gemm_impl.hpp

@@ -63,7 +63,7 @@ template <typename ADataType,
 bool profile_batched_gemm_impl(int do_verification,
                                int init_method,
                                bool do_log,
-                               int nrepeat,
+                               bool time_kernel,
                                int M,
                                int N,
                                int K,

@@ -356,11 +356,12 @@ bool profile_batched_gemm_impl(int do_verification,
 {
     std::string gemm_name = gemm_ptr->GetTypeString();

-    float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+    float ave_time =
+        invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

     std::size_t flop = std::size_t(2) * BatchCount * M * N * K;

-    std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * M +
+    std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
                              sizeof(CDataType) * M * N) * BatchCount;
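Downstream of this hunk the profiler turns flop, num_btype and ave_time (milliseconds) into the throughput figures it prints. That code is not part of the diff; the following is only a hedged sketch of the usual unit conversion, with variable names assumed:

// Sketch, not shown in this diff: ave_time is in ms, flop in FLOPs, num_btype in bytes.
float tflops     = static_cast<float>(flop) / 1.0e9f / ave_time;      // (flop / 1e12) / (ave_time / 1e3 s)
float gb_per_sec = static_cast<float>(num_btype) / 1.0e6f / ave_time; // (bytes / 1e9) / (ave_time / 1e3 s)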
profiler/include/profile_batched_gemm_reduce_impl.hpp

@@ -53,7 +53,7 @@ template <typename ADataType,
 bool profile_batched_gemm_reduce_impl(int do_verification,
                                       int init_method,
                                       bool do_log,
-                                      int nrepeat,
+                                      bool time_kernel,
                                       int M,
                                       int N,
                                       int K,

@@ -258,31 +258,13 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
         auto invoker_ptr = gemm_ptr->MakeInvokerPointer();

         if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
         {
-            // warm up
-            invoker_ptr->Run(argument_ptr.get());
-
-            // timing
-            float total_time = 0;
-
-            for(int i = 0; i < nrepeat; ++i)
-            {
-                // init DO, D1 to 0
-                d0_device_buf.SetZero();
-                d1_device_buf.SetZero();
-
-                KernelTimer timer;
-                timer.Start();
-
-                invoker_ptr->Run(argument_ptr.get());
-
-                timer.End();
-
-                total_time += timer.GetElapsedTime();
-            }
-
-            float ave_time = total_time / nrepeat;
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

             std::string gemm_name = gemm_ptr->GetTypeString();
profiler/include/profile_conv_bwd_data_impl.hpp

@@ -51,7 +51,7 @@ template <int NDimSpatial,
 void profile_conv_bwd_data_impl(int do_verification,
                                 int init_method,
                                 bool do_log,
-                                int nrepeat,
+                                bool time_kernel,
                                 ck::index_t N,
                                 ck::index_t K,
                                 ck::index_t C,

@@ -228,7 +228,8 @@ void profile_conv_bwd_data_impl(int do_verification,
 {
     std::string conv_name = conv_ptr->GetTypeString();

-    float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+    float ave_time =
+        invoker_ptr->Run(argument_ptr.get(), StreamControl{nullptr, time_kernel});

     std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
profiler/include/profile_conv_bwd_weight_impl.hpp

 #pragma once

+#include "stream_config.hpp"
 #include "config.hpp"
 #include "device.hpp"
 #include "host_tensor.hpp"

@@ -43,7 +45,7 @@ template <int NDimSpatial,
 bool profile_conv_bwd_weight_impl(int do_verification,
                                   int init_method,
                                   bool do_log,
-                                  int nrepeat,
+                                  bool time_kernel,
                                   ck::index_t N,
                                   ck::index_t K,
                                   ck::index_t C,

@@ -182,6 +184,7 @@ bool profile_conv_bwd_weight_impl(int do_verification,
     // profile device Conv instances
     bool pass = true;

     for(auto& conv_ptr : conv_ptrs)
     {
         // using atomic, so need to reset input

@@ -189,6 +192,7 @@ bool profile_conv_bwd_weight_impl(int do_verification,
         {
             wei_device_buf.SetZero();
         }

         auto argument_ptr = conv_ptr->MakeArgumentPointer(
             static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
             static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),

@@ -214,7 +218,8 @@ bool profile_conv_bwd_weight_impl(int do_verification,
         {
             std::string conv_name = conv_ptr->GetTypeString();

-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

             std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;

@@ -242,6 +247,7 @@ bool profile_conv_bwd_weight_impl(int do_verification,
             wei_device_buf.FromDevice(wei_k_c_y_x_device_result.mData.data());

             float max_error = check_error(wei_k_c_y_x_host_result, wei_k_c_y_x_device_result);

             if(max_error > 8)
             {
                 pass = false;
profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp

@@ -42,7 +42,7 @@ template <int NDimSpatial,
 void profile_conv_fwd_bias_relu_add_impl(int do_verification,
                                          int init_method,
                                          bool do_log,
-                                         int nrepeat,
+                                         bool time_kernel,
                                          ck::index_t N,
                                          ck::index_t K,
                                          ck::index_t C,

@@ -219,7 +219,8 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
 {
     std::string conv_name = op_ptr->GetTypeString();

-    float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+    float ave_time =
+        invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

     std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp

@@ -119,7 +119,7 @@ template <int NDimSpatial,
 void profile_conv_fwd_bias_relu_atomic_add_impl(int do_verification,
                                                 int init_method,
                                                 bool do_log,
-                                                int nrepeat,
+                                                bool time_kernel,
                                                 ck::index_t N,
                                                 ck::index_t K,
                                                 ck::index_t C,

@@ -275,7 +275,8 @@ void profile_conv_fwd_bias_relu_atomic_add_impl(int do_verification,
 {
     std::string conv_name = op_ptr->GetTypeString();

-    float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+    float ave_time =
+        invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

     std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
profiler/include/profile_conv_fwd_bias_relu_impl.hpp

@@ -41,7 +41,7 @@ template <int NDimSpatial,
 void profile_conv_fwd_bias_relu_impl(int do_verification,
                                      int init_method,
                                      bool do_log,
-                                     int nrepeat,
+                                     bool time_kernel,
                                      ck::index_t N,
                                      ck::index_t K,
                                      ck::index_t C,

@@ -207,7 +207,8 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
 {
     std::string conv_name = op_ptr->GetTypeString();

-    float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+    float ave_time =
+        invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

     std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
profiler/include/profile_convnd_bwd_data_impl.hpp

 #pragma once

 #include "config.hpp"
 #include "device.hpp"
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
 #include "host_tensor.hpp"
 #include "host_tensor_generator.hpp"
 #include "tensor_layout.hpp"

@@ -269,7 +269,7 @@ template <int NDimSpatial,
 bool profile_convnd_bwd_data_impl(int do_verification,
                                   int init_method,
                                   bool do_log,
-                                  int nrepeat,
+                                  bool time_kernel,
                                   ck::index_t N,
                                   ck::index_t K,
                                   ck::index_t C,

@@ -410,7 +410,8 @@ bool profile_convnd_bwd_data_impl(int do_verification,
 {
     std::string conv_name = conv_ptr->GetTypeString();

-    float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+    float ave_time =
+        invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

     std::size_t flop = ck::utils::conv::get_flops(
         N, C, K, filter_spatial_lengths, output_spatial_lengths);
profiler/include/profile_gemm_bias_2d_impl.hpp

@@ -65,7 +65,7 @@ template <typename ADataType,
 void profile_gemm_bias_2d_impl(int do_verification,
                                int init_method,
                                bool do_log,
-                               int nrepeat,
+                               bool time_kernel,
                                int M,
                                int N,
                                int K,

@@ -259,7 +259,8 @@ void profile_gemm_bias_2d_impl(int do_verification,
 {
     std::string gemm_name = gemm_ptr->GetTypeString();

-    float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+    float ave_time =
+        invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

     std::size_t flop = std::size_t(2) * M * N * K;