Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
8c03672b
Commit
8c03672b
authored
Apr 28, 2022
by
myamlak
Browse files
Merge remote-tracking branch 'origin/develop' into myamlak/issue153-sign-compare
parents
1eb7d83b
3956085d
Changes
38
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1064 additions
and
355 deletions
+1064
-355
Dockerfile
Dockerfile
+2
-7
Jenkinsfile
Jenkinsfile
+1
-1
example/06_conv2d_fwd_bias_relu/CMakeLists.txt
example/06_conv2d_fwd_bias_relu/CMakeLists.txt
+1
-0
example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt
example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt
+1
-0
example/09_convnd_fwd/CMakeLists.txt
example/09_convnd_fwd/CMakeLists.txt
+3
-0
example/10_conv2d_bwd_data/CMakeLists.txt
example/10_conv2d_bwd_data/CMakeLists.txt
+1
-0
example/11_conv2d_bwd_weight/CMakeLists.txt
example/11_conv2d_bwd_weight/CMakeLists.txt
+1
-0
example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
+7
-2
example/17_convnd_bwd_data_xdl/CMakeLists.txt
example/17_convnd_bwd_data_xdl/CMakeLists.txt
+1
-0
include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
...k/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
+45
-20
include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
...on/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
+3
-0
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
...tion/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
+6
-3
include/ck/utility/transpose_vectors.hpp
include/ck/utility/transpose_vectors.hpp
+82
-1
library/CMakeLists.txt
library/CMakeLists.txt
+1
-0
library/include/ck/library/utility/conv_fwd_util.hpp
library/include/ck/library/utility/conv_fwd_util.hpp
+323
-308
library/include/ck/library/utility/fill.hpp
library/include/ck/library/utility/fill.hpp
+81
-0
library/include/ck/library/utility/op_instance_engine.hpp
library/include/ck/library/utility/op_instance_engine.hpp
+231
-0
library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp
...vice_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp
+13
-13
library/src/utility/CMakeLists.txt
library/src/utility/CMakeLists.txt
+21
-0
library/src/utility/conv_fwd_util.cpp
library/src/utility/conv_fwd_util.cpp
+240
-0
No files found.
Dockerfile
View file @
8c03672b
...
@@ -42,7 +42,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
...
@@ -42,7 +42,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
libnuma-dev
\
libnuma-dev
\
libpthread-stubs0-dev
\
libpthread-stubs0-dev
\
llvm-amdgpu
\
llvm-amdgpu
\
miopengemm
\
pkg-config
\
pkg-config
\
python
\
python
\
python3
\
python3
\
...
@@ -51,19 +50,15 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
...
@@ -51,19 +50,15 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
python-pip
\
python-pip
\
python3-pip
\
python3-pip
\
software-properties-common
\
software-properties-common
\
sqlite3
\
wget
\
wget
\
rocm-dev
\
rocm-dev
\
rocm-device-libs
\
rocm-device-libs
\
rocm-opencl
\
rocm-opencl-dev
\
rocm-cmake
\
rocm-cmake
\
rocblas
\
vim
\
vim
\
zlib1g-dev
\
zlib1g-dev
\
openssh-server
\
openssh-server
\
kmod
\
clang-format-10
\
mysql-client
&&
\
kmod
&&
\
apt-get clean
&&
\
apt-get clean
&&
\
rm
-rf
/var/lib/apt/lists/
*
rm
-rf
/var/lib/apt/lists/
*
...
...
Jenkinsfile
View file @
8c03672b
...
@@ -204,7 +204,7 @@ pipeline {
...
@@ -204,7 +204,7 @@ pipeline {
stage
(
'Clang Format'
)
{
stage
(
'Clang Format'
)
{
agent
{
label
rocmnode
(
"nogpu"
)
}
agent
{
label
rocmnode
(
"nogpu"
)
}
environment
{
environment
{
execute_cmd
=
"find . -iname \'*.h\' \
execute_cmd
=
"find .
.
-iname \'*.h\' \
-o -iname \'*.hpp\' \
-o -iname \'*.hpp\' \
-o -iname \'*.cpp\' \
-o -iname \'*.cpp\' \
-o -iname \'*.h.in\' \
-o -iname \'*.h.in\' \
...
...
example/06_conv2d_fwd_bias_relu/CMakeLists.txt
View file @
8c03672b
add_example_executable
(
example_conv2d_fwd_xdl_bias_relu conv2d_fwd_xdl_bias_relu.cpp
)
add_example_executable
(
example_conv2d_fwd_xdl_bias_relu conv2d_fwd_xdl_bias_relu.cpp
)
target_link_libraries
(
example_conv2d_fwd_xdl_bias_relu PRIVATE conv_fwd_util
)
example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt
View file @
8c03672b
add_example_executable
(
example_conv2d_fwd_xdl_bias_relu_add conv2d_fwd_xdl_bias_relu_add.cpp
)
add_example_executable
(
example_conv2d_fwd_xdl_bias_relu_add conv2d_fwd_xdl_bias_relu_add.cpp
)
target_link_libraries
(
example_conv2d_fwd_xdl_bias_relu_add PRIVATE conv_fwd_util
)
example/09_convnd_fwd/CMakeLists.txt
View file @
8c03672b
add_example_executable
(
example_convnd_fwd_xdl convnd_fwd_xdl.cpp
)
add_example_executable
(
example_convnd_fwd_xdl convnd_fwd_xdl.cpp
)
target_link_libraries
(
example_convnd_fwd_xdl PRIVATE conv_fwd_util
)
add_example_executable
(
example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp
)
add_example_executable
(
example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp
)
target_link_libraries
(
example_convnd_fwd_xdl_int8 PRIVATE conv_fwd_util
)
add_example_executable
(
example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp
)
add_example_executable
(
example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp
)
target_link_libraries
(
example_convnd_fwd_xdl_fp16 PRIVATE conv_fwd_util
)
example/10_conv2d_bwd_data/CMakeLists.txt
View file @
8c03672b
add_example_executable
(
example_conv2d_bwd_data_xdl conv2d_bwd_data_xdl.cpp
)
add_example_executable
(
example_conv2d_bwd_data_xdl conv2d_bwd_data_xdl.cpp
)
target_link_libraries
(
example_conv2d_bwd_data_xdl PRIVATE conv_fwd_util
)
example/11_conv2d_bwd_weight/CMakeLists.txt
View file @
8c03672b
add_example_executable
(
example_conv2d_bwd_weight_xdl conv2d_bwd_weight_xdl.cpp
)
add_example_executable
(
example_conv2d_bwd_weight_xdl conv2d_bwd_weight_xdl.cpp
)
target_link_libraries
(
example_conv2d_bwd_weight_xdl PRIVATE conv_fwd_util
)
example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
View file @
8c03672b
...
@@ -72,8 +72,13 @@ using DeviceConvBwdWeightInstance = ck::tensor_operation::device::
...
@@ -72,8 +72,13 @@ using DeviceConvBwdWeightInstance = ck::tensor_operation::device::
8
>
;
// CBlockTransferScalarPerVector_NWaveNPerXdl
8
>
;
// CBlockTransferScalarPerVector_NWaveNPerXdl
// clang-format on
// clang-format on
using
ReferenceConvBwdWeightInstance
=
ck
::
tensor_operation
::
host
::
using
ReferenceConvBwdWeightInstance
=
ReferenceConvBwdWeight
<
InDataType
,
WeiDataType
,
OutDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
>
;
ck
::
tensor_operation
::
host
::
ReferenceConvBwdWeight
<
InDataType
,
WeiDataType
,
OutDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
>
;
int
main
(
int
argc
,
char
*
argv
[])
int
main
(
int
argc
,
char
*
argv
[])
{
{
...
...
example/17_convnd_bwd_data_xdl/CMakeLists.txt
View file @
8c03672b
add_example_executable
(
example_convnd_bwd_data_xdl convnd_bwd_data_xdl.cpp
)
add_example_executable
(
example_convnd_bwd_data_xdl convnd_bwd_data_xdl.cpp
)
target_link_libraries
(
example_convnd_bwd_data_xdl PRIVATE conv_fwd_util
)
include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
View file @
8c03672b
...
@@ -16,6 +16,31 @@ namespace ck {
...
@@ -16,6 +16,31 @@ namespace ck {
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
/*
* \brief Wrapper function of GridwiseGemm::Run to realize BatchedGEMM.
*
* \tparam ComputePtrOffsetOfBatch Class that computes the base pointer offsets of A, B, C matrix
* given the batch. For example, ComputePtrOffsetOfStridedBatch() computes the offsets of evenly
* strided batched, but we can easily extend to other layouts. The returned offset can be either \p
* index_t or \p long_index_t. If it returns \p long_index_t, we are not subject to the 2GB
* limitations.
*
* \tparam Block2CTileMap Block2CTileMap::CalculateBottomIndex() takes in id of a workgroup and
* returns the 2D index of the tile that it computes. \see
* GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3::Run().
*
* \note Using \p ComputePtrOffsetOfBatch gives us the flexibility that 2 workgroups can compute 2
* tiles from different matrices. Keep in mind that these 2 matrices can share the same grid
* descriptor (like in BatchedGEMM), or use their own grid descriptors (in GroupedGemm). \link
* device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp kernel_gemm_xdlops_v2r3_for_conv3d \endlink for \link
* DeviceConv3d \endlink uses the same concept, but currently does NOT encapsulate the computing of
* pointer offset into \p ComputePtrOffsetOfStridedBatch.
*
* \note \p Block2CTileMap allows customized mapping between a workgroup and the C-tile it computes.
* Together with \p ComputePtrOffsetOfBatch, we can reuse GridwiseGemm (and GridwiseGemm fusion ) to
* realize BatchedGemm and GroupedGemm (and the corresponding GEMM fusion).
*
*/
template
<
typename
GridwiseGemm
,
template
<
typename
GridwiseGemm
,
typename
FloatAB
,
typename
FloatAB
,
typename
FloatC
,
typename
FloatC
,
...
@@ -25,7 +50,7 @@ template <typename GridwiseGemm,
...
@@ -25,7 +50,7 @@ template <typename GridwiseGemm,
typename
AElementwiseOperation
,
typename
AElementwiseOperation
,
typename
BElementwiseOperation
,
typename
BElementwiseOperation
,
typename
CElementwiseOperation
,
typename
CElementwiseOperation
,
typename
Compute
BasePr
tOfBatch
,
typename
Compute
PtrOffse
tOfBatch
,
typename
Block2CTileMap
,
typename
Block2CTileMap
,
bool
HasMainKBlockLoop
>
bool
HasMainKBlockLoop
>
__global__
void
__global__
void
...
@@ -43,7 +68,7 @@ __global__ void
...
@@ -43,7 +68,7 @@ __global__ void
const
AElementwiseOperation
a_element_op
,
const
AElementwiseOperation
a_element_op
,
const
BElementwiseOperation
b_element_op
,
const
BElementwiseOperation
b_element_op
,
const
CElementwiseOperation
c_element_op
,
const
CElementwiseOperation
c_element_op
,
const
Compute
BasePr
tOfBatch
compute_
base_ptr
_of_batch
_
,
const
Compute
PtrOffse
tOfBatch
compute_
ptr_offset
_of_batch
,
const
Block2CTileMap
block_2_ctile_map
)
const
Block2CTileMap
block_2_ctile_map
)
{
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
...
@@ -52,11 +77,11 @@ __global__ void
...
@@ -52,11 +77,11 @@ __global__ void
const
index_t
g_idx
=
__builtin_amdgcn_readfirstlane
(
get_block_1d_id
()
/
num_blocks_per_batch
);
const
index_t
g_idx
=
__builtin_amdgcn_readfirstlane
(
get_block_1d_id
()
/
num_blocks_per_batch
);
const
long_index_t
a_batch_offset
=
__builtin_amdgcn_readfirstlane
(
const
long_index_t
a_batch_offset
=
__builtin_amdgcn_readfirstlane
(
static_cast
<
long_index_t
>
(
compute_
base_ptr
_of_batch
_
.
GetA
BasePtr
(
g_idx
)));
static_cast
<
long_index_t
>
(
compute_
ptr_offset
_of_batch
.
GetA
PtrOffset
(
g_idx
)));
const
long_index_t
b_batch_offset
=
__builtin_amdgcn_readfirstlane
(
const
long_index_t
b_batch_offset
=
__builtin_amdgcn_readfirstlane
(
static_cast
<
long_index_t
>
(
compute_
base_ptr
_of_batch
_
.
GetB
BasePtr
(
g_idx
)));
static_cast
<
long_index_t
>
(
compute_
ptr_offset
_of_batch
.
GetB
PtrOffset
(
g_idx
)));
const
long_index_t
c_batch_offset
=
__builtin_amdgcn_readfirstlane
(
const
long_index_t
c_batch_offset
=
__builtin_amdgcn_readfirstlane
(
static_cast
<
long_index_t
>
(
compute_
base_ptr
_of_batch
_
.
GetC
BasePtr
(
g_idx
)));
static_cast
<
long_index_t
>
(
compute_
ptr_offset
_of_batch
.
GetC
PtrOffset
(
g_idx
)));
__shared__
char
p_shared
[
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()];
__shared__
char
p_shared
[
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()];
...
@@ -256,26 +281,26 @@ struct DeviceBatchedGemmXdl
...
@@ -256,26 +281,26 @@ struct DeviceBatchedGemmXdl
return
globalblockid_to_m0_n0_block_cluster_adaptor
;
return
globalblockid_to_m0_n0_block_cluster_adaptor
;
}
}
struct
Compute
BasePtr
OfStridedBatch
struct
Compute
PtrOffset
OfStridedBatch
{
{
Compute
BasePtr
OfStridedBatch
(
index_t
BatchStrideA
,
Compute
PtrOffset
OfStridedBatch
(
index_t
BatchStrideA
,
index_t
BatchStrideB
,
index_t
BatchStrideB
,
index_t
BatchStrideC
)
index_t
BatchStrideC
)
:
BatchStrideA_
(
BatchStrideA
),
BatchStrideB_
(
BatchStrideB
),
BatchStrideC_
(
BatchStrideC
)
:
BatchStrideA_
(
BatchStrideA
),
BatchStrideB_
(
BatchStrideB
),
BatchStrideC_
(
BatchStrideC
)
{
{
}
}
__host__
__device__
constexpr
long_index_t
GetA
BasePtr
(
index_t
g_idx
)
const
__host__
__device__
constexpr
long_index_t
GetA
PtrOffset
(
index_t
g_idx
)
const
{
{
return
g_idx
*
static_cast
<
long_index_t
>
(
BatchStrideA_
);
return
g_idx
*
static_cast
<
long_index_t
>
(
BatchStrideA_
);
}
}
__host__
__device__
constexpr
long_index_t
GetB
BasePtr
(
index_t
g_idx
)
const
__host__
__device__
constexpr
long_index_t
GetB
PtrOffset
(
index_t
g_idx
)
const
{
{
return
g_idx
*
static_cast
<
long_index_t
>
(
BatchStrideB_
);
return
g_idx
*
static_cast
<
long_index_t
>
(
BatchStrideB_
);
}
}
__host__
__device__
constexpr
long_index_t
GetC
BasePtr
(
index_t
g_idx
)
const
__host__
__device__
constexpr
long_index_t
GetC
PtrOffset
(
index_t
g_idx
)
const
{
{
return
g_idx
*
static_cast
<
long_index_t
>
(
BatchStrideC_
);
return
g_idx
*
static_cast
<
long_index_t
>
(
BatchStrideC_
);
}
}
...
@@ -359,9 +384,9 @@ struct DeviceBatchedGemmXdl
...
@@ -359,9 +384,9 @@ struct DeviceBatchedGemmXdl
DeviceBatchedGemmXdl
::
MakeBGridDescriptor_K0_N_K1
(
K
,
N
,
StrideB
)},
DeviceBatchedGemmXdl
::
MakeBGridDescriptor_K0_N_K1
(
K
,
N
,
StrideB
)},
c_grid_desc_m_n_
{
DeviceBatchedGemmXdl
::
MakeCGridDescriptor_M_N
(
M
,
N
,
StrideC
)},
c_grid_desc_m_n_
{
DeviceBatchedGemmXdl
::
MakeCGridDescriptor_M_N
(
M
,
N
,
StrideC
)},
c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_
{},
c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_
{},
compute_
base_ptr
_of_batch_
{
a_grid_desc_k0_m_k1_
.
GetElementSpaceSize
(),
compute_
ptr_offset
_of_batch_
{
a_grid_desc_k0_m_k1_
.
GetElementSpaceSize
(),
b_grid_desc_k0_n_k1_
.
GetElementSpaceSize
(),
b_grid_desc_k0_n_k1_
.
GetElementSpaceSize
(),
c_grid_desc_m_n_
.
GetElementSpaceSize
()},
c_grid_desc_m_n_
.
GetElementSpaceSize
()},
block_2_ctile_map_
{},
block_2_ctile_map_
{},
M01_
{
M01
},
M01_
{
M01
},
N01_
{
N01
},
N01_
{
N01
},
...
@@ -388,7 +413,7 @@ struct DeviceBatchedGemmXdl
...
@@ -388,7 +413,7 @@ struct DeviceBatchedGemmXdl
BGridDesc_K0_N_K1
b_grid_desc_k0_n_k1_
;
BGridDesc_K0_N_K1
b_grid_desc_k0_n_k1_
;
CGridDesc_M_N
c_grid_desc_m_n_
;
CGridDesc_M_N
c_grid_desc_m_n_
;
CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2
c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_
;
CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2
c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_
;
Compute
BasePtr
OfStridedBatch
compute_
base_ptr
_of_batch_
;
Compute
PtrOffset
OfStridedBatch
compute_
ptr_offset
_of_batch_
;
Block2CTileMap
block_2_ctile_map_
;
Block2CTileMap
block_2_ctile_map_
;
index_t
M01_
;
index_t
M01_
;
index_t
N01_
;
index_t
N01_
;
...
@@ -448,7 +473,7 @@ struct DeviceBatchedGemmXdl
...
@@ -448,7 +473,7 @@ struct DeviceBatchedGemmXdl
AElementwiseOperation
,
AElementwiseOperation
,
BElementwiseOperation
,
BElementwiseOperation
,
CElementwiseOperation
,
CElementwiseOperation
,
Compute
BasePtr
OfStridedBatch
,
Compute
PtrOffset
OfStridedBatch
,
remove_reference_t
<
Block2CTileMap
>
,
remove_reference_t
<
Block2CTileMap
>
,
true
>
;
true
>
;
...
@@ -467,7 +492,7 @@ struct DeviceBatchedGemmXdl
...
@@ -467,7 +492,7 @@ struct DeviceBatchedGemmXdl
arg
.
a_element_op_
,
arg
.
a_element_op_
,
arg
.
b_element_op_
,
arg
.
b_element_op_
,
arg
.
c_element_op_
,
arg
.
c_element_op_
,
arg
.
compute_
base_ptr
_of_batch_
,
arg
.
compute_
ptr_offset
_of_batch_
,
arg
.
block_2_ctile_map_
);
arg
.
block_2_ctile_map_
);
}
}
else
else
...
@@ -482,7 +507,7 @@ struct DeviceBatchedGemmXdl
...
@@ -482,7 +507,7 @@ struct DeviceBatchedGemmXdl
AElementwiseOperation
,
AElementwiseOperation
,
BElementwiseOperation
,
BElementwiseOperation
,
CElementwiseOperation
,
CElementwiseOperation
,
Compute
BasePtr
OfStridedBatch
,
Compute
PtrOffset
OfStridedBatch
,
remove_reference_t
<
Block2CTileMap
>
,
remove_reference_t
<
Block2CTileMap
>
,
false
>
;
false
>
;
...
@@ -501,7 +526,7 @@ struct DeviceBatchedGemmXdl
...
@@ -501,7 +526,7 @@ struct DeviceBatchedGemmXdl
arg
.
a_element_op_
,
arg
.
a_element_op_
,
arg
.
b_element_op_
,
arg
.
b_element_op_
,
arg
.
c_element_op_
,
arg
.
c_element_op_
,
arg
.
compute_
base_ptr
_of_batch_
,
arg
.
compute_
ptr_offset
_of_batch_
,
arg
.
block_2_ctile_map_
);
arg
.
block_2_ctile_map_
);
}
}
...
...
include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
View file @
8c03672b
...
@@ -18,6 +18,9 @@ namespace ck {
...
@@ -18,6 +18,9 @@ namespace ck {
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
/*
* \see \link device_batched_gemm_xdl.hpp kernel_batched_gemm_xdlops_v2r3() \endlink.
*/
template
<
typename
GridwiseGemm
,
template
<
typename
GridwiseGemm
,
typename
FloatAB
,
typename
FloatAB
,
typename
FloatC
,
typename
FloatC
,
...
...
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
View file @
8c03672b
...
@@ -277,9 +277,12 @@ struct ThreadwiseTensorSliceTransfer_v3r1
...
@@ -277,9 +277,12 @@ struct ThreadwiseTensorSliceTransfer_v3r1
// sub-dword transpose between src_thread_scratch_ and dst_thread_scratch_
// sub-dword transpose between src_thread_scratch_ and dst_thread_scratch_
// TODO make this logic more generic for more sub-dword datatype
// TODO make this logic more generic for more sub-dword datatype
if
constexpr
(
SrcVectorDim
!=
DstVectorDim
&&
if
constexpr
(
SrcVectorDim
!=
DstVectorDim
&&
is_same
<
half_t
,
remove_cvref_t
<
SrcData
>>::
value
&&
((
is_same
<
half_t
,
remove_cvref_t
<
SrcData
>>::
value
&&
is_same
<
half_t
,
remove_cvref_t
<
DstData
>>::
value
&&
is_same
<
half_t
,
remove_cvref_t
<
DstData
>>::
value
&&
SrcScalarPerVector
%
2
==
0
&&
DstScalarPerVector
%
2
==
0
)
SrcScalarPerVector
%
2
==
0
&&
DstScalarPerVector
%
2
==
0
)
||
(
is_same
<
int8_t
,
remove_cvref_t
<
SrcData
>>::
value
&&
is_same
<
int8_t
,
remove_cvref_t
<
DstData
>>::
value
&&
SrcScalarPerVector
%
4
==
0
&&
DstScalarPerVector
%
4
==
0
)))
{
{
// each transpose does
// each transpose does
// DstScalarPerVector # of src vectors in src_thread_scratch_
// DstScalarPerVector # of src vectors in src_thread_scratch_
...
...
include/ck/utility/transpose_vectors.hpp
View file @
8c03672b
...
@@ -49,7 +49,7 @@ __device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t
...
@@ -49,7 +49,7 @@ __device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t
template
<
index_t
NX
,
index_t
NY
>
template
<
index_t
NX
,
index_t
NY
>
struct
transpose_vectors
<
half_t
,
NX
,
NY
>
struct
transpose_vectors
<
half_t
,
NX
,
NY
>
{
{
// we got [NY * NX] am
m
ount of S data to be transposed
// we got [NY * NX] amount of S data to be transposed
static
constexpr
index_t
s_per_x
=
NY
;
static
constexpr
index_t
s_per_x
=
NY
;
static
constexpr
index_t
s_per_y
=
NX
;
static
constexpr
index_t
s_per_y
=
NX
;
...
@@ -83,5 +83,86 @@ struct transpose_vectors<half_t, NX, NY>
...
@@ -83,5 +83,86 @@ struct transpose_vectors<half_t, NX, NY>
}
}
};
};
// transpose int8 4x4
__device__
void
transpose_int8_4x4
(
const
int8x4_t
&
x0
,
const
int8x4_t
&
x1
,
const
int8x4_t
&
x2
,
const
int8x4_t
&
x3
,
int8x4_t
&
y0
,
int8x4_t
&
y1
,
int8x4_t
&
y2
,
int8x4_t
&
y3
)
{
int32_t
t0
,
t1
;
int32_t
z0
,
z1
,
z2
,
z3
;
constexpr
int32_t
m0
=
0x05010400
;
constexpr
int32_t
m1
=
0x05040100
;
constexpr
int32_t
m2
=
0x07060302
;
constexpr
int32_t
m3
=
0x07030602
;
// ex: v_perm_b32(0x 11 22 33 44, 0x 55 66 77 88, 0x 05 01 04 00) -> 0x33774488
// -- -- -- -- -- -- -- -- - - - -
// index 7 6 5 4 3 2 1 0 33 77 44 88
// index is reversed because of little endianness (least significant bits first)
// clang-format off
asm
volatile
(
"v_perm_b32 %0, %1, %2, %3"
:
"=v"
(
t0
)
:
"v"
(
bit_cast
<
int32_t
>
(
x1
)),
"v"
(
bit_cast
<
int32_t
>
(
x0
)),
"s"
(
m0
));
asm
volatile
(
"v_perm_b32 %0, %1, %2, %3"
:
"=v"
(
t1
)
:
"v"
(
bit_cast
<
int32_t
>
(
x3
)),
"v"
(
bit_cast
<
int32_t
>
(
x2
)),
"s"
(
m0
));
asm
volatile
(
"v_perm_b32 %0, %1, %2, %3"
:
"=v"
(
z0
)
:
"v"
(
bit_cast
<
int32_t
>
(
t1
)),
"v"
(
bit_cast
<
int32_t
>
(
t0
)),
"s"
(
m1
));
asm
volatile
(
"v_perm_b32 %0, %1, %2, %3"
:
"=v"
(
z1
)
:
"v"
(
bit_cast
<
int32_t
>
(
t1
)),
"v"
(
bit_cast
<
int32_t
>
(
t0
)),
"s"
(
m2
));
asm
volatile
(
"v_perm_b32 %0, %1, %2, %3"
:
"=v"
(
t0
)
:
"v"
(
bit_cast
<
int32_t
>
(
x1
)),
"v"
(
bit_cast
<
int32_t
>
(
x0
)),
"s"
(
m3
));
asm
volatile
(
"v_perm_b32 %0, %1, %2, %3"
:
"=v"
(
t1
)
:
"v"
(
bit_cast
<
int32_t
>
(
x3
)),
"v"
(
bit_cast
<
int32_t
>
(
x2
)),
"s"
(
m3
));
asm
volatile
(
"v_perm_b32 %0, %1, %2, %3"
:
"=v"
(
z2
)
:
"v"
(
bit_cast
<
int32_t
>
(
t1
)),
"v"
(
bit_cast
<
int32_t
>
(
t0
)),
"s"
(
m1
));
asm
volatile
(
"v_perm_b32 %0, %1, %2, %3"
:
"=v"
(
z3
)
:
"v"
(
bit_cast
<
int32_t
>
(
t1
)),
"v"
(
bit_cast
<
int32_t
>
(
t0
)),
"s"
(
m2
));
// clang-format on
y0
=
bit_cast
<
int8x4_t
>
(
z0
);
y1
=
bit_cast
<
int8x4_t
>
(
z1
);
y2
=
bit_cast
<
int8x4_t
>
(
z2
);
y3
=
bit_cast
<
int8x4_t
>
(
z3
);
}
template
<
index_t
NX
,
index_t
NY
>
struct
transpose_vectors
<
int8_t
,
NX
,
NY
>
{
// we got [NY * NX] amount of S data to be transposed
static
constexpr
index_t
s_per_x
=
NY
;
static
constexpr
index_t
s_per_y
=
NX
;
using
S
=
int8_t
;
using
VX
=
vector_type
<
int8_t
,
s_per_x
>
;
using
VY
=
vector_type
<
int8_t
,
s_per_y
>
;
__device__
void
operator
()(
const
StaticallyIndexedArray
<
const
VX
&
,
NX
>&
vx_tuple
,
StaticallyIndexedArray
<
VY
&
,
NY
>&
vy_tuple
)
{
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
static
constexpr
auto
I4
=
Number
<
4
>
{};
static_assert
((
NX
%
4
==
0
&&
NY
%
4
==
0
),
"wrong!"
);
// loop over 4x4 tile and transpose data from vx_tuple into vy_tuple
static_for
<
0
,
NY
,
4
>
{}([
&
](
auto
iy
)
{
static_for
<
0
,
NX
,
4
>
{}([
&
](
auto
ix
)
{
// reference to 4 int8 data from vx_tuple
const
auto
&
x_s4_0
=
vx_tuple
[
ix
].
template
AsType
<
int8x4_t
>()[
iy
/
I4
];
const
auto
&
x_s4_1
=
vx_tuple
[
ix
+
I1
].
template
AsType
<
int8x4_t
>()[
iy
/
I4
];
const
auto
&
x_s4_2
=
vx_tuple
[
ix
+
I2
].
template
AsType
<
int8x4_t
>()[
iy
/
I4
];
const
auto
&
x_s4_3
=
vx_tuple
[
ix
+
I3
].
template
AsType
<
int8x4_t
>()[
iy
/
I4
];
// reference to 4 int8 data from vy_tuple
auto
&
y_s4_0
=
vy_tuple
(
iy
).
template
AsType
<
int8x4_t
>()(
ix
/
I4
);
auto
&
y_s4_1
=
vy_tuple
(
iy
+
I1
).
template
AsType
<
int8x4_t
>()(
ix
/
I4
);
auto
&
y_s4_2
=
vy_tuple
(
iy
+
I2
).
template
AsType
<
int8x4_t
>()(
ix
/
I4
);
auto
&
y_s4_3
=
vy_tuple
(
iy
+
I3
).
template
AsType
<
int8x4_t
>()(
ix
/
I4
);
// transpose
transpose_int8_4x4
(
x_s4_0
,
x_s4_1
,
x_s4_2
,
x_s4_3
,
y_s4_0
,
y_s4_1
,
y_s4_2
,
y_s4_3
);
});
});
}
};
}
// namespace ck
}
// namespace ck
#endif
#endif
library/CMakeLists.txt
View file @
8c03672b
add_subdirectory
(
src/host_tensor
)
add_subdirectory
(
src/host_tensor
)
add_subdirectory
(
src/tensor_operation_instance/gpu
)
add_subdirectory
(
src/tensor_operation_instance/gpu
)
add_subdirectory
(
src/utility
)
library/include/ck/library/utility/conv_fwd_util.hpp
View file @
8c03672b
This diff is collapsed.
Click to expand it.
library/include/ck/library/utility/fill.hpp
0 → 100644
View file @
8c03672b
#pragma once
#include <algorithm>
#include <random>
#include "data_type.hpp"
namespace
ck
{
namespace
utils
{
// template <typename T, class Enable = void>
// struct FillUniform;
// TODO: what's wrong with this specialization???
// err: segmentation fault in mt19937 - infinite loop like.
// template <typename T>
// struct FillUniform<T, typename std::enable_if<std::is_integral<T>::value &&
// !std::is_same<T, bhalf_t>::value>::type>
// {
// int a_{0};
// int b_{5};
// // T a_ = T{0};
// // T b_ = T{5};
// template <typename ForwardIter>
// void operator()(ForwardIter first, ForwardIter last) const
// {
// std::mt19937 gen{11939};
// std::uniform_int_distribution<int> dis(a_, b_);
// std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
// }
// };
// struct FillUniform<T, typename std::enable_if<std::is_floating_point<T>::value ||
// std::is_same<T, bhalf_t>::value>::type>
template
<
typename
T
>
struct
FillUniform
{
float
a_
{
0
};
float
b_
{
5
};
template
<
typename
ForwardIter
>
void
operator
()(
ForwardIter
first
,
ForwardIter
last
)
const
{
std
::
mt19937
gen
{
11939
};
std
::
uniform_real_distribution
<>
dis
(
a_
,
b_
);
std
::
generate
(
first
,
last
,
[
&
dis
,
&
gen
]()
{
return
ck
::
type_convert
<
T
>
(
dis
(
gen
));
});
}
};
template
<
typename
T
>
struct
FillMonotonicSeq
{
T
init_value_
{
0
};
T
step_
{
1
};
template
<
typename
ForwardIter
>
void
operator
()(
ForwardIter
first
,
ForwardIter
last
)
const
{
std
::
generate
(
first
,
last
,
[
=
,
n
=
init_value_
]()
mutable
{
auto
tmp
=
n
;
n
+=
step_
;
return
tmp
;
});
}
};
template
<
typename
T
>
struct
FillConstant
{
T
value_
{
0
};
template
<
typename
ForwardIter
>
void
operator
()(
ForwardIter
first
,
ForwardIter
last
)
const
{
std
::
fill
(
first
,
last
,
value_
);
}
};
}
// namespace utils
}
// namespace ck
library/include/ck/library/utility/op_instance_engine.hpp
0 → 100644
View file @
8c03672b
#pragma once
#include <cstdlib>
#include <limits>
#include <memory>
#include <stdexcept>
#include <tuple>
#include <utility>
#include <vector>
#include "check_err.hpp"
#include "device_base.hpp"
#include "functional2.hpp"
namespace
ck
{
namespace
utils
{
struct
ProfileBestConfig
{
std
::
string
best_op_name
;
float
best_avg_time
=
std
::
numeric_limits
<
float
>::
max
();
float
best_tflops
=
std
::
numeric_limits
<
float
>::
max
();
float
best_gb_per_sec
=
std
::
numeric_limits
<
float
>::
max
();
};
/**
* @brief This class describes an operation instance(s).
*
* Op instance defines a particular specializations of operator
* template. Thanks to this specific input/output data types, data
* layouts and modifying elementwise operations it is able to create
* it's input/output tensors, provide pointers to instances which
* can execute it and all operation specific parameters.
*/
template
<
typename
OutDataType
,
typename
...
InArgTypes
>
class
OpInstance
{
public:
template
<
typename
T
>
using
TensorPtr
=
std
::
unique_ptr
<
Tensor
<
T
>>
;
using
InTensorsTuple
=
std
::
tuple
<
TensorPtr
<
InArgTypes
>
...
>
;
using
DeviceMemPtr
=
std
::
unique_ptr
<
DeviceMem
>
;
using
DeviceBuffers
=
std
::
vector
<
DeviceMemPtr
>
;
OpInstance
()
=
default
;
OpInstance
(
const
OpInstance
&
)
=
default
;
OpInstance
&
operator
=
(
const
OpInstance
&
)
=
default
;
virtual
~
OpInstance
(){};
virtual
InTensorsTuple
GetInputTensors
()
const
=
0
;
virtual
TensorPtr
<
OutDataType
>
GetOutputTensor
()
const
=
0
;
virtual
std
::
unique_ptr
<
tensor_operation
::
device
::
BaseInvoker
>
MakeInvokerPointer
(
tensor_operation
::
device
::
BaseOperator
*
)
const
=
0
;
virtual
std
::
unique_ptr
<
tensor_operation
::
device
::
BaseArgument
>
MakeArgumentPointer
(
tensor_operation
::
device
::
BaseOperator
*
,
const
DeviceBuffers
&
,
const
DeviceMemPtr
&
)
const
=
0
;
virtual
std
::
size_t
GetFlops
()
const
=
0
;
virtual
std
::
size_t
GetBtype
()
const
=
0
;
};
/**
* @brief A generic operation instance run engine.
*/
template
<
typename
OutDataType
,
typename
...
InArgTypes
>
class
OpInstanceRunEngine
{
public:
using
OpInstanceT
=
OpInstance
<
InArgTypes
...,
OutDataType
>
;
template
<
typename
T
>
using
TensorPtr
=
std
::
unique_ptr
<
Tensor
<
T
>>
;
using
DeviceMemPtr
=
std
::
unique_ptr
<
DeviceMem
>
;
using
InTensorsTuple
=
std
::
tuple
<
TensorPtr
<
InArgTypes
>
...
>
;
using
DeviceBuffers
=
std
::
vector
<
DeviceMemPtr
>
;
using
InArgsTypesTuple
=
std
::
tuple
<
InArgTypes
...
>
;
OpInstanceRunEngine
()
=
delete
;
template
<
typename
ReferenceOp
=
std
::
function
<
void
()>
>
OpInstanceRunEngine
(
const
OpInstanceT
&
op_instance
,
const
ReferenceOp
&
reference_op
=
ReferenceOp
{})
:
op_instance_
{
op_instance
}
{
in_tensors_
=
op_instance_
.
GetInputTensors
();
out_tensor_
=
op_instance_
.
GetOutputTensor
();
if
constexpr
(
std
::
is_invocable_v
<
ReferenceOp
,
const
Tensor
<
InArgTypes
>&
...,
Tensor
<
OutDataType
>&>
)
{
ref_output_
=
op_instance_
.
GetOutputTensor
();
CallRefOpUnpackArgs
(
reference_op
,
std
::
make_index_sequence
<
kNInArgs_
>
{});
}
AllocateDeviceInputTensors
(
std
::
make_index_sequence
<
kNInArgs_
>
{});
out_device_buffer_
=
std
::
make_unique
<
DeviceMem
>
(
sizeof
(
OutDataType
)
*
out_tensor_
->
mDesc
.
GetElementSpace
());
out_device_buffer_
->
SetZero
();
}
virtual
~
OpInstanceRunEngine
(){};
template
<
typename
OpInstancePtr
>
bool
Test
(
const
std
::
vector
<
OpInstancePtr
>&
op_ptrs
)
{
bool
res
{
true
};
for
(
auto
&
op_ptr
:
op_ptrs
)
{
auto
invoker
=
op_instance_
.
MakeInvokerPointer
(
op_ptr
.
get
());
auto
argument
=
op_instance_
.
MakeArgumentPointer
(
op_ptr
.
get
(),
in_device_buffers_
,
out_device_buffer_
);
if
(
op_ptr
->
IsSupportedArgument
(
argument
.
get
()))
{
invoker
->
Run
(
argument
.
get
());
out_device_buffer_
->
FromDevice
(
out_tensor_
->
mData
.
data
());
if
(
!
ref_output_
)
{
throw
std
::
runtime_error
(
"OpInstanceRunEngine::Test: Reference value not availabe."
" You have to provide reference function."
);
}
// TODO: enable flexible use of custom check_error functions
res
=
res
&&
check_err
(
out_tensor_
->
mData
,
ref_output_
->
mData
);
out_device_buffer_
->
SetZero
();
}
}
return
res
;
}
template
<
typename
OpInstancePtr
>
ProfileBestConfig
Profile
(
const
std
::
vector
<
OpInstancePtr
>&
op_ptrs
,
int
nrepeat
=
100
,
bool
do_verification
=
false
,
bool
do_log
=
false
)
{
bool
res
{
true
};
ProfileBestConfig
best_config
;
for
(
auto
&
op_ptr
:
op_ptrs
)
{
auto
invoker
=
op_instance_
.
MakeInvokerPointer
(
op_ptr
.
get
());
auto
argument
=
op_instance_
.
MakeArgumentPointer
(
op_ptr
.
get
(),
in_device_buffers_
,
out_device_buffer_
);
if
(
op_ptr
->
IsSupportedArgument
(
argument
.
get
()))
{
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
float
avg_time
=
invoker
->
Run
(
argument
.
get
(),
nrepeat
);
std
::
size_t
flops
=
op_instance_
.
GetFlops
();
std
::
size_t
num_btype
=
op_instance_
.
GetBtype
();
float
tflops
=
static_cast
<
float
>
(
flops
)
/
1.E9
/
avg_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
avg_time
;
std
::
cout
<<
"Perf: "
<<
avg_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
op_name
<<
std
::
endl
;
if
(
tflops
<
best_config
.
best_tflops
)
{
best_config
.
best_op_name
=
op_name
;
best_config
.
best_tflops
=
tflops
;
best_config
.
best_gb_per_sec
=
gb_per_sec
;
best_config
.
best_avg_time
=
avg_time
;
}
if
(
do_verification
)
{
out_device_buffer_
->
FromDevice
(
out_tensor_
->
mData
.
data
());
if
(
!
ref_output_
)
{
throw
std
::
runtime_error
(
"OpInstanceRunEngine::Profile: Reference value not availabe."
" You have to provide reference function."
);
}
// TODO: enable flexible use of custom check_error functions
res
=
res
&&
CheckErr
(
out_tensor_
->
mData
,
ref_output_
->
mData
);
if
(
do_log
)
{}
}
out_device_buffer_
->
SetZero
();
}
}
return
best_config
;
}
void
SetAtol
(
double
a
)
{
atol_
=
a
;
}
void
SetRtol
(
double
r
)
{
rtol_
=
r
;
}
private:
template
<
typename
F
,
std
::
size_t
...
Is
>
void
CallRefOpUnpackArgs
(
const
F
&
f
,
std
::
index_sequence
<
Is
...
>
)
const
{
f
(
*
std
::
get
<
Is
>
(
in_tensors_
)...,
*
ref_output_
);
}
template
<
std
::
size_t
...
Is
>
void
AllocateDeviceInputTensors
(
std
::
index_sequence
<
Is
...
>
)
{
(
AllocateDeviceInputTensorsImpl
<
Is
>
(),
...);
}
template
<
std
::
size_t
Index
>
void
AllocateDeviceInputTensorsImpl
()
{
const
auto
&
ts
=
std
::
get
<
Index
>
(
in_tensors_
);
in_device_buffers_
.
emplace_back
(
std
::
make_unique
<
DeviceMem
>
(
sizeof
(
std
::
tuple_element_t
<
Index
,
InArgsTypesTuple
>
)
*
ts
->
mDesc
.
GetElementSpace
()))
->
ToDevice
(
ts
->
mData
.
data
());
}
static
constexpr
std
::
size_t
kNInArgs_
=
std
::
tuple_size_v
<
InTensorsTuple
>
;
const
OpInstanceT
&
op_instance_
;
double
rtol_
{
1e-5
};
double
atol_
{
1e-8
};
InTensorsTuple
in_tensors_
;
TensorPtr
<
OutDataType
>
out_tensor_
;
TensorPtr
<
OutDataType
>
ref_output_
;
DeviceBuffers
in_device_buffers_
;
DeviceMemPtr
out_device_buffer_
;
template
<
typename
T
>
bool
CheckErr
(
const
std
::
vector
<
T
>&
dev_out
,
const
std
::
vector
<
T
>&
ref_out
)
const
{
return
ck
::
utils
::
check_err
(
dev_out
,
ref_out
,
"Error: incorrect results!"
,
atol_
,
rtol_
);
}
};
}
// namespace utils
}
// namespace ck
library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp
View file @
8c03672b
...
@@ -28,19 +28,19 @@ using device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances = std::tuple<
...
@@ -28,19 +28,19 @@ using device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances = std::tuple<
//#####################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//#####################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//#####################| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//#####################| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//#####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
//#####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
256
,
256
,
128
,
32
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
256
,
256
,
128
,
16
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
256
,
128
,
256
,
32
,
4
,
4
,
32
,
32
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
256
,
128
,
256
,
16
,
4
,
4
,
32
,
32
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
128
,
128
,
128
,
32
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
128
,
128
,
128
,
16
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
256
,
128
,
128
,
32
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
256
,
128
,
128
,
16
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
128
,
128
,
64
,
32
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
128
,
128
,
64
,
16
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
128
,
64
,
128
,
32
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
128
,
64
,
128
,
16
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
64
,
64
,
64
,
32
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
64
,
64
,
64
,
16
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
256
,
128
,
64
,
32
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
256
,
128
,
64
,
16
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
256
,
64
,
128
,
32
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
256
,
64
,
128
,
16
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
128
,
128
,
32
,
32
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
128
,
128
,
32
,
16
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
128
,
32
,
128
,
32
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
128
,
32
,
128
,
16
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
64
,
64
,
32
,
32
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
64
,
64
,
32
,
16
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
64
,
32
,
64
,
32
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
4
>
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
64
,
32
,
64
,
16
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
4
>
// clang-format on
// clang-format on
>
;
>
;
...
...
library/src/utility/CMakeLists.txt
0 → 100644
View file @
8c03672b
include_directories
(
BEFORE
${
PROJECT_SOURCE_DIR
}
/include/ck
${
PROJECT_SOURCE_DIR
}
/include/ck/tensor_operation/gpu/device
${
PROJECT_SOURCE_DIR
}
/include/ck/tensor_operation/gpu/element
${
PROJECT_SOURCE_DIR
}
/include/ck/utility
${
PROJECT_SOURCE_DIR
}
/library/include/ck/library/host_tensor
${
PROJECT_SOURCE_DIR
}
/library/include/ck/library/reference_tensor_operation/cpu
${
PROJECT_SOURCE_DIR
}
/library/include/ck/library/utility
)
set
(
CONV_FWD_UTIL_SOURCE
conv_fwd_util.cpp
)
add_library
(
conv_fwd_util SHARED
${
CONV_FWD_UTIL_SOURCE
}
)
target_link_libraries
(
conv_fwd_util PRIVATE host_tensor
)
target_compile_features
(
conv_fwd_util PUBLIC
)
set_target_properties
(
conv_fwd_util PROPERTIES POSITION_INDEPENDENT_CODE ON
)
target_include_directories
(
conv_fwd_util SYSTEM PUBLIC $<BUILD_INTERFACE:
${
HALF_INCLUDE_DIR
}
>
)
clang_tidy_check
(
conv_fwd_util
)
library/src/utility/conv_fwd_util.cpp
0 → 100644
View file @
8c03672b
#include "conv_fwd_util.hpp"
namespace
ck
{
namespace
utils
{
namespace
conv
{
/**
* @brief Calculate number of FLOPs for Convolution
*
* @param[in] N Batch size.
* @param[in] C Number of input channels.
* @param[in] K Number of output channels.
* @param[in] filter_spatial_lengths Filter spatial dimensions lengths.
* @param[in] output_spatial_lengths Convolution output spatial dimensions
* lengths.
*
* @return The number of flops.
*/
std
::
size_t
get_flops
(
ck
::
index_t
N
,
ck
::
index_t
C
,
ck
::
index_t
K
,
const
std
::
vector
<
ck
::
index_t
>&
filter_spatial_lengths
,
const
std
::
vector
<
ck
::
index_t
>&
output_spatial_lengths
)
{
// 2 * N * K * <output spatial lengths product> * C * <filter spatial lengths product>
return
static_cast
<
std
::
size_t
>
(
2
)
*
N
*
K
*
std
::
accumulate
(
std
::
begin
(
output_spatial_lengths
),
std
::
end
(
output_spatial_lengths
),
static_cast
<
std
::
size_t
>
(
1
),
std
::
multiplies
<
std
::
size_t
>
())
*
C
*
std
::
accumulate
(
std
::
begin
(
filter_spatial_lengths
),
std
::
end
(
filter_spatial_lengths
),
static_cast
<
std
::
size_t
>
(
1
),
std
::
multiplies
<
std
::
size_t
>
());
}
ConvParams
::
ConvParams
()
:
num_dim_spatial
(
2
),
N
(
128
),
K
(
256
),
C
(
192
),
filter_spatial_lengths
(
2
,
3
),
input_spatial_lengths
(
2
,
71
),
conv_filter_strides
(
2
,
2
),
conv_filter_dilations
(
2
,
1
),
input_left_pads
(
2
,
1
),
input_right_pads
(
2
,
1
)
{
}
ConvParams
::
ConvParams
(
ck
::
index_t
n_dim
,
ck
::
index_t
n_batch
,
ck
::
index_t
n_out_channels
,
ck
::
index_t
n_in_channels
,
const
std
::
vector
<
ck
::
index_t
>&
filters_len
,
const
std
::
vector
<
ck
::
index_t
>&
input_len
,
const
std
::
vector
<
ck
::
index_t
>&
strides
,
const
std
::
vector
<
ck
::
index_t
>&
dilations
,
const
std
::
vector
<
ck
::
index_t
>&
left_pads
,
const
std
::
vector
<
ck
::
index_t
>&
right_pads
)
:
num_dim_spatial
(
n_dim
),
N
(
n_batch
),
K
(
n_out_channels
),
C
(
n_in_channels
),
filter_spatial_lengths
(
filters_len
),
input_spatial_lengths
(
input_len
),
conv_filter_strides
(
strides
),
conv_filter_dilations
(
dilations
),
input_left_pads
(
left_pads
),
input_right_pads
(
right_pads
)
{
if
(
ck
::
type_convert
<
ck
::
index_t
>
(
filter_spatial_lengths
.
size
())
!=
num_dim_spatial
||
ck
::
type_convert
<
ck
::
index_t
>
(
input_spatial_lengths
.
size
())
!=
num_dim_spatial
||
ck
::
type_convert
<
ck
::
index_t
>
(
conv_filter_strides
.
size
())
!=
num_dim_spatial
||
ck
::
type_convert
<
ck
::
index_t
>
(
conv_filter_dilations
.
size
())
!=
num_dim_spatial
||
ck
::
type_convert
<
ck
::
index_t
>
(
input_left_pads
.
size
())
!=
num_dim_spatial
||
ck
::
type_convert
<
ck
::
index_t
>
(
input_right_pads
.
size
())
!=
num_dim_spatial
)
{
throw
(
std
::
runtime_error
(
"ConvParams::GetOutputSpatialLengths: "
"parameter size is different from number of declared dimensions!"
));
}
}
std
::
vector
<
ck
::
index_t
>
ConvParams
::
GetOutputSpatialLengths
()
const
{
if
(
ck
::
type_convert
<
ck
::
index_t
>
(
filter_spatial_lengths
.
size
())
!=
num_dim_spatial
||
ck
::
type_convert
<
ck
::
index_t
>
(
input_spatial_lengths
.
size
())
!=
num_dim_spatial
||
ck
::
type_convert
<
ck
::
index_t
>
(
conv_filter_strides
.
size
())
!=
num_dim_spatial
||
ck
::
type_convert
<
ck
::
index_t
>
(
conv_filter_dilations
.
size
())
!=
num_dim_spatial
||
ck
::
type_convert
<
ck
::
index_t
>
(
input_left_pads
.
size
())
!=
num_dim_spatial
||
ck
::
type_convert
<
ck
::
index_t
>
(
input_right_pads
.
size
())
!=
num_dim_spatial
)
{
throw
(
std
::
runtime_error
(
"ConvParams::GetOutputSpatialLengths: "
"parameter size is different from number of declared dimensions!"
));
}
std
::
vector
<
ck
::
index_t
>
out_spatial_len
(
num_dim_spatial
,
0
);
for
(
ck
::
index_t
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
// XEff = (X - 1) * conv_dilation_w + 1;
// Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
const
ck
::
index_t
idx_eff
=
(
filter_spatial_lengths
[
i
]
-
1
)
*
conv_filter_dilations
[
i
]
+
1
;
out_spatial_len
[
i
]
=
(
input_spatial_lengths
[
i
]
+
input_left_pads
[
i
]
+
input_right_pads
[
i
]
-
idx_eff
)
/
conv_filter_strides
[
i
]
+
1
;
}
return
out_spatial_len
;
}
ConvParams
parse_conv_params
(
int
num_dim_spatial
,
int
arg_idx
,
char
*
const
argv
[])
{
ck
::
utils
::
conv
::
ConvParams
params
;
params
.
num_dim_spatial
=
num_dim_spatial
;
params
.
N
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
K
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
C
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
filter_spatial_lengths
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
filter_spatial_lengths
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
params
.
input_spatial_lengths
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
input_spatial_lengths
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
params
.
conv_filter_strides
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
conv_filter_strides
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
params
.
conv_filter_dilations
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
conv_filter_dilations
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
params
.
input_left_pads
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
input_left_pads
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
params
.
input_right_pads
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
input_right_pads
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
return
params
;
}
HostTensorDescriptor
get_output_host_tensor_descriptor
(
const
std
::
vector
<
std
::
size_t
>&
dims
,
int
num_dim_spatial
)
{
namespace
tl
=
ck
::
tensor_layout
::
convolution
;
switch
(
num_dim_spatial
)
{
case
3
:
{
return
ck
::
utils
::
conv
::
get_host_tensor_descriptor
(
dims
,
tl
::
NDHWK
{});
}
case
2
:
{
return
ck
::
utils
::
conv
::
get_host_tensor_descriptor
(
dims
,
tl
::
NHWK
{});
}
case
1
:
{
return
ck
::
utils
::
conv
::
get_host_tensor_descriptor
(
dims
,
tl
::
NWK
{});
}
default:
{
throw
std
::
runtime_error
(
"Unsupported number of spatial dimensions provided!"
);
}
}
}
HostTensorDescriptor
get_filters_host_tensor_descriptor
(
const
std
::
vector
<
std
::
size_t
>&
dims
,
int
num_dim_spatial
)
{
namespace
tl
=
ck
::
tensor_layout
::
convolution
;
switch
(
num_dim_spatial
)
{
case
3
:
{
return
ck
::
utils
::
conv
::
get_host_tensor_descriptor
(
dims
,
tl
::
KZYXC
{});
}
case
2
:
{
return
ck
::
utils
::
conv
::
get_host_tensor_descriptor
(
dims
,
tl
::
KYXC
{});
}
case
1
:
{
return
ck
::
utils
::
conv
::
get_host_tensor_descriptor
(
dims
,
tl
::
KXC
{});
}
default:
{
throw
std
::
runtime_error
(
"Unsupported number of spatial dimensions provided!"
);
}
}
}
HostTensorDescriptor
get_input_host_tensor_descriptor
(
const
std
::
vector
<
std
::
size_t
>&
dims
,
int
num_dim_spatial
)
{
namespace
tl
=
ck
::
tensor_layout
::
convolution
;
switch
(
num_dim_spatial
)
{
case
3
:
{
return
ck
::
utils
::
conv
::
get_host_tensor_descriptor
(
dims
,
tl
::
NDHWC
{});
}
case
2
:
{
return
ck
::
utils
::
conv
::
get_host_tensor_descriptor
(
dims
,
tl
::
NHWC
{});
}
case
1
:
{
return
ck
::
utils
::
conv
::
get_host_tensor_descriptor
(
dims
,
tl
::
NWC
{});
}
default:
{
throw
std
::
runtime_error
(
"Unsupported number of spatial dimensions provided!"
);
}
}
}
}
// namespace conv
}
// namespace utils
}
// namespace ck
std
::
ostream
&
operator
<<
(
std
::
ostream
&
os
,
const
ck
::
utils
::
conv
::
ConvParams
&
p
)
{
os
<<
"ConvParams {"
<<
"
\n
num_dim_spatial: "
<<
p
.
num_dim_spatial
<<
"
\n
N: "
<<
p
.
N
<<
"
\n
K: "
<<
p
.
K
<<
"
\n
C: "
<<
p
.
C
<<
"
\n
filter_spatial_lengths: "
<<
p
.
filter_spatial_lengths
<<
"
\n
input_spatial_lengths: "
<<
p
.
input_spatial_lengths
<<
"
\n
conv_filter_strides: "
<<
p
.
conv_filter_strides
<<
"
\n
conv_filter_dilations: "
<<
p
.
conv_filter_dilations
<<
"
\n
input_left_pads: "
<<
p
.
input_left_pads
<<
"
\n
input_right_pads: "
<<
p
.
input_right_pads
;
return
os
;
}
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment