gaoqiong / composable_kernel · Commits · 1c62b47b
"git@developer.sourcefind.cn:OpenDAS/ollama.git" did not exist on "2103a5073c2e3de3e2bedeb3f406354343af0885"
Commit 1c62b47b · authored Feb 22, 2021 by Chao Liu

clean up

parent c9b52bce
Showing 10 changed files with 0 additions and 1964 deletions (+0 −1964)
composable_kernel/include/kernel_algorithm/gridwise_col2im_eb_nchw.hpp  +0 −133
composable_kernel/include/kernel_algorithm/gridwise_dynamic_col2im_gemmkgemmn_nchw.hpp  +0 −255
composable_kernel/include/tensor_description/dynamic_tensor_coordinate_v1.hpp  +0 −274
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper_v1.hpp  +0 −35
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_v1.hpp  +0 −342
driver/CMakeLists.txt  +0 −4
driver/include/device_col2im_eb_nchw.hpp  +0 −119
driver/include/device_dynamic_col2im_gemmkgemmn_nchw.hpp  +0 −130
driver/include/host_col2im.hpp  +0 −77
driver/src/col2im_driver.cpp  +0 −595
composable_kernel/include/kernel_algorithm/gridwise_col2im_eb_nchw.hpp
deleted 100644 → 0 · view file @ c9b52bce
#ifndef CK_GRIDWISE_COL2IM_EB_NCHW_HPP
#define CK_GRIDWISE_COL2IM_EB_NCHW_HPP

#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"

namespace ck {

// B = merge(N, Ho, Wo)
template <index_t GridSize,
          index_t BlockSize,
          typename Float,
          typename ColGlobalDesc,
          typename ImgGlobalDesc,
          typename FilterSizes,
          typename OutputSizes,
          typename ConvStrides,
          typename ConvDilations,
          typename LeftPads,
          typename RightPads,
          index_t EPerBlock,
          index_t BPerBlock,
          typename BlockCopySubLengths_E_B,
          typename BlockCopyClusterLengths_E_B,
          typename BlockCopyThreadClusterArrangeOrder,
          typename BlockCopySrcAccessOrder,
          typename BlockCopyDstAccessOrder,
          index_t BlockCopyDataPerAccess_B>
struct GridwiseCol2Im_eb_nchw
{
    __device__ void Run(const Float* const __restrict__ p_col_global,
                        Float* const __restrict__ p_img_global) const
    {
        constexpr auto col_e_b_global_desc       = ColGlobalDesc{};
        constexpr auto img_n_c_hi_wi_global_desc = ImgGlobalDesc{};

        constexpr index_t N  = img_n_c_hi_wi_global_desc.GetLengths()[0];
        constexpr index_t C  = img_n_c_hi_wi_global_desc.GetLengths()[1];
        constexpr index_t Hi = img_n_c_hi_wi_global_desc.GetLengths()[2];
        constexpr index_t Wi = img_n_c_hi_wi_global_desc.GetLengths()[3];

        constexpr index_t Ho = OutputSizes{}[0];
        constexpr index_t Wo = OutputSizes{}[1];

        constexpr index_t Y = FilterSizes{}[0];
        constexpr index_t X = FilterSizes{}[1];

        constexpr index_t ConvStrideH = ConvStrides{}[0];
        constexpr index_t ConvStrideW = ConvStrides{}[1];

        constexpr index_t ConvDilationH = ConvDilations{}[0];
        constexpr index_t ConvDilationW = ConvDilations{}[1];

        constexpr index_t E = C * Y * X;
        constexpr index_t B = N * Ho * Wo;

        // sanity-check for vectorized memory load
        static_assert((Wo == 1 || (ConvStrideW == 1 || BlockCopyDataPerAccess_B == 1)) &&
                          (X == 1 || ConvDilationW % BlockCopyDataPerAccess_B == 0),
                      "wrong! alignment requirement for vectorized global load of input tensor "
                      "will be violated");

        // divide block work by [E, B]
        static_assert(E % EPerBlock == 0 && B % BPerBlock == 0,
                      "wrong! cannot divide work evenly among block");

        constexpr index_t EBlockWork = E / EPerBlock;
        constexpr index_t BBlockWork = B / BPerBlock;

        constexpr auto block_work_desc =
            make_cluster_descriptor(Sequence<EBlockWork, BBlockWork>{});

        const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id());

        const index_t e_block_data_on_global = block_work_id[Number<0>{}] * EPerBlock;
        const index_t b_block_data_on_global = block_work_id[Number<1>{}] * BPerBlock;

        // construct img_eb_global_desc
        constexpr auto img_n_c_hip_wip_global_desc = transform_tensor_descriptor(
            img_n_c_hi_wi_global_desc,
            make_tuple(PassThrough<N>{},
                       PassThrough<C>{},
                       Pad<Sequence<Hi, Wi>, LeftPads, RightPads>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}));

        constexpr index_t Hip = img_n_c_hip_wip_global_desc.GetLengths()[2];
        constexpr index_t Wip = img_n_c_hip_wip_global_desc.GetLengths()[3];

        constexpr auto img_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
            img_n_c_hip_wip_global_desc,
            make_tuple(PassThrough<N>{},
                       PassThrough<C>{},
                       Embed<Hip, Sequence<Y, Ho>, Sequence<ConvDilationH, ConvStrideH, 0>>{},
                       Embed<Wip, Sequence<X, Wo>, Sequence<ConvDilationW, ConvStrideW, 0>>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

        constexpr auto img_e_b_global_desc = transform_tensor_descriptor(
            img_n_c_y_ho_x_wo_global_desc,
            make_tuple(Merge<Sequence<C, Y, X>>{}, Merge<Sequence<N, Ho, Wo>>{}),
            make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        // blockwise atomic accumulation
        auto blockwise_copy =
            BlockwiseGenericTensorSliceCopy_v4<BlockSize,
                                               decltype(col_e_b_global_desc),
                                               decltype(img_e_b_global_desc),
                                               Sequence<EPerBlock, BPerBlock>,
                                               BlockCopySubLengths_E_B,
                                               BlockCopyClusterLengths_E_B,
                                               BlockCopyThreadClusterArrangeOrder,
                                               BlockCopySrcAccessOrder,
                                               BlockCopyDstAccessOrder,
                                               1,
                                               1,
                                               BlockCopyDataPerAccess_B,
                                               BlockCopyDataPerAccess_B,
                                               AddressSpace::Vgpr,
                                               AddressSpace::Vgpr,
                                               AddressSpace::Global,
                                               InMemoryDataOperation::AtomicAdd>(
                make_multi_index(e_block_data_on_global, b_block_data_on_global),
                make_multi_index(e_block_data_on_global, b_block_data_on_global));

        // blockwise copy
        blockwise_copy.Run(p_col_global, p_img_global);
    }
};

} // namespace ck
#endif
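For orientation, the Merge transforms above realize the usual im2col flattening, with the kernel scattering each column entry back into the padded image. A minimal host-side sketch of that index algebra (plain C++; the helper map_to_col is hypothetical, not part of this repository):

#include <cassert>

// Hypothetical helper illustrating the flattening used above:
//   column row    e = c * (Y * X) + y * X + x        (E = C * Y * X)
//   column column b = n * (Ho * Wo) + ho * Wo + wo   (B = N * Ho * Wo)
// plus the padded-image pixel each (e, b) pair touches.
struct ColIndex { int e, b, hip, wip; };

ColIndex map_to_col(int n, int c, int y, int x, int ho, int wo,
                    int Y, int X, int Ho, int Wo,
                    int stride_h, int stride_w, int dil_h, int dil_w)
{
    ColIndex r;
    r.e   = c * (Y * X) + y * X + x;
    r.b   = n * (Ho * Wo) + ho * Wo + wo;
    r.hip = ho * stride_h + y * dil_h; // row in the implicitly padded image
    r.wip = wo * stride_w + x * dil_w;
    return r;
}

int main()
{
    // 3x3 filter, stride 2, dilation 1, 8x8 output
    ColIndex r = map_to_col(/*n=*/0, /*c=*/2, /*y=*/1, /*x=*/2, /*ho=*/3, /*wo=*/4,
                            /*Y=*/3, /*X=*/3, /*Ho=*/8, /*Wo=*/8, 2, 2, 1, 1);
    assert(r.e == 2 * 9 + 1 * 3 + 2); // 23
    assert(r.b == 3 * 8 + 4);         // 28
    assert(r.hip == 3 * 2 + 1 && r.wip == 4 * 2 + 2);
    return 0;
}

Under this mapping img_e_b_global_desc addresses the same (e, b) grid as col_e_b_global_desc, which is why a single blockwise copy with InMemoryDataOperation::AtomicAdd implements col2im.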
composable_kernel/include/kernel_algorithm/gridwise_dynamic_col2im_gemmkgemmn_nchw.hpp
deleted 100644 → 0 · view file @ c9b52bce
#ifndef CK_GRIDWISE_DYNAMIC_COL2IM_GEMMKGEMMN_NCHW_HPP
#define CK_GRIDWISE_DYNAMIC_COL2IM_GEMMKGEMMN_NCHW_HPP

#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "blockwise_dynamic_tensor_slice_transfer.hpp"

namespace ck {

template <typename... In>
__host__ __device__ constexpr auto
map_img_into_col(const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
                 const MultiIndex<2> out_sizes,
                 const MultiIndex<2> filter_sizes,
                 const MultiIndex<2> conv_strides,
                 const MultiIndex<2> conv_dilations,
                 const MultiIndex<2> in_left_pads,
                 const MultiIndex<2> in_right_pads)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    const index_t N  = in_n_c_hi_wi_global_desc.GetLength(I0);
    const index_t C  = in_n_c_hi_wi_global_desc.GetLength(I1);
    const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
    const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3);

    const index_t Ho = out_sizes[I0];
    const index_t Wo = out_sizes[I1];

    const index_t Y = filter_sizes[I0];
    const index_t X = filter_sizes[I1];

    const index_t ConvStrideH = conv_strides[I0];
    const index_t ConvStrideW = conv_strides[I1];

    const index_t ConvDilationH = conv_dilations[I0];
    const index_t ConvDilationW = conv_dilations[I1];

    const index_t InLeftPadH = in_left_pads[I0];
    const index_t InLeftPadW = in_left_pads[I1];

    const index_t InRightPadH = in_right_pads[I0];
    const index_t InRightPadW = in_right_pads[I1];

    const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
        transform_dynamic_tensor_descriptor(
            in_n_c_hi_wi_global_desc,
            make_tuple(DynamicPassThrough{N},
                       DynamicPassThrough{C},
                       DynamicLeftPad{Hi, InLeftPadH},
                       DynamicLeftPad{Wi, InLeftPadW}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})),
        make_tuple(DynamicPassThrough{N},
                   DynamicPassThrough{C},
                   DynamicRightPad{Hi + InLeftPadH, InRightPadH},
                   DynamicRightPad{Wi + InLeftPadW, InRightPadW}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

    const index_t Hip = in_n_c_hip_wip_global_desc.GetLength(I2);
    const index_t Wip = in_n_c_hip_wip_global_desc.GetLength(I3);

    const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor(
        in_n_c_hip_wip_global_desc,
        make_tuple(DynamicPassThrough{N},
                   DynamicPassThrough{C},
                   DynamicEmbed<2>{make_multi_index(Y, Ho),
                                   make_multi_index(ConvDilationH, ConvStrideH)},
                   DynamicEmbed<2>{make_multi_index(X, Wo),
                                   make_multi_index(ConvDilationW, ConvStrideW)}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

    const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor(
        in_n_c_y_ho_x_wo_global_desc,
        make_tuple(DynamicMerge<3>{make_multi_index(C, Y, X)},
                   DynamicMerge<3>{make_multi_index(N, Ho, Wo)}),
        make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    return in_gemmk_gemmn_global_desc;
}

template <index_t BlockSize,
          index_t GemmKPerBlock,
          index_t GemmNPerBlock,
          typename BlockCopySubLengths_GemmK_GemmN,
          typename BlockCopyClusterLengths_GemmK_GemmN,
          typename BlockCopyThreadClusterArrangeOrder,
          typename BlockCopySrcAccessOrder,
          typename BlockCopyDstAccessOrder,
          index_t BlockCopyDataPerAccess_GemmN>
struct GridwiseDynamicCol2Im_gemmkgemmn_nchw
{
    // this version has scratch memory issue, due to:
    // 1. ThreadwiseDynamicTensorSliceTransfer_v1r1 keeps reference to tensor descriptor
    // 2. threadwise_dynamic_tensor_slice_transfer_v1r1 constructs new tensor coordinate
    template <typename... Col, typename... Img>
    __device__ void Run_r1(const float* const __restrict__ p_col_global,
                           float* const __restrict__ p_img_global,
                           const DynamicTensorDescriptor<Col...>& col_gemmk_gemmn_global_desc,
                           const DynamicTensorDescriptor<Img...>& img_gemmk_gemmn_global_desc) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        const index_t GemmK = col_gemmk_gemmn_global_desc.GetLength(I0);
        const index_t GemmN = col_gemmk_gemmn_global_desc.GetLength(I1);

        // divide block work by GemmN
        const index_t GemmNBlockWork = GemmN / GemmNPerBlock;

        const index_t block_work_id = get_block_1d_id();

        const index_t gemmn_block_data_on_global = block_work_id * GemmNPerBlock;

        auto blockwise_copy =
#if 1
            BlockwiseDynamicTensorSliceTransfer_v1r1<BlockSize,
                                                     float,
                                                     float,
                                                     decltype(col_gemmk_gemmn_global_desc),
                                                     decltype(img_gemmk_gemmn_global_desc),
                                                     Sequence<GemmKPerBlock, GemmNPerBlock>,
                                                     BlockCopySubLengths_GemmK_GemmN,
                                                     BlockCopyClusterLengths_GemmK_GemmN,
                                                     BlockCopyThreadClusterArrangeOrder,
                                                     BlockCopySrcAccessOrder,
                                                     1,
                                                     BlockCopyDataPerAccess_GemmN,
                                                     BlockCopyDataPerAccess_GemmN,
                                                     AddressSpace::Global,
                                                     AddressSpace::Global,
                                                     InMemoryDataOperation::AtomicAdd,
                                                     1,
                                                     1>
#elif 1
            BlockwiseDynamicTensorSliceTransfer_v2r1<BlockSize,
                                                     float,
                                                     float,
                                                     decltype(col_gemmk_gemmn_global_desc),
                                                     decltype(img_gemmk_gemmn_global_desc),
                                                     Sequence<GemmKPerBlock, GemmNPerBlock>,
                                                     BlockCopySubLengths_GemmK_GemmN,
                                                     BlockCopyClusterLengths_GemmK_GemmN,
                                                     BlockCopyThreadClusterArrangeOrder,
                                                     BlockCopySrcAccessOrder,
                                                     BlockCopyDstAccessOrder,
                                                     1,
                                                     1,
                                                     BlockCopyDataPerAccess_GemmN,
                                                     BlockCopyDataPerAccess_GemmN,
                                                     AddressSpace::Global,
                                                     AddressSpace::Global,
                                                     InMemoryDataOperation::AtomicAdd,
                                                     1,
                                                     1>
#endif
            (col_gemmk_gemmn_global_desc,
             make_multi_index(0, gemmn_block_data_on_global),
             img_gemmk_gemmn_global_desc,
             make_multi_index(0, gemmn_block_data_on_global));

        for(index_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
        {
            blockwise_copy.Run(p_col_global, p_img_global);

            blockwise_copy.MoveSrcSliceWindow(make_multi_index(GemmKPerBlock, 0));
            blockwise_copy.MoveDstSliceWindow(make_multi_index(GemmKPerBlock, 0));
        }
    }

    // this version does not have scratch memory issue, due to:
    // 1. ThreadwiseDynamicTensorSliceTransfer_v1r2 does not keep reference to tensor descriptor
    // 2. threadwise_dynamic_tensor_slice_transfer_v1r2 does not construct new tensor coordinate
    template <typename... Col, typename... Img>
    __device__ void Run_r2(const float* const __restrict__ p_col_global,
                           float* const __restrict__ p_img_global,
                           const DynamicTensorDescriptor<Col...>& col_gemmk_gemmn_global_desc,
                           const DynamicTensorDescriptor<Img...>& img_gemmk_gemmn_global_desc) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        const index_t GemmK = col_gemmk_gemmn_global_desc.GetLength(I0);
        const index_t GemmN = col_gemmk_gemmn_global_desc.GetLength(I1);

        // divide block work by GemmN
        const index_t GemmNBlockWork = GemmN / GemmNPerBlock;

        const index_t block_work_id = get_block_1d_id();

        const index_t gemmn_block_data_on_global = block_work_id * GemmNPerBlock;

        auto blockwise_copy =
            BlockwiseDynamicTensorSliceTransfer_v2r2<BlockSize,
                                                     float,
                                                     float,
                                                     decltype(col_gemmk_gemmn_global_desc),
                                                     decltype(img_gemmk_gemmn_global_desc),
                                                     Sequence<GemmKPerBlock, GemmNPerBlock>,
                                                     BlockCopySubLengths_GemmK_GemmN,
                                                     BlockCopyClusterLengths_GemmK_GemmN,
                                                     BlockCopyThreadClusterArrangeOrder,
                                                     BlockCopySrcAccessOrder,
                                                     BlockCopyDstAccessOrder,
                                                     1,
                                                     1,
                                                     BlockCopyDataPerAccess_GemmN,
                                                     BlockCopyDataPerAccess_GemmN,
                                                     AddressSpace::Global,
                                                     AddressSpace::Global,
                                                     InMemoryDataOperation::AtomicAdd,
                                                     1,
                                                     1>(
                col_gemmk_gemmn_global_desc,
                make_multi_index(0, gemmn_block_data_on_global),
                img_gemmk_gemmn_global_desc,
                make_multi_index(0, gemmn_block_data_on_global));

        for(index_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
        {
            blockwise_copy.Run(col_gemmk_gemmn_global_desc,
                               p_col_global,
                               img_gemmk_gemmn_global_desc,
                               p_img_global);

            blockwise_copy.MoveSrcSliceWindow(col_gemmk_gemmn_global_desc,
                                              make_multi_index(GemmKPerBlock, 0));
            blockwise_copy.MoveDstSliceWindow(img_gemmk_gemmn_global_desc,
                                              make_multi_index(GemmKPerBlock, 0));
        }
    }

    template <typename... Col, typename... Img>
    __device__ void Run(const float* const __restrict__ p_col_global,
                        float* const __restrict__ p_img_global,
                        const DynamicTensorDescriptor<Col...>& col_gemmk_gemmn_global_desc,
                        const DynamicTensorDescriptor<Img...>& img_gemmk_gemmn_global_desc) const
    {
        Run_r2(p_col_global,
               p_img_global,
               col_gemmk_gemmn_global_desc,
               img_gemmk_gemmn_global_desc);
    }
};

} // namespace ck
#endif
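The comments on Run_r1 and Run_r2 attribute the scratch-memory issue to the transfer object holding a reference to the tensor descriptor. A minimal host-side illustration of that difference (toy types for illustration, not the CK transfer classes):

#include <cstdio>

// Run_r1's transfer stores a reference to the descriptor as a member, so the
// descriptor object must stay materialized for the transfer's lifetime;
// Run_r2's transfer takes the descriptor as an argument on every call, so
// nothing long-lived has to be kept around (or spilled to scratch on a GPU).
struct Desc { int len0, len1; };

struct TransferR1 {
    const Desc& desc; // stored reference: descriptor must outlive the transfer
    void Run() const { std::printf("tile %d x %d\n", desc.len0, desc.len1); }
};

struct TransferR2 {
    // descriptor passed per call: free to live in registers at the call site
    void Run(const Desc& desc) const { std::printf("tile %d x %d\n", desc.len0, desc.len1); }
};

int main()
{
    Desc d{8, 128};
    TransferR1{d}.Run();
    TransferR2{}.Run(d);
    return 0;
}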
composable_kernel/include/tensor_description/dynamic_tensor_coordinate_v1.hpp
deleted 100644 → 0 · view file @ c9b52bce
#ifndef CK_DYNAMIC_TENSOR_COORDINATE_V1_HPP
#define CK_DYNAMIC_TENSOR_COORDINATE_V1_HPP

#include "common_header.hpp"
#include "dynamic_tensor_descriptor_v1.hpp"

namespace ck {

// A "tensor coordinate" is an opaque object that represents a "point of location" inside a
// tensor. At the bare minimum, the user should be able to query the following information
// from a tensor coordinate:
//   1. Tensor descriptor
//   2. Location, represented in the form of multi-index
//   3. Location, represented in the form of the offset to the origin of the tensor
//   4. Whether the location is inside an invalid area, e.g. the padding area of an implicitly
//      padded tensor is considered invalid, because the padding area doesn't have any physical
//      memory allocation
// A tensor coordinate also provides the following functionality:
//   1. Given a step size in each dimension, update itself, or return a new tensor coordinate,
//      so the user can freely move the "point of location" inside the tensor

// wrapper class for DynamicNativeTensorCoordinate_v1 and DynamicTransformedTensorCoordinate_v1
template <typename TensorDesc>
struct DynamicTensorCoordinate_v1;

// tensor coordinate for native tensor
template <typename TensorDesc>
struct DynamicNativeTensorCoordinate_v1
{
    using type             = DynamicNativeTensorCoordinate_v1;
    using tensor_desc_type = TensorDesc;

    static constexpr index_t NDim = tensor_desc_type::GetNumOfDimension();

    using Index = MultiIndex<NDim>;

    __host__ __device__ constexpr DynamicNativeTensorCoordinate_v1(
        const tensor_desc_type& tensor_desc, const Index& idx)
        : tensor_desc_{tensor_desc}, idx_{idx}, offset_{tensor_desc.CalculateOffset(idx)}
    {
    }

    __host__ __device__ constexpr auto GetTensorDescriptor() const { return tensor_desc_; }

    __host__ __device__ constexpr const auto& GetUpperIndex() const { return idx_; }

    __host__ __device__ constexpr const auto& GetIndex() const { return idx_; }

    __host__ __device__ constexpr const index_t& GetOffset() const { return offset_; }

    __host__ __device__ constexpr type operator+=(const Index& idx_diff)
    {
        // idx_ is updated here, but some (or all) of its entries may never be used
        // compiler should remove those entries as dead code
        idx_ += idx_diff;

        offset_ += tensor_desc_.CalculateOffsetDiff(idx_diff);

        return *this;
    }

    __host__ __device__ constexpr type operator-=(const Index& idx_diff)
    {
        // idx_ is updated here, but some (or all) of its entries may never be used
        // compiler should remove those entries as dead code
        idx_ -= idx_diff;

        offset_ -= tensor_desc_.CalculateOffsetDiff(idx_diff);

        return *this;
    }

    __host__ __device__ constexpr type operator+(const Index& idx_diff) const
    {
        type coord = *this;
        coord += idx_diff;
        return coord;
    }

    __host__ __device__ constexpr type operator-(const Index& idx_diff) const
    {
        type coord = *this;
        coord -= idx_diff;
        return coord;
    }

    __host__ __device__ constexpr index_t CalculateOffsetDiff(const Index& idx_diff) const
    {
        return tensor_desc_.CalculateOffsetDiff(idx_diff);
    }

    // evaluated at run-time
    __host__ __device__ constexpr bool IsUpperIndexValid() const
    {
        return tensor_desc_.IsUpperIndexValid(idx_);
    }

    // evaluated at run-time
    __host__ __device__ constexpr bool IsOffsetValid() const
    {
        // For native tensor, offset is valid if upper-index is valid
        return IsUpperIndexValid();
    }

    // evaluated at compile-time
    __host__ __device__ static constexpr bool IsOffsetValidAssumingUpperIndexIsValid()
    {
        return true;
    }

    private:
    const tensor_desc_type tensor_desc_;

    // idx_ may be saved and updated, however, the value of some (or all) of its entries may
    // never be used. Compiler should be able to remove these entries as well as their
    // calculation as dead code.
    // TODO: make sure compiler indeed removes this dead code
    Index idx_;
    index_t offset_;
};

// tensor coordinate for transformed tensor
template <typename TensorDesc>
struct DynamicTransformedTensorCoordinate_v1
{
    static constexpr index_t NDimUp = TensorDesc::GetNumOfDimension();

    using UpperDesc  = TensorDesc;
    using UpperCoord = DynamicTransformedTensorCoordinate_v1;
    using UpperIndex = MultiIndex<NDimUp>;

    using LowerDesc  = typename UpperDesc::LowerDesc;
    using LowerCoord = typename DynamicTensorCoordinate_v1<LowerDesc>::type;

    __host__ __device__ constexpr DynamicTransformedTensorCoordinate_v1(
        const UpperDesc& tensor_desc_up, const UpperIndex& idx_up)
        : tensor_desc_up_{tensor_desc_up},
          idx_up_{idx_up},
          coord_low_{tensor_desc_up.GetLowerTensorDescriptor(),
                     tensor_desc_up.CalculateLowerIndex(idx_up)}
    {
    }

    __host__ __device__ constexpr auto GetTensorDescriptor() const { return tensor_desc_up_; }

    __host__ __device__ constexpr const LowerCoord& GetLowerCoordinate() const
    {
        return coord_low_;
    }

    __host__ __device__ constexpr const UpperIndex& GetUpperIndex() const { return idx_up_; }

    __host__ __device__ constexpr const UpperIndex& GetIndex() const { return idx_up_; }

    __host__ __device__ constexpr const index_t& GetOffset() const
    {
        return GetLowerCoordinate().GetOffset();
    }

    __host__ __device__ constexpr UpperCoord operator+=(const UpperIndex& idx_up_diff)
    {
        // For transformation of multi-index difference, not all transformation functions need
        // to know the old lower-index or the old upper-index. We pass both of them to the
        // transformation function. The transformation function itself decides to use them or
        // not.
        coord_low_ += tensor_desc_up_.CalculateLowerIndexDiff(
            idx_up_diff, GetLowerCoordinate().GetIndex(), GetIndex());

        // idx_up_ is updated here, but some (or all) of its entries may never be used
        // compiler should remove those entries as dead code
        idx_up_ += idx_up_diff;

        return *this;
    }

    __host__ __device__ constexpr UpperCoord operator-=(const UpperIndex& idx_up_diff)
    {
        coord_low_ -= tensor_desc_up_.CalculateLowerIndexDiff(
            idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());

        // idx_up_ is updated here, but some (or all) of its entries may never be used
        // compiler should remove those entries as dead code
        idx_up_ -= idx_up_diff;

        return *this;
    }

    __host__ __device__ constexpr UpperCoord operator+(const UpperIndex& idx_up_diff) const
    {
        UpperCoord coord_up = *this;
        coord_up += idx_up_diff;
        return coord_up;
    }

    __host__ __device__ constexpr UpperCoord operator-(const UpperIndex& idx_up_diff) const
    {
        UpperCoord coord_up = *this;
        coord_up -= idx_up_diff;
        return coord_up;
    }

    // Calculate offset diff without updating the tensor coordinate.
    // If idx_up_diff is known at compile time, and has only non-zero entries on linear
    // dimensions, then all calculation can be done at compile time.
    // TODO: this function is not compiled to the expected ISA
    __host__ __device__ constexpr index_t CalculateOffsetDiff(const UpperIndex& idx_up_diff) const
    {
        // For transformation of multi-index difference, not all transformation functions need
        // to know the old lower-index or the old upper-index. We pass both of them to the
        // transformation function. The transformation function itself decides to use them or
        // not.
        const auto idx_low_diff =
            tensor_desc_up_.CalculateLowerIndexDiff(idx_up_diff, coord_low_.GetIndex(), idx_up_);

        return coord_low_.CalculateOffsetDiff(idx_low_diff);
    }

    // evaluated at run-time
    __host__ __device__ constexpr bool IsUpperIndexValid() const
    {
        return tensor_desc_up_.IsUpperIndexValid(idx_up_);
    }

    // evaluated at run-time
    __host__ __device__ constexpr bool IsOffsetValid() const
    {
        return IsUpperIndexValid() && coord_low_.IsOffsetValidAssumingUpperIndexIsValid();
    }

    // most evaluation is done at compile-time
    __host__ __device__ constexpr bool IsOffsetValidAssumingUpperIndexIsValid() const
    {
        return tensor_desc_up_.IsValidUpperIndexMappedToValidLowerIndex(idx_up_) &&
               coord_low_.IsOffsetValidAssumingUpperIndexIsValid();
    }

    private:
    const UpperDesc tensor_desc_up_;

    // idx_up_ may be calculated and updated, however, the value of some (or all) of its
    // entries may never be used. Compiler should be able to remove these entries as well as
    // their calculation as dead code.
    // TODO: make sure compiler indeed removes this dead code
    UpperIndex idx_up_;

    LowerCoord coord_low_;
};

template <index_t NDim>
__host__ __device__ constexpr auto
make_dynamic_tensor_coordinate_v1(const DynamicNativeTensorDescriptor_v1<NDim>& tensor_desc,
                                  const MultiIndex<NDim>& idx)
{
    return DynamicNativeTensorCoordinate_v1<DynamicNativeTensorDescriptor_v1<NDim>>{tensor_desc,
                                                                                    idx};
}

template <index_t NDim, typename... Ts>
__host__ __device__ constexpr auto
make_dynamic_tensor_coordinate_v1(const DynamicTransformedTensorDescriptor_v1<Ts...>& tensor_desc,
                                  const MultiIndex<NDim>& idx)
{
    static_assert(DynamicTransformedTensorDescriptor_v1<Ts...>::GetNumOfDimension() == NDim,
                  "wrong! inconsistent # of dimensions");

    return DynamicTransformedTensorCoordinate_v1<DynamicTransformedTensorDescriptor_v1<Ts...>>{
        tensor_desc, idx};
}

template <typename TensorDesc>
struct DynamicTensorCoordinate_v1
{
    static constexpr index_t NDim = TensorDesc::GetNumOfDimension();

    using type =
        decltype(make_dynamic_tensor_coordinate_v1<NDim>(TensorDesc{}, MultiIndex<NDim>{}));
};

} // namespace ck
#endif
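The point of caching offset_ in DynamicNativeTensorCoordinate_v1 is that operator+= only pays for the step, not for a full re-flattening of the index: offset is updated by dot(idx_diff, strides). A standalone sketch of that incremental-offset invariant (plain C++, hypothetical names):

#include <array>
#include <cassert>

// For a strided (native) tensor, offset = dot(idx, strides), so a move by
// idx_diff can be accounted for with offset += dot(idx_diff, strides)
// instead of recomputing the whole dot product.
constexpr int NDim = 4;
using Index = std::array<int, NDim>;

int dot(const Index& a, const Index& b)
{
    int s = 0;
    for(int i = 0; i < NDim; ++i)
        s += a[i] * b[i];
    return s;
}

int main()
{
    const Index strides{128 * 17 * 17, 17 * 17, 17, 1}; // a packed 4-D layout
    Index idx{0, 3, 5, 7};
    int offset = dot(idx, strides);

    const Index step{0, 0, 1, 2}; // move by (+1, +2) in the last two dimensions
    for(int i = 0; i < NDim; ++i)
        idx[i] += step[i];
    offset += dot(step, strides); // incremental update, as in operator+=

    assert(offset == dot(idx, strides)); // agrees with full recomputation
    return 0;
}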
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper_v1.hpp
deleted 100644 → 0 · view file @ c9b52bce
#ifndef CK_DYNAMIC_TENSOR_DESCRIPTOR_HELPER_V1_HPP
#define CK_DYNAMIC_TENSOR_DESCRIPTOR_HELPER_V1_HPP

#include "common_header.hpp"
#include "dynamic_tensor_descriptor_v1.hpp"

namespace ck {

template <typename Lengths, typename Strides>
__host__ __device__ constexpr auto
make_dynamic_naive_tensor_descriptor_v1(const Lengths& lengths, const Strides& strides)
{
    static_assert(Lengths::Size() == Strides::Size(), "wrong! Size not the same");

    return DynamicNativeTensorDescriptor_v1<Lengths::Size()>(lengths, strides);
}

template <typename LowTensorDescriptor,
          typename Transforms,
          typename LowDimensionIds,
          typename UpDimensionIds>
__host__ __device__ constexpr auto
transform_dynamic_tensor_descriptor_v1(const LowTensorDescriptor& low_tensor_desc,
                                       const Transforms& transforms,
                                       LowDimensionIds,
                                       UpDimensionIds)
{
    return DynamicTransformedTensorDescriptor_v1<LowTensorDescriptor,
                                                 Transforms,
                                                 LowDimensionIds,
                                                 UpDimensionIds>{low_tensor_desc, transforms};
}

} // namespace ck
#endif
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_v1.hpp
deleted 100644 → 0 · view file @ c9b52bce
#ifndef CK_DYNAMIC_TENSOR_DESCRIPTOR_V1_HPP
#define CK_DYNAMIC_TENSOR_DESCRIPTOR_V1_HPP

#include "common_header.hpp"
#include "dynamic_multi_index_transform.hpp"

namespace ck {

template <index_t NDim>
struct DynamicNativeTensorDescriptor_v1
{
    using Index = MultiIndex<NDim>;

    const Index lengths_;
    const Index strides_;

    __host__ __device__ constexpr DynamicNativeTensorDescriptor_v1(const Index& lengths,
                                                                   const Index& strides)
        : lengths_{lengths}, strides_{strides}
    {
    }

    __host__ __device__ constexpr DynamicNativeTensorDescriptor_v1()
        : lengths_{make_zero_multi_index<NDim>()}, strides_{make_zero_multi_index<NDim>()}
    {
    }

    __host__ __device__ static constexpr index_t GetNumOfDimension() { return NDim; }

    __host__ __device__ constexpr auto GetLengths() const { return lengths_; }

    __host__ __device__ constexpr auto GetStrides() const { return strides_; }

    template <index_t IDim>
    __host__ __device__ constexpr index_t GetLength(Number<IDim>) const
    {
        return lengths_[Number<IDim>{}];
    }

    template <index_t IDim>
    __host__ __device__ constexpr index_t GetStride(Number<IDim>) const
    {
        return strides_[Number<IDim>{}];
    }

    __host__ __device__ constexpr index_t GetElementSize() const
    {
        return container_reduce(GetLengths(), math::multiplies<index_t>{}, index_t{1});
    }

    __host__ __device__ constexpr index_t GetElementSpace() const
    {
        index_t space = 1;

        static_for<0, NDim, 1>{}([&](auto i) { space += (GetLength(i) - 1) * GetStride(i); });

        return space;
    }

    template <typename Idx>
    __host__ __device__ constexpr index_t CalculateOffset(const Idx& idx) const
    {
        index_t offset = 0;

        static_for<0, NDim, 1>{}([&](auto i) { offset += idx[i] * GetStride(i); });

        return offset;
    }

    template <typename IdxDiff>
    __host__ __device__ constexpr index_t CalculateOffsetDiff(const IdxDiff& idx_diff) const
    {
        return CalculateOffset(idx_diff);
    }

    template <typename Idx>
    __host__ __device__ constexpr bool IsUpperIndexValid(const Idx& idx) const
    {
        bool flag = true;

        static_for<0, NDim, 1>{}(
            [&](auto i) { flag = flag && idx[i] >= 0 && idx[i] < GetLength(i); });

        return flag;
    }
};

template <typename LowTensorDescriptor, // DynamicNativeTensorDescriptor_v1 or
                                        // DynamicTransformedTensorDescriptor_v1
          typename Transforms,          // Tuple<MultIndexTransforms...>
          typename LowDimensionIds,     // Tuple<Sequence<...>>
          typename UpDimensionIds>      // Tuple<Sequence<...>>
struct DynamicTransformedTensorDescriptor_v1
{
    using LowerDesc = LowTensorDescriptor;
    using UpperDesc = DynamicTransformedTensorDescriptor_v1;

    static constexpr index_t NTransform = Transforms::Size();

    const LowerDesc low_tensor_desc_;
    const Transforms transforms_;

    __host__ __device__ static constexpr index_t GetNumOfLowerDimension()
    {
        return LowerDesc::GetNumOfDimension();
    }

    __host__ __device__ static constexpr index_t GetNumOfUpperDimension()
    {
        index_t ndim_up = 0;

        static_for<0, NTransform, 1>{}([&](auto i) constexpr {
            constexpr auto tmp = UpDimensionIds{}.At(i);
            ndim_up += decltype(tmp)::Size();
        });

        return ndim_up;
    }

    static constexpr index_t NDimUp  = GetNumOfUpperDimension();
    static constexpr index_t NDimLow = GetNumOfLowerDimension();

    using UpperIndex = MultiIndex<NDimUp>;
    using LowerIndex = MultiIndex<NDimLow>;

    struct lambda_merge_sequences
    {
        template <typename... Xs>
        __host__ __device__ constexpr auto operator()(Xs... xs) const
        {
            return merge_sequences(xs...);
        }
    };

    struct lambda_merge_arrays
    {
        template <typename... Xs>
        __host__ __device__ constexpr auto operator()(Xs... xs) const
        {
            return container_cat(xs...);
        }
    };

    __host__ __device__ constexpr DynamicTransformedTensorDescriptor_v1(
        const LowerDesc& low_tensor_desc, const Transforms& transforms)
        : low_tensor_desc_{low_tensor_desc}, transforms_{transforms}
    {
        static_assert(NTransform == Transforms::Size() &&
                          NTransform == LowDimensionIds::Size() &&
                          NTransform == UpDimensionIds::Size(),
                      "wrong! # of transformations not the same");

        // sanity check:
        //   LowDimensionIds should include all low-dimensions,
        //   UpDimensionIds should include all up-dimensions
        using unsorted_up_dimension_ids =
            decltype(unpack(lambda_merge_sequences{}, UpDimensionIds{}));

        using sorted_up_dimension_ids =
            typename sequence_sort<unsorted_up_dimension_ids, math::less<index_t>>::type;

        static_assert(sorted_up_dimension_ids::Size() == NDimUp &&
                          is_valid_sequence_map<sorted_up_dimension_ids>{},
                      "wrong! UpDimensionIds is not configured correctly");

        using unsorted_low_dimension_ids =
            decltype(unpack(lambda_merge_sequences{}, LowDimensionIds{}));

        using sorted_low_dimension_ids =
            typename sequence_sort<unsorted_low_dimension_ids, math::less<index_t>>::type;

        static_assert(sorted_low_dimension_ids::Size() == NDimLow &&
                          is_valid_sequence_map<sorted_low_dimension_ids>{},
                      "wrong! LowDimensionIds is not configured correctly");

        // TODO: sanity check: while an up-dimension could be associated with multiple
        //   transformations, a low-dimension should be associated with only one transformation
        // TODO: sanity check: GetLowerLengths of each transform should be consistent with the
        //   lengths of the lower tensor descriptor
    }

    __host__ __device__ constexpr DynamicTransformedTensorDescriptor_v1()
        : low_tensor_desc_{}, transforms_{}
    {
    }

    __host__ __device__ static constexpr index_t GetNumOfDimension()
    {
        return GetNumOfUpperDimension();
    }

    __host__ __device__ constexpr auto GetUpperLengths() const
    {
        // sort upper-dimension-ids
        constexpr auto unsorted_up_dimension_ids =
            unpack(lambda_merge_sequences{}, UpDimensionIds{});

        using sort_up_dimension_ids = sequence_unique_sort<decltype(unsorted_up_dimension_ids),
                                                           math::less<index_t>,
                                                           math::equal<index_t>>;

        constexpr auto sorted2unsorted_map =
            typename sort_up_dimension_ids::sorted2unsorted_map{};

        // sort upper-lengths
        const auto tuple_of_up_lengths = transform_tuples(
            [](const auto& tran) constexpr { return tran.GetUpperLengths(); }, transforms_);

        const auto unsorted_up_lengths = unpack(lambda_merge_arrays{}, tuple_of_up_lengths);

        const auto sorted_up_lengths =
            container_reorder_given_new2old(unsorted_up_lengths, sorted2unsorted_map);

        return sorted_up_lengths;
    }

    __host__ __device__ constexpr auto GetLengths() const { return GetUpperLengths(); }

    template <index_t IDim>
    __host__ __device__ constexpr index_t GetLength(Number<IDim>) const
    {
        return GetLengths()[Number<IDim>{}];
    }

    __host__ __device__ constexpr index_t GetElementSize() const
    {
        return container_reduce(GetLengths(), math::multiplies<index_t>{}, index_t{1});
    }

    __host__ __device__ constexpr index_t GetElementSpace() const
    {
        return low_tensor_desc_.GetElementSpace();
    }

    __host__ __device__ constexpr auto GetLowerTensorDescriptor() const
    {
        return low_tensor_desc_;
    }

    template <typename LowIdx, typename UpIdx>
    __host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
    {
        static_for<0, NTransform, 1>{}([&](auto itran) constexpr {
            const auto tran = transforms_.At(itran);

            const auto idx_up_part = pick_container_element(idx_up, UpDimensionIds{}.At(itran));
            auto idx_low_part = pick_container_element(idx_low, LowDimensionIds{}.At(itran));

            tran.CalculateLowerIndex(idx_low_part, idx_up_part);
        });
    }

    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
    __host__ __device__ void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
                                                     const UpIdxDiff& idx_up_diff,
                                                     const LowIdx& idx_low_old,
                                                     const UpIdx& idx_up_old) const
    {
        static_for<0, NTransform, 1>{}([&](auto itran) {
            const auto tran = transforms_.At(itran);

            const auto idx_up_diff_part =
                pick_container_element(idx_up_diff, UpDimensionIds{}.At(itran));

            const auto idx_up_old_part =
                pick_container_element(idx_up_old, UpDimensionIds{}.At(itran));

            const auto idx_low_old_part =
                pick_container_element(idx_low_old, LowDimensionIds{}.At(itran));

            auto idx_low_diff_part =
                pick_container_element(idx_low_diff, LowDimensionIds{}.At(itran));

            tran.CalculateLowerIndexDiff(
                idx_low_diff_part, idx_up_diff_part, idx_low_old_part, idx_up_old_part);
        });
    }

    template <typename UpIdx>
    __host__ __device__ constexpr auto CalculateLowerIndex(const UpIdx& idx_up) const
    {
        LowerIndex idx_low;

        CalculateLowerIndex(idx_low, idx_up);

        return idx_low;
    }

    template <typename UpIdxDiff, typename LowIdx, typename UpIdx>
    __host__ __device__ constexpr auto CalculateLowerIndexDiff(const UpIdxDiff& idx_up_diff,
                                                               const LowIdx& idx_low_old,
                                                               const UpIdx& idx_up_old) const
    {
        LowerIndex idx_low_diff;

        CalculateLowerIndexDiff(idx_low_diff, idx_up_diff, idx_low_old, idx_up_old);

        return idx_low_diff;
    }

    __host__ __device__ constexpr index_t CalculateOffset(const UpperIndex& idx_up) const
    {
        return low_tensor_desc_.CalculateOffset(CalculateLowerIndex(idx_up));
    }

    __host__ __device__ constexpr bool IsUpperIndexValid(const UpperIndex& idx_up) const
    {
        bool flag = true;

        static_for<0, NDimUp, 1>{}(
            [&](auto i) { flag = flag && idx_up[i] >= 0 && idx_up[i] < GetLength(i); });

        return flag;
    }

    __host__ __device__ constexpr bool
    IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& idx_up) const
    {
        bool flag = true;

        static_for<0, NTransform, 1>{}([&](auto itran) {
            const auto tran = Transforms{}.At(itran);

            // only check a transformation if it does not always map a valid upper index to a
            // valid lower index
            constexpr bool is_valid_up_always_mapped_to_valid_low =
                decltype(tran)::IsValidUpperIndexAlwaysMappedToValidLowerIndex();

            if constexpr(!is_valid_up_always_mapped_to_valid_low)
            {
                const auto up_dims_part = UpDimensionIds{}.At(itran);
                const auto idx_up_part  = pick_container_element(idx_up, up_dims_part);

                flag = flag && tran.IsValidUpperIndexMappedToValidLowerIndex(idx_up_part);
            }
        });

        return flag;
    }
};

} // namespace ck
#endif
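GetElementSpace() above computes the buffer extent as one plus the offset of the last addressable element, i.e. 1 + sum_i (len_i - 1) * stride_i. A standalone check of that formula (plain C++; the packed strides are an assumed example):

#include <array>
#include <cassert>

// For a strided tensor, the last element lives at offset
// sum_i (len_i - 1) * stride_i, so the required buffer size is that plus one.
int main()
{
    const std::array<int, 4> len{2, 3, 4, 5};
    const std::array<int, 4> stride{60, 20, 5, 1}; // packed layout

    int space = 1;
    for(int i = 0; i < 4; ++i)
        space += (len[i] - 1) * stride[i];

    assert(space == 2 * 3 * 4 * 5); // packed case: space equals the element count
    return 0;
}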
driver/CMakeLists.txt
view file @ 1c62b47b
@@ -17,17 +17,13 @@ install(TARGETS host LIBRARY DESTINATION lib)
 if(DEVICE_BACKEND STREQUAL "AMD")
     set(CONV_SOURCE src/conv_driver.cpp)
     set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cpp)
-    set(COL2IM_SOURCE src/col2im_driver.cpp)
 elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
     set(CONV_SOURCE src/conv_driver.cu)
     set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cu)
-    set(COL2IM_SOURCE src/col2im_driver.cu)
 endif()

 add_executable(conv_driver ${CONV_SOURCE})
 add_executable(conv_bwd_data_driver ${CONV_BWD_DATA_SOURCE})
-add_executable(col2im_driver ${COL2IM_SOURCE})

 target_link_libraries(conv_driver PRIVATE host)
 target_link_libraries(conv_bwd_data_driver PRIVATE host)
-target_link_libraries(col2im_driver PRIVATE host)
driver/include/device_col2im_eb_nchw.hpp
deleted 100644 → 0 · view file @ c9b52bce
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_col2im_eb_nchw.hpp"

template <typename T,
          typename ColDesc,
          typename ImgDesc,
          typename FilterSizes,
          typename OutputSizes,
          typename ConvStrides,
          typename ConvDilations,
          typename LeftPads,
          typename RightPads>
void device_col2im_eb_nchw(ColDesc,
                           const Tensor<T>& col_eb,
                           ImgDesc,
                           Tensor<T>& img_nchw,
                           FilterSizes,
                           OutputSizes,
                           ConvStrides,
                           ConvDilations,
                           LeftPads,
                           RightPads,
                           std::size_t nrepeat)
{
    using namespace ck;

    constexpr auto col_eb_desc   = ColDesc{};
    constexpr auto img_nchw_desc = ImgDesc{};

    constexpr index_t N  = img_nchw_desc.GetLengths()[0];
    constexpr index_t C  = img_nchw_desc.GetLengths()[1];
    constexpr index_t Hi = img_nchw_desc.GetLengths()[2];
    constexpr index_t Wi = img_nchw_desc.GetLengths()[3];

    constexpr index_t E = col_eb_desc.GetLengths()[0];
    constexpr index_t B = col_eb_desc.GetLengths()[1];

    std::size_t data_sz = sizeof(T);
    DeviceMem col_eb_device_buf(data_sz * col_eb.mDesc.GetElementSpace());
    DeviceMem img_nchw_device_buf(data_sz * img_nchw.mDesc.GetElementSpace());

    col_eb_device_buf.ToDevice(col_eb.mData.data());
    img_nchw_device_buf.ToDevice(img_nchw.mData.data());

#if 1
    constexpr index_t BlockSize = 256;

    constexpr index_t EPerBlock = 128;
    constexpr index_t BPerBlock = 128;

    using BlockCopySubLengths_E_B            = Sequence<8, 8>;
    using BlockCopyClusterLengths_E_B        = Sequence<16, 16>;
    using BlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
    using BlockCopySrcAccessOrder            = Sequence<0, 1>; // [E, B]
    using BlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, B]

    constexpr index_t BlockCopyDataPerAccess_B = 1;
#endif

    constexpr index_t GridSize =
        ((E + EPerBlock - 1) / EPerBlock) * ((B + BPerBlock - 1) / BPerBlock);

    printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

    constexpr auto gridwise_col2im =
        GridwiseCol2Im_eb_nchw<GridSize,
                               BlockSize,
                               T,
                               ColDesc,
                               ImgDesc,
                               FilterSizes,
                               OutputSizes,
                               ConvStrides,
                               ConvDilations,
                               LeftPads,
                               RightPads,
                               EPerBlock,
                               BPerBlock,
                               BlockCopySubLengths_E_B,
                               BlockCopyClusterLengths_E_B,
                               BlockCopyThreadClusterArrangeOrder,
                               BlockCopySrcAccessOrder,
                               BlockCopyDstAccessOrder,
                               BlockCopyDataPerAccess_B>{};

    for(index_t i = 0; i < 1; ++i)
    {
        std::cout << "Start running " << nrepeat << " times..." << std::endl;

        KernelTimer timer;
        timer.Start();

        for(index_t j = 0; j < nrepeat; ++j)
        {
            launch_kernel(run_gridwise_operation<decltype(gridwise_col2im),
                                                 const T* const __restrict__,
                                                 T* const __restrict__>,
                          dim3(GridSize),
                          dim3(BlockSize),
                          0,
                          0,
                          const_cast<const T* const __restrict__>(
                              static_cast<T*>(col_eb_device_buf.GetDeviceBuffer())),
                          const_cast<T* const __restrict__>(
                              static_cast<T*>(img_nchw_device_buf.GetDeviceBuffer())));
        }

        timer.End();

        float ave_time = timer.GetElapsedTime() / nrepeat;

        std::cout << "Average time : " << ave_time << " ms" << std::endl;
    }

    img_nchw_device_buf.FromDevice(img_nchw.mData.data());
}
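The GridSize expression above launches one workgroup per EPerBlock x BPerBlock tile, with ceiling division on each axis (the kernel's own static_assert then requires the division to be exact). A small sketch with numbers from the default col2im_driver.cpp config (assumed values: C = 128, Y = X = 3, N = 128, Ho = Wo = 9 for a 17x17 input with 3x3 filter, stride 2, pad 1):

#include <cassert>

constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; }

int main()
{
    constexpr int E = 128 * 3 * 3;   // C * Y * X = 1152
    constexpr int B = 128 * 9 * 9;   // N * Ho * Wo = 10368
    constexpr int EPerBlock = 128, BPerBlock = 128;

    constexpr int GridSize = ceil_div(E, EPerBlock) * ceil_div(B, BPerBlock);
    static_assert(GridSize == 9 * 81, "one workgroup per (E, B) tile");
    return 0;
}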
driver/include/device_dynamic_col2im_gemmkgemmn_nchw.hpp
deleted 100644 → 0 · view file @ c9b52bce
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_dynamic_col2im_gemmkgemmn_nchw.hpp"

template <typename T,
          typename ColDesc,
          typename ImgDesc,
          typename FilterSizes,
          typename OutputSizes,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void device_dynamic_col2im_gemmkgemmn_nchw(ColDesc,
                                           const Tensor<T>& col_gemmk_gemmn,
                                           ImgDesc,
                                           Tensor<T>& img_n_c_hi_wi,
                                           FilterSizes,
                                           OutputSizes,
                                           ConvStrides,
                                           ConvDilations,
                                           InLeftPads,
                                           InRightPads,
                                           std::size_t nrepeat)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    std::size_t data_sz = sizeof(T);
    DeviceMem col_gemmk_gemmn_device_buf(data_sz * col_gemmk_gemmn.mDesc.GetElementSpace());
    DeviceMem img_n_c_hi_wi_device_buf(data_sz * img_n_c_hi_wi.mDesc.GetElementSpace());

    col_gemmk_gemmn_device_buf.ToDevice(col_gemmk_gemmn.mData.data());
    img_n_c_hi_wi_device_buf.ToDevice(img_n_c_hi_wi.mData.data());

    const auto col_gemmk_gemmn_desc = make_dynamic_naive_tensor_descriptor<2>(
        to_multi_index(ColDesc::GetLengths()), to_multi_index(ColDesc::GetStrides()));

    const auto img_n_c_hi_wi_desc = make_dynamic_naive_tensor_descriptor<4>(
        to_multi_index(ImgDesc::GetLengths()), to_multi_index(ImgDesc::GetStrides()));

    const auto filter_sizes   = to_multi_index(FilterSizes{});
    const auto out_sizes      = to_multi_index(OutputSizes{});
    const auto conv_strides   = to_multi_index(ConvStrides{});
    const auto conv_dilations = to_multi_index(ConvDilations{});
    const auto in_left_pads   = to_multi_index(InLeftPads{});
    const auto in_right_pads  = to_multi_index(InRightPads{});

    const auto img_gemmk_gemmn_desc = map_img_into_col(img_n_c_hi_wi_desc,
                                                       out_sizes,
                                                       filter_sizes,
                                                       conv_strides,
                                                       conv_dilations,
                                                       in_left_pads,
                                                       in_right_pads);

    const index_t GemmN = col_gemmk_gemmn_desc.GetLength(I1);

#if 1
    constexpr index_t BlockSize = 256;

    constexpr index_t GemmKPerBlock = 8;
    constexpr index_t GemmNPerBlock = 128;

    using BlockCopySubLengths_GemmK_GemmN     = Sequence<1, 8>;
    using BlockCopyClusterLengths_GemmK_GemmN = Sequence<8, 16>;
    using BlockCopyThreadClusterArrangeOrder  = Sequence<0, 1>; // [GemmK, GemmN]
    using BlockCopySrcAccessOrder             = Sequence<0, 1>; // [GemmK, GemmN]
    using BlockCopyDstAccessOrder             = Sequence<0, 1>; // [GemmK, GemmN]

    constexpr index_t BlockCopyDataPerAccess_GemmN = 1;
#endif

    const index_t GridSize = GemmN / GemmNPerBlock;

    printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

    constexpr auto gridwise_col2im =
        GridwiseDynamicCol2Im_gemmkgemmn_nchw<BlockSize,
                                              GemmKPerBlock,
                                              GemmNPerBlock,
                                              BlockCopySubLengths_GemmK_GemmN,
                                              BlockCopyClusterLengths_GemmK_GemmN,
                                              BlockCopyThreadClusterArrangeOrder,
                                              BlockCopySrcAccessOrder,
                                              BlockCopyDstAccessOrder,
                                              BlockCopyDataPerAccess_GemmN>{};

    for(index_t i = 0; i < 1; ++i)
    {
        std::cout << "Start running " << nrepeat << " times..." << std::endl;

        KernelTimer timer;
        timer.Start();

        for(index_t j = 0; j < nrepeat; ++j)
        {
            launch_kernel(run_gridwise_operation<decltype(gridwise_col2im),
                                                 const T* const __restrict__,
                                                 T* const __restrict__,
                                                 decltype(col_gemmk_gemmn_desc),
                                                 decltype(img_gemmk_gemmn_desc)>,
                          dim3(GridSize),
                          dim3(BlockSize),
                          0,
                          0,
                          const_cast<const T* const __restrict__>(
                              static_cast<T*>(col_gemmk_gemmn_device_buf.GetDeviceBuffer())),
                          const_cast<T* const __restrict__>(
                              static_cast<T*>(img_n_c_hi_wi_device_buf.GetDeviceBuffer())),
                          col_gemmk_gemmn_desc,
                          img_gemmk_gemmn_desc);
        }

        timer.End();

        float ave_time = timer.GetElapsedTime() / nrepeat;

        std::cout << "Average time : " << ave_time << " ms" << std::endl;
    }

    img_n_c_hi_wi_device_buf.FromDevice(img_n_c_hi_wi.mData.data());
}
driver/include/host_col2im.hpp
deleted 100644 → 0 · view file @ c9b52bce
#pragma once
#include <thread>
#include "host_tensor.hpp"

template <typename T,
          typename FilterSizes,
          typename OutputSizes,
          typename ConvStrides,
          typename ConvDilations,
          typename LeftPads,
          typename RightPads>
void host_col2im(const Tensor<T>& in_eb,
                 Tensor<T>& in_nchw,
                 FilterSizes,
                 OutputSizes,
                 ConvStrides,
                 ConvDilations,
                 LeftPads,
                 RightPads)
{
    using namespace ck;

    int N  = in_nchw.mDesc.GetLengths()[0];
    int C  = in_nchw.mDesc.GetLengths()[1];
    int Hi = in_nchw.mDesc.GetLengths()[2];
    int Wi = in_nchw.mDesc.GetLengths()[3];

    int Y = FilterSizes{}[0];
    int X = FilterSizes{}[1];

    int Ho = OutputSizes{}[0];
    int Wo = OutputSizes{}[1];

    auto f = [&](auto n, auto c, auto hi, auto wi) {
        double v = 0;

        for(int y = 0; y < Y; ++y)
        {
            int h_tmp = hi + LeftPads{}[0] - y * ConvDilations{}[0];

            if(h_tmp % ConvStrides{}[0] == 0)
            {
                int ho = h_tmp / ConvStrides{}[0];

                if(ho >= 0 && ho < Ho)
                {
                    for(int x = 0; x < X; ++x)
                    {
                        int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1];

                        if(w_tmp % ConvStrides{}[1] == 0)
                        {
                            int wo = w_tmp / ConvStrides{}[1];

                            if(wo >= 0 && wo < Wo)
                            {
                                int e = c * (Y * X) + y * X + x;
                                int b = n * (Ho * Wo) + ho * Wo + wo;

                                v += in_eb(e, b);
                            }
                        }
                    }
                }
            }
        }

        in_nchw(n, c, hi, wi) = v;
    };

    auto f_par = make_ParallelTensorFunctor(f,
                                            in_nchw.mDesc.GetLengths()[0],
                                            in_nchw.mDesc.GetLengths()[1],
                                            in_nchw.mDesc.GetLengths()[2],
                                            in_nchw.mDesc.GetLengths()[3]);

    f_par(std::thread::hardware_concurrency());
}
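The loop structure above inverts the forward mapping hi = ho * stride_h + y * dil_h - pad_h: for a fixed input row it enumerates the filter taps whose strided output row lands on it. A standalone worked example (plain C++; values taken from the 17x17 / 3x3 / stride-2 / pad-1 config in col2im_driver.cpp):

#include <cstdio>

int main()
{
    const int Y = 3, Ho = 9, pad_h = 1, stride_h = 2, dil_h = 1;
    const int hi = 6; // a fixed input row

    for(int y = 0; y < Y; ++y)
    {
        const int h_tmp = hi + pad_h - y * dil_h;

        if(h_tmp >= 0 && h_tmp % stride_h == 0)
        {
            const int ho = h_tmp / stride_h;

            if(ho >= 0 && ho < Ho)
                std::printf("y=%d contributes via ho=%d\n", y, ho); // prints: y=1 via ho=3
        }
    }
    return 0;
}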
driver/src/col2im_driver.cpp
deleted 100644 → 0 · view file @ c9b52bce
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp"
#include "host_col2im.hpp"
#include "device_col2im_eb_nchw.hpp"
#include "device_dynamic_col2im_gemmkgemmn_nchw.hpp"

int main(int argc, char* argv[])
{
    using namespace ck;

#if 1
    constexpr index_t N = 128, C = 128, HI = 17, WI = 17, K = 128, Y = 3, X = 3;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<1, 1>;
    using RightPads     = Sequence<1, 1>;
#elif 0
    // 3x3, 71x71
    constexpr index_t N = 128, C = 192, HI = 71, WI = 71, K = 128, Y = 3, X = 3;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<1, 1>;
    using RightPads     = Sequence<1, 1>;
#elif 1
    // 1x1, 8x8
    constexpr index_t N = 128, C = 1536, HI = 8, WI = 8, K = 256, Y = 1, X = 1;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 1x1, 73x73
    constexpr index_t N = 128, C = 160, HI = 73, WI = 73, K = 64, Y = 1, X = 1;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 3x3, 35x35
    constexpr index_t N = 128, C = 96, HI = 35, WI = 35, K = 96, Y = 3, X = 3;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<1, 1>;
    using RightPads     = Sequence<1, 1>;
#elif 0
    // 3x3, 71x71
    constexpr index_t N = 128, C = 192, HI = 71, WI = 71, K = 192, Y = 3, X = 3;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<1, 1>;
    using RightPads     = Sequence<1, 1>;
#elif 0
    // 7x1, 17x17
    constexpr index_t N = 128, C = 128, HI = 17, WI = 17, K = 128, Y = 7, X = 1;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<3, 0>;
    using RightPads     = Sequence<3, 0>;
#elif 1
    // 1x7, 17x17
    constexpr index_t N = 128, C = 128, HI = 17, WI = 17, K = 128, Y = 1, X = 7;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 3>;
    using RightPads     = Sequence<0, 3>;
#elif 0
    // 3x3, 299x299 stride=2
    constexpr index_t N = 128, C = 3, HI = 299, WI = 299, K = 32, Y = 3, X = 3;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 3x3, 147x147
    constexpr index_t N = 128, C = 32, HI = 147, WI = 147, K = 64, Y = 3, X = 3;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<1, 1>;
    using RightPads     = Sequence<1, 1>;
#elif 0
    // 3x3, 149x149
    constexpr index_t N = 128, C = 32, HI = 149, WI = 149, K = 32, Y = 3, X = 3;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 3x3, 17x17, stride 2
    constexpr index_t N = 128, C = 192, HI = 17, WI = 17, K = 192, Y = 3, X = 3;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 1x1, 35x35
    constexpr index_t N = 128, C = 384, HI = 35, WI = 35, K = 96, Y = 1, X = 1;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 3x3, 35x35, stride 2
    constexpr index_t N = 128, C = 256, HI = 35, WI = 35, K = 384, Y = 3, X = 3;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 1x3, 8x8
    constexpr index_t N = 128, C = 384, HI = 8, WI = 8, K = 448, Y = 1, X = 3;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 1>;
    using RightPads     = Sequence<0, 1>;
#elif 0
    // 3x1, 8x8
    constexpr index_t N = 128, C = 448, HI = 8, WI = 8, K = 512, Y = 3, X = 1;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<1, 0>;
    using RightPads     = Sequence<1, 0>;
#elif 0
    // 3x3, 147x147
    constexpr index_t N = 128, C = 64, HI = 147, WI = 147, K = 96, Y = 3, X = 3;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 7x1, 73x73
    constexpr index_t N = 128, C = 64, HI = 73, WI = 73, K = 64, Y = 7, X = 1;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<3, 0>;
    using RightPads     = Sequence<3, 0>;
#elif 0
    // 3x3, 73x73
    constexpr index_t N = 128, C = 64, HI = 73, WI = 73, K = 96, Y = 3, X = 3;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 1x1, 14x14, stride 2
    constexpr index_t N = 128, C = 1024, HI = 14, WI = 14, K = 2048, Y = 1, X = 1;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 1x1, 14x14
    constexpr index_t N = 128, C = 1024, HI = 14, WI = 14, K = 256, Y = 1, X = 1;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 1x1, 14x14, stride 2
    constexpr index_t N = 128, C = 1024, HI = 14, WI = 14, K = 512, Y = 1, X = 1;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 1
    // 3x3, 28x28
    constexpr index_t N = 128, C = 192, HI = 28, WI = 28, K = 128, Y = 3, X = 3;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<1, 1>;
    using RightPads     = Sequence<1, 1>;
#elif 0
    // 3x3, 14x14
    constexpr index_t N = 128, C = 256, HI = 14, WI = 14, K = 256, Y = 3, X = 3;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<1, 1>;
    using RightPads     = Sequence<1, 1>;
#elif 0
    // 1x1, 56x56, stride 2
    constexpr index_t N = 128, C = 256, HI = 56, WI = 56, K = 128, Y = 1, X = 1;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 7x7, 230x230 stride=2
    constexpr index_t N = 128, C = 3, HI = 230, WI = 230, K = 64, Y = 7, X = 7;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 1x1, 28x28, stride = 2
    constexpr index_t N = 128, C = 512, HI = 28, WI = 28, K = 1024, Y = 1, X = 1;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 1x1, 28x28, stride 2
    constexpr index_t N = 128, C = 512, HI = 28, WI = 28, K = 256, Y = 1, X = 1;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 1x1, 7x7
    constexpr index_t N = 128, C = 512, HI = 7, WI = 7, K = 2048, Y = 1, X = 1;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 3x3, 7x7
    constexpr index_t N = 128, C = 512, HI = 7, WI = 7, K = 512, Y = 3, X = 3;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<1, 1>;
    using RightPads     = Sequence<1, 1>;
#elif 0
    // 1x1, 56x56
    constexpr index_t N = 128, C = 64, HI = 56, WI = 56, K = 64, Y = 1, X = 1;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 3x3, 56x56
    constexpr index_t N = 128, C = 64, HI = 56, WI = 56, K = 64, Y = 3, X = 3;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<1, 1>;
    using RightPads     = Sequence<1, 1>;
#endif

    constexpr auto img_nchw_desc = make_native_tensor_descriptor_packed(Sequence<N, C, HI, WI>{});
    constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence<K, C, Y, X>{});
    constexpr auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor(
        img_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});

    constexpr index_t HO = out_nkhw_desc.GetLengths()[2];
    constexpr index_t WO = out_nkhw_desc.GetLengths()[3];

    constexpr auto col_eb_desc =
        make_native_tensor_descriptor_packed(Sequence<C * Y * X, N * HO * WO>{});

    using FilterSizes = Sequence<Y, X>;
    using OutputSizes = Sequence<HO, WO>;

    ostream_tensor_descriptor(col_eb_desc, std::cout << "col_eb_desc: ");
    ostream_tensor_descriptor(img_nchw_desc, std::cout << "img_nchw_desc: ");
    print_array("FilterSizes", FilterSizes{});
    print_array("OutputSizes", OutputSizes{});
    print_array("LeftPads", LeftPads{});
    print_array("RightPads", RightPads{});
    print_array("ConvStrides", ConvStrides{});
    print_array("ConvDilations", ConvDilations{});

    Tensor<float> col_eb(make_HostTensorDescriptor(col_eb_desc));
    Tensor<float> img_nchw_host(make_HostTensorDescriptor(img_nchw_desc));
    Tensor<float> img_nchw_device(make_HostTensorDescriptor(img_nchw_desc));

    std::size_t num_thread = std::thread::hardware_concurrency();

    if(argc != 3)
    {
        printf("arg1: do_verification, arg2: nrepeat\n");
        exit(1);
    }

    bool do_verification = atoi(argv[1]);
    std::size_t nrepeat  = atoi(argv[2]);

    if(do_verification)
    {
#if 0
        col_eb.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#else
        col_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
#endif
    }

#if 0
    device_col2im_eb_nchw(col_eb_desc,
                          col_eb,
                          img_nchw_desc,
                          img_nchw_device,
                          FilterSizes{},
                          OutputSizes{},
                          ConvStrides{},
                          ConvDilations{},
                          LeftPads{},
                          RightPads{},
                          nrepeat);
#elif 1
    device_dynamic_col2im_gemmkgemmn_nchw(col_eb_desc,
                                          col_eb,
                                          img_nchw_desc,
                                          img_nchw_device,
                                          FilterSizes{},
                                          OutputSizes{},
                                          ConvStrides{},
                                          ConvDilations{},
                                          LeftPads{},
                                          RightPads{},
                                          nrepeat);
#endif

    if(do_verification)
    {
        host_col2im(col_eb,
                    img_nchw_host,
                    FilterSizes{},
                    OutputSizes{},
                    ConvStrides{},
                    ConvDilations{},
                    LeftPads{},
                    RightPads{});

        check_error(img_nchw_host, img_nchw_device);

#if 0
        LogRange(std::cout << "col_eb : ", col_eb.mData, ",") << std::endl;
        LogRange(std::cout << "img_nchw_host : ", img_nchw_host.mData, ",") << std::endl;
        LogRange(std::cout << "img_nchw_device : ", img_nchw_device.mData, ",") << std::endl;
#endif
    }
}
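As the argc check above indicates, the driver took two positional arguments, e.g. col2im_driver 1 10 to verify the device result against host_col2im and average the kernel time over 10 runs (the binary name assumes the add_executable(col2im_driver ...) rule that this commit removes from driver/CMakeLists.txt).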