Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
711701f3
Commit
711701f3
authored
Oct 08, 2020
by
Chao Liu
Browse files
adding dynamic copy
parent
176d9fc0
Changes
12
Show whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
1092 additions
and
449 deletions
+1092
-449
composable_kernel/include/kernel_algorithm/dynamic_gridwise_col2im_eb_nchw.hpp
...lude/kernel_algorithm/dynamic_gridwise_col2im_eb_nchw.hpp
+0
-133
composable_kernel/include/kernel_algorithm/dynamic_gridwise_col2im_gemmkgemmn_nchw.hpp
...nel_algorithm/dynamic_gridwise_col2im_gemmkgemmn_nchw.hpp
+171
-0
composable_kernel/include/tensor_description/cluster_descriptor.hpp
..._kernel/include/tensor_description/cluster_descriptor.hpp
+48
-0
composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp
.../include/tensor_description/dynamic_tensor_descriptor.hpp
+14
-7
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp
...l/include/tensor_description/tensor_descriptor_helper.hpp
+0
-36
composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer.hpp
...sor_operation/blockwise_dynamic_tensor_slice_transfer.hpp
+165
-0
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
.../tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+1
-0
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp
...or_operation/threadwise_dynamic_tensor_slice_transfer.hpp
+199
-0
driver/include/device_dynamic_col2im_eb_nchw.hpp
driver/include/device_dynamic_col2im_eb_nchw.hpp
+0
-120
driver/include/device_dynamic_col2im_gemmkgemmn_nchw.hpp
driver/include/device_dynamic_col2im_gemmkgemmn_nchw.hpp
+130
-0
driver/src/col2im_driver.cpp
driver/src/col2im_driver.cpp
+351
-140
driver/src/conv_driver.cpp
driver/src/conv_driver.cpp
+13
-13
No files found.
composable_kernel/include/kernel_algorithm/dynamic_gridwise_col2im_eb_nchw.hpp
deleted
100644 → 0
View file @
176d9fc0
#ifndef CK_DYNAMIC_GRIDWISE_COL2IM_EB_NCHW_HPP
#define CK_DYNAMIC_GRIDWISE_COL2IM_EB_NCHW_HPP

#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"

namespace ck {

// Gridwise col2im for NCHW images using the static (compile-time) tensor descriptors.
// Scatters a column matrix col[E, B] back into an image img[N, C, Hi, Wi], where
//   E = C * Y * X   (filter window elements per channel)
//   B = merge(N, Ho, Wo)
// Overlapping filter windows are accumulated via atomic adds (see the blockwise copy's
// InMemoryDataOperation::AtomicAdd below).
template <index_t GridSize,
          index_t BlockSize,
          typename Float,
          typename ColGlobalDesc,
          typename ImgGlobalDesc,
          typename FilterSizes,
          typename OutputSizes,
          typename ConvStrides,
          typename ConvDilations,
          typename LeftPads,
          typename RightPads,
          index_t EPerBlock,
          index_t BPerBlock,
          typename BlockCopySubLengths_E_B,
          typename BlockCopyClusterLengths_E_B,
          typename BlockCopyThreadClusterArrangeOrder,
          typename BlockCopySrcAccessOrder,
          typename BlockCopyDstAccessOrder,
          index_t BlockCopyDataPerAccess_B>
struct DynamicGridwiseCol2Im_eb_nchw
{
    // p_col_global: source column matrix [E, B]; p_img_global: destination image,
    // accumulated atomically (caller is expected to provide a zero-initialized image).
    __device__ void Run(const Float* const __restrict__ p_col_global,
                        Float* const __restrict__ p_img_global) const
    {
        constexpr auto col_e_b_global_desc       = ColGlobalDesc{};
        constexpr auto img_n_c_hi_wi_global_desc = ImgGlobalDesc{};

        // image lengths: N = batch, C = channels, Hi/Wi = input spatial sizes
        constexpr index_t N  = img_n_c_hi_wi_global_desc.GetLengths()[0];
        constexpr index_t C  = img_n_c_hi_wi_global_desc.GetLengths()[1];
        constexpr index_t Hi = img_n_c_hi_wi_global_desc.GetLengths()[2];
        constexpr index_t Wi = img_n_c_hi_wi_global_desc.GetLengths()[3];

        // convolution output spatial sizes and filter sizes
        constexpr index_t Ho = OutputSizes{}[0];
        constexpr index_t Wo = OutputSizes{}[1];

        constexpr index_t Y = FilterSizes{}[0];
        constexpr index_t X = FilterSizes{}[1];

        constexpr index_t ConvStrideH = ConvStrides{}[0];
        constexpr index_t ConvStrideW = ConvStrides{}[1];

        constexpr index_t ConvDilationH = ConvDilations{}[0];
        constexpr index_t ConvDilationW = ConvDilations{}[1];

        // dimensions of the column matrix
        constexpr index_t E = C * Y * X;
        constexpr index_t B = N * Ho * Wo;

        // sanity-check for vectorized memory load
        static_assert((Wo == 1 || (ConvStrideW == 1 || BlockCopyDataPerAccess_B == 1)) &&
                          (X == 1 || ConvDilationW % BlockCopyDataPerAccess_B == 0),
                      "wrong! aligment requirement for vectorized global load of input tensor will "
                      "be violated");

        // divide block work by [E, B]
        static_assert(E % EPerBlock == 0 && B % BPerBlock == 0,
                      "wrong! cannot divide work evenly among block");

        constexpr index_t EBlockWork = E / EPerBlock;
        constexpr index_t BBlockWork = B / BPerBlock;

        // map the 1-d block id onto the [EBlockWork, BBlockWork] grid
        constexpr auto block_work_desc =
            make_cluster_descriptor(Sequence<EBlockWork, BBlockWork>{});

        const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id());

        const index_t e_block_data_on_global = block_work_id[Number<0>{}] * EPerBlock;
        const index_t b_block_data_on_global = block_work_id[Number<1>{}] * BPerBlock;

        // construct img_eb_global_desc
        // step 1: pad the input spatial dims: [N, C, Hi, Wi] -> [N, C, Hip, Wip]
        constexpr auto img_n_c_hip_wip_global_desc = transform_tensor_descriptor(
            img_n_c_hi_wi_global_desc,
            make_tuple(
                PassThrough<N>{}, PassThrough<C>{}, Pad<Sequence<Hi, Wi>, LeftPads, RightPads>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}));

        constexpr index_t Hip = img_n_c_hip_wip_global_desc.GetLengths()[2];
        constexpr index_t Wip = img_n_c_hip_wip_global_desc.GetLengths()[3];

        // step 2: embed each padded spatial dim into (filter, output) pairs:
        // [N, C, Hip, Wip] -> [N, C, Y, Ho, X, Wo], using dilation/stride as embed coefficients
        constexpr auto img_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
            img_n_c_hip_wip_global_desc,
            make_tuple(PassThrough<N>{},
                       PassThrough<C>{},
                       Embed<Hip, Sequence<Y, Ho>, Sequence<ConvDilationH, ConvStrideH, 0>>{},
                       Embed<Wip, Sequence<X, Wo>, Sequence<ConvDilationW, ConvStrideW, 0>>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

        // step 3: merge into the 2-d column view: [N, C, Y, Ho, X, Wo] -> [E, B]
        constexpr auto img_e_b_global_desc = transform_tensor_descriptor(
            img_n_c_y_ho_x_wo_global_desc,
            make_tuple(Merge<Sequence<C, Y, X>>{}, Merge<Sequence<N, Ho, Wo>>{}),
            make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        // blockwise atomic accumulation
        // NOTE(review): the first two address-space arguments are Vgpr, Vgpr here while the
        // source actually lives in global memory — verify against the
        // BlockwiseGenericTensorSliceCopy_v4 parameter order before relying on this.
        auto blockwise_copy =
            BlockwiseGenericTensorSliceCopy_v4<BlockSize,
                                               decltype(col_e_b_global_desc),
                                               decltype(img_e_b_global_desc),
                                               Sequence<EPerBlock, BPerBlock>,
                                               BlockCopySubLengths_E_B,
                                               BlockCopyClusterLengths_E_B,
                                               BlockCopyThreadClusterArrangeOrder,
                                               BlockCopySrcAccessOrder,
                                               BlockCopyDstAccessOrder,
                                               1,
                                               1,
                                               BlockCopyDataPerAccess_B,
                                               BlockCopyDataPerAccess_B,
                                               AddressSpace::Vgpr,
                                               AddressSpace::Vgpr,
                                               AddressSpace::Global,
                                               InMemoryDataOperation::AtomicAdd>(
                make_multi_index(e_block_data_on_global, b_block_data_on_global),
                make_multi_index(e_block_data_on_global, b_block_data_on_global));

        // blockwise copy
        blockwise_copy.Run(p_col_global, p_img_global);
    }
};

} // namespace ck
#endif
composable_kernel/include/kernel_algorithm/dynamic_gridwise_col2im_gemmkgemmn_nchw.hpp
0 → 100644
View file @
711701f3
#ifndef CK_DYNAMIC_GRIDWISE_COL2IM_GEMMKGEMMN_NCHW_HPP
#define CK_DYNAMIC_GRIDWISE_COL2IM_GEMMKGEMMN_NCHW_HPP

#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "blockwise_dynamic_tensor_slice_transfer.hpp"

namespace ck {

// Builds a runtime (dynamic) 2-d view of an NCHW image that matches the layout of a
// column (im2col) matrix:
//   [N, C, Hi, Wi] -> [GemmK, GemmN]  with GemmK = C*Y*X and GemmN = N*Ho*Wo.
// All sizes are runtime values taken from the descriptor and the MultiIndex<2> arguments
// (each holds the H component at [0] and the W component at [1]).
template <typename... In>
__host__ __device__ constexpr auto
map_img_into_col(const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
                 const MultiIndex<2> out_sizes,
                 const MultiIndex<2> filter_sizes,
                 const MultiIndex<2> conv_strides,
                 const MultiIndex<2> conv_dilations,
                 const MultiIndex<2> in_left_pads,
                 const MultiIndex<2> in_right_pads)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    // image lengths: batch, channels, input spatial sizes
    const index_t N  = in_n_c_hi_wi_global_desc.GetLength(I0);
    const index_t C  = in_n_c_hi_wi_global_desc.GetLength(I1);
    const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
    const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3);

    // convolution output sizes and filter sizes
    const index_t Ho = out_sizes[I0];
    const index_t Wo = out_sizes[I1];

    const index_t Y = filter_sizes[I0];
    const index_t X = filter_sizes[I1];

    const index_t ConvStrideH = conv_strides[I0];
    const index_t ConvStrideW = conv_strides[I1];

    const index_t ConvDilationH = conv_dilations[I0];
    const index_t ConvDilationW = conv_dilations[I1];

    const index_t InLeftPadH = in_left_pads[I0];
    const index_t InLeftPadW = in_left_pads[I1];

    const index_t InRightPadH = in_right_pads[I0];
    const index_t InRightPadW = in_right_pads[I1];

    // step 1: apply left padding, then right padding on top of the left-padded lengths:
    // [N, C, Hi, Wi] -> [N, C, Hip, Wip]
    const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
        transform_dynamic_tensor_descriptor(
            in_n_c_hi_wi_global_desc,
            make_tuple(DynamicPassThrough{N},
                       DynamicPassThrough{C},
                       DynamicLeftPad{Hi, InLeftPadH},
                       DynamicLeftPad{Wi, InLeftPadW}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})),
        make_tuple(DynamicPassThrough{N},
                   DynamicPassThrough{C},
                   // right-pad operates on the already left-padded lengths
                   DynamicRightPad{Hi + InLeftPadH, InRightPadH},
                   DynamicRightPad{Wi + InLeftPadW, InRightPadW}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

    const index_t Hip = in_n_c_hip_wip_global_desc.GetLength(I2);
    const index_t Wip = in_n_c_hip_wip_global_desc.GetLength(I3);

    // step 2: embed each padded spatial dim into a (filter, output) pair using
    // (dilation, stride) coefficients: [N, C, Hip, Wip] -> [N, C, Y, Ho, X, Wo]
    const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor(
        in_n_c_hip_wip_global_desc,
        make_tuple(DynamicPassThrough{N},
                   DynamicPassThrough{C},
                   DynamicEmbed<2>{make_multi_index(Y, Ho),
                                   make_multi_index(ConvDilationH, ConvStrideH)},
                   DynamicEmbed<2>{make_multi_index(X, Wo),
                                   make_multi_index(ConvDilationW, ConvStrideW)}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

    // step 3: merge into the 2-d GEMM view: [N, C, Y, Ho, X, Wo] -> [GemmK, GemmN]
    const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor(
        in_n_c_y_ho_x_wo_global_desc,
        make_tuple(DynamicMerge<3>{make_multi_index(C, Y, X)},
                   DynamicMerge<3>{make_multi_index(N, Ho, Wo)}),
        make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    return in_gemmk_gemmn_global_desc;
}
// Gridwise col2im using the runtime (dynamic) descriptors: accumulates the column
// matrix col[GemmK, GemmN] into the image viewed as img[GemmK, GemmN]
// (see map_img_into_col above). Accumulation uses InMemoryDataOperation::AtomicAdd,
// so overlapping filter windows sum correctly; the caller is expected to provide a
// zero-initialized image.
template <index_t BlockSize,
          index_t GemmKPerBlock,
          index_t GemmNPerBlock,
          typename BlockCopySubLengths_GemmK_GemmN,
          typename BlockCopyClusterLengths_GemmK_GemmN,
          typename BlockCopyThreadClusterArrangeOrder,
          typename BlockCopySrcAccessOrder,
          typename BlockCopyDstAccessOrder,
          index_t BlockCopyDataPerAccess_GemmN>
struct DynamicGridwiseCol2Im_gemmkgemmn_nchw
{
    // Each workgroup owns one GemmN strip of width GemmNPerBlock and walks the whole
    // GemmK extent in GemmKPerBlock steps.
    template <typename... Col, typename... Img>
    __device__ void Run(const float* const __restrict__ p_col_global,
                        float* const __restrict__ p_img_global,
                        const DynamicTensorDescriptor<Col...>& col_gemmk_gemmn_global_desc,
                        const DynamicTensorDescriptor<Img...>& img_gemmk_gemmn_global_desc) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};

        const index_t GemmK = col_gemmk_gemmn_global_desc.GetLength(I0);
        const index_t GemmN = col_gemmk_gemmn_global_desc.GetLength(I1);

        // divide block work by GemmN: block i handles columns [i*GemmNPerBlock, ...)
        const index_t block_work_id = get_block_1d_id();

        const index_t gemmn_block_data_on_global = block_work_id * GemmNPerBlock;

        // blockwise atomic accumulation
        // FIX: pass both descriptors to the 4-argument constructor declared by
        // BlockwiseDynamicTensorSliceTransfer_v1 (the old code passed only the two
        // origin indices).
        auto blockwise_copy =
            BlockwiseDynamicTensorSliceTransfer_v1<BlockSize,
                                                   float,
                                                   float,
                                                   decltype(col_gemmk_gemmn_global_desc),
                                                   decltype(img_gemmk_gemmn_global_desc),
                                                   Sequence<GemmKPerBlock, GemmNPerBlock>,
                                                   BlockCopySubLengths_GemmK_GemmN,
                                                   BlockCopyClusterLengths_GemmK_GemmN,
                                                   BlockCopyThreadClusterArrangeOrder,
                                                   BlockCopySrcAccessOrder,
                                                   BlockCopyDstAccessOrder,
                                                   1,
                                                   1,
                                                   BlockCopyDataPerAccess_GemmN,
                                                   BlockCopyDataPerAccess_GemmN,
                                                   AddressSpace::Global,
                                                   AddressSpace::Global,
                                                   InMemoryDataOperation::AtomicAdd,
                                                   1,
                                                   1>(
                col_gemmk_gemmn_global_desc,
                make_multi_index(0, gemmn_block_data_on_global),
                img_gemmk_gemmn_global_desc,
                make_multi_index(0, gemmn_block_data_on_global));

        // Walk the full GemmK extent.
        // FIX: the old loop condition (gemmk < GemmK - GemmKPerBlock) skipped the last
        // GemmKPerBlock chunk, and it advanced local tensor coordinates that were never
        // read, so the slice window never actually moved. Advance the copier's own
        // slice windows instead, after every chunk except the last.
        for(index_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
        {
            blockwise_copy.Run(p_col_global, p_img_global);

            if(gemmk + GemmKPerBlock < GemmK)
            {
                blockwise_copy.MoveSrcSliceWindow(make_multi_index(GemmKPerBlock, 0));
                blockwise_copy.MoveDstSliceWindow(make_multi_index(GemmKPerBlock, 0));
            }
        }
    }
};

} // namespace ck
#endif
composable_kernel/include/tensor_description/cluster_descriptor.hpp
0 → 100644
View file @
711701f3
#ifndef CK_CLUSTER_DESCRIPTOR_HPP
#define CK_CLUSTER_DESCRIPTOR_HPP

#include "common_header.hpp"
// TODO remove dependency on deprecated tensor descriptor
#include "tensor_descriptor.hpp"

namespace ck {

// a cluster map 1d index to N-d index
// Maps a flat (e.g. thread or block) id onto an N-d grid of Lengths, with the
// dimensions walked in ArrangeOrder (fastest-varying decided by the merge below).
template <typename Lengths, typename ArrangeOrder>
struct ClusterDescriptor
{
    static constexpr index_t nDim = Lengths::Size();

    // Packed N-d descriptor whose dims are merged (in ArrangeOrder) into one dim,
    // so CalculateLowerIndex can invert a 1-d offset back to an N-d index.
    static constexpr auto mDesc = transform_tensor_descriptor(
        make_native_tensor_descriptor_packed(Lengths{}),
        make_tuple(Merge<decltype(Lengths::ReorderGivenNew2Old(ArrangeOrder{}))>{}),
        make_tuple(ArrangeOrder{}),
        make_tuple(Sequence<0>{}));

    __host__ __device__ constexpr ClusterDescriptor()
    {
        static_assert(Lengths::Size() == nDim && ArrangeOrder::Size() == nDim,
                      "wrong! size not the same");

        static_assert(is_valid_sequence_map<ArrangeOrder>{}, "wrong! ArrangeOrder is wrong");
    }

    // total number of cluster cells (product of Lengths)
    __host__ __device__ static constexpr index_t GetElementSize()
    {
        return mDesc.GetElementSize();
    }

    // invert a flat id into its N-d cluster index
    __host__ __device__ static constexpr auto CalculateClusterIndex(index_t idx_1d)
    {
        return mDesc.CalculateLowerIndex(MultiIndex<1>{idx_1d});
    }
};

// Factory; ArrangeOrder defaults to the identity order 0, 1, ..., nDim-1.
template <typename Lengths,
          typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type>
__host__ __device__ constexpr auto make_cluster_descriptor(
    Lengths,
    ArrangeOrder order = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type{})
{
    return ClusterDescriptor<Lengths, decltype(order)>{};
}

} // namespace ck
#endif
composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp
View file @
711701f3
...
@@ -21,9 +21,8 @@ __host__ __device__ constexpr auto
...
@@ -21,9 +21,8 @@ __host__ __device__ constexpr auto
make_dynamic_tensor_coordinate_step
(
const
TensorDesc
&
,
const
VisibleIndex
&
idx_diff_visible
);
make_dynamic_tensor_coordinate_step
(
const
TensorDesc
&
,
const
VisibleIndex
&
idx_diff_visible
);
template
<
typename
TensorDesc
,
typename
TensorCoord
,
typename
TensorCoordStep
>
template
<
typename
TensorDesc
,
typename
TensorCoord
,
typename
TensorCoordStep
>
__host__
__device__
void
move_dynamic_tensor_coordinate
(
const
TensorDesc
&
tensor_desc
,
__host__
__device__
constexpr
void
move_dynamic_tensor_coordinate
(
TensorCoord
&
coord
,
const
TensorDesc
&
tensor_desc
,
TensorCoord
&
coord
,
const
TensorCoordStep
&
coord_step
);
const
TensorCoordStep
&
coord_step
);
template
<
typename
TensorDesc
,
typename
TensorCoord
>
template
<
typename
TensorDesc
,
typename
TensorCoord
>
__host__
__device__
constexpr
bool
__host__
__device__
constexpr
bool
...
@@ -129,7 +128,7 @@ struct DynamicTensorDescriptor
...
@@ -129,7 +128,7 @@ struct DynamicTensorDescriptor
{
{
static_assert
(
Idx
::
Size
()
==
GetNumOfDimension
(),
"wrong! inconsistent # of dimension"
);
static_assert
(
Idx
::
Size
()
==
GetNumOfDimension
(),
"wrong! inconsistent # of dimension"
);
#if
0
// debug
#if
1
// debug
return
make_dynamic_tensor_coordinate
(
*
this
,
idx
).
GetOffset
();
return
make_dynamic_tensor_coordinate
(
*
this
,
idx
).
GetOffset
();
#else
#else
constexpr
index_t
ntransform
=
GetNumOfTransform
();
constexpr
index_t
ntransform
=
GetNumOfTransform
();
...
@@ -509,7 +508,7 @@ make_dynamic_tensor_coordinate_step(const TensorDesc&, const VisibleIndex& idx_d
...
@@ -509,7 +508,7 @@ make_dynamic_tensor_coordinate_step(const TensorDesc&, const VisibleIndex& idx_d
}
}
template
<
typename
TensorDesc
,
typename
TensorCoord
,
typename
TensorCoordStep
>
template
<
typename
TensorDesc
,
typename
TensorCoord
,
typename
TensorCoordStep
>
__host__
__device__
void
move_dynamic_tensor_coordinate
(
const
TensorDesc
&
tensor_desc
,
__host__
__device__
constexpr
void
move_dynamic_tensor_coordinate
(
const
TensorDesc
&
tensor_desc
,
TensorCoord
&
coord
,
TensorCoord
&
coord
,
const
TensorCoordStep
&
coord_step
)
const
TensorCoordStep
&
coord_step
)
{
{
...
@@ -608,5 +607,13 @@ __host__ __device__ constexpr bool coordinate_has_valid_offset(const TensorDesc&
...
@@ -608,5 +607,13 @@ __host__ __device__ constexpr bool coordinate_has_valid_offset(const TensorDesc&
coordinate_has_valid_offset_assuming_visible_index_is_valid
(
tensor_desc
,
coord
);
coordinate_has_valid_offset_assuming_visible_index_is_valid
(
tensor_desc
,
coord
);
}
}
template
<
typename
TensorDesc
>
using
DynamicTensorCoordinate_t
=
decltype
(
make_dynamic_tensor_coordinate
(
TensorDesc
{},
MultiIndex
<
TensorDesc
::
GetNumOfDimension
()
>
{}));
template
<
typename
TensorDesc
>
using
DynamicTensorCoordinateStep_t
=
decltype
(
make_dynamic_tensor_coordinate_step
(
TensorDesc
{},
MultiIndex
<
TensorDesc
::
GetNumOfDimension
()
>
{}));
}
// namespace ck
}
// namespace ck
#endif
#endif
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp
View file @
711701f3
...
@@ -172,41 +172,5 @@ __host__ __device__ constexpr auto unfold_tensor_descriptor(NativeTensorDescript
...
@@ -172,41 +172,5 @@ __host__ __device__ constexpr auto unfold_tensor_descriptor(NativeTensorDescript
return
make_native_tensor_descriptor
(
new_lengths
,
new_strides
);
return
make_native_tensor_descriptor
(
new_lengths
,
new_strides
);
}
}
// a cluster map 1d index to N-d index
template
<
typename
Lengths
,
typename
ArrangeOrder
>
struct
ClusterDescriptor
{
static
constexpr
index_t
nDim
=
Lengths
::
Size
();
static
constexpr
auto
mDesc
=
transform_tensor_descriptor
(
make_native_tensor_descriptor_packed
(
Lengths
{}),
make_tuple
(
Merge
<
decltype
(
Lengths
::
ReorderGivenNew2Old
(
ArrangeOrder
{}))
>
{}),
make_tuple
(
ArrangeOrder
{}),
make_tuple
(
Sequence
<
0
>
{}));
__host__
__device__
constexpr
ClusterDescriptor
()
{
static_assert
(
Lengths
::
Size
()
==
nDim
&&
ArrangeOrder
::
Size
()
==
nDim
,
"wrong! size not the same"
);
static_assert
(
is_valid_sequence_map
<
ArrangeOrder
>
{},
"wrong! ArrangeOrder is wrong"
);
}
__host__
__device__
static
constexpr
index_t
GetElementSize
()
{
return
mDesc
.
GetElementSize
();
}
__host__
__device__
static
constexpr
auto
CalculateClusterIndex
(
index_t
idx_1d
)
{
return
mDesc
.
CalculateLowerIndex
(
MultiIndex
<
1
>
{
idx_1d
});
}
};
template
<
typename
Lengths
,
typename
ArrangeOrder
=
typename
arithmetic_sequence_gen
<
0
,
Lengths
::
Size
(),
1
>
::
type
>
__host__
__device__
constexpr
auto
make_cluster_descriptor
(
Lengths
,
ArrangeOrder
order
=
typename
arithmetic_sequence_gen
<
0
,
Lengths
::
Size
(),
1
>::
type
{})
{
return
ClusterDescriptor
<
Lengths
,
decltype
(
order
)
>
{};
}
}
// namespace ck
}
// namespace ck
#endif
#endif
composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer.hpp
0 → 100644
View file @
711701f3
#ifndef CK_BLOCKWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_HPP
#define CK_BLOCKWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_HPP

#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "cluster_descriptor.hpp"
#include "threadwise_dynamic_tensor_slice_transfer.hpp"

namespace ck {

// Cooperative blockwise copy of a BlockSliceLengths window between two dynamically
// described tensors. The block's threads are arranged by ThreadClusterLengths /
// ThreadClusterArrangeOrder; each participating thread stages its ThreadSliceLengths
// sub-slice through a private buffer (p_thread_buffer_) using two threadwise
// transfers (read: src -> buffer, write: buffer -> dst).
template <index_t BlockSize,
          typename BlockSrcData,
          typename BlockDstData,
          typename BlockSrcDesc,
          typename BlockDstDesc,
          typename BlockSliceLengths,
          typename ThreadSliceLengths,
          typename ThreadClusterLengths,
          typename ThreadClusterArrangeOrder,
          typename SrcDimAccessOrder,
          typename DstDimAccessOrder,
          index_t SrcVectoReadDim,
          index_t DstVectorWriteDim,
          index_t SrcDataPerRead,
          index_t DstDataPerWrite,
          AddressSpace SrcAddressSpace,
          AddressSpace DstAddressSpace,
          InMemoryDataOperation DstInMemOp,
          index_t SrcDataStride,
          index_t DstDataStride>
struct BlockwiseDynamicTensorSliceTransfer_v1
{
    static constexpr index_t nDim =
        remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension();

    using Index = MultiIndex<nDim>;

    // NOTE(review): block_src_desc / block_dst_desc are accepted but not yet forwarded
    // to the threadwise transfers (work-in-progress API) — confirm before use.
    __device__ constexpr BlockwiseDynamicTensorSliceTransfer_v1(
        const BlockSrcDesc& block_src_desc,
        const Index& src_block_slice_origin,
        const BlockDstDesc& block_dst_desc,
        const Index& dst_block_slice_origin)
    {
        static_assert(nDim == BlockSrcDesc::GetNumOfDimension() &&
                          nDim == BlockDstDesc::GetNumOfDimension() &&
                          nDim == BlockSliceLengths::Size() &&
                          nDim == ThreadSliceLengths::Size() &&
                          nDim == ThreadClusterLengths::Size() &&
                          nDim == ThreadClusterArrangeOrder::Size() &&
                          nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
                      "wrong! nDim not consistent");

        static_assert(
            is_same<BlockSliceLengths, decltype(ThreadSliceLengths{} * ThreadClusterLengths{})>{},
            "wrong! threads should be mapped to cover entire slicing window");

        static_assert(BlockSize >= thread_cluster_desc_.GetElementSize(),
                      "wrong! BlockSize too small");

        // only threads inside the cluster participate (extra threads stay idle)
        if(BlockSize == thread_cluster_desc_.GetElementSize() or
           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
        {
            const auto thread_cluster_id =
                thread_cluster_desc_.CalculateClusterIndex(get_thread_local_1d_id());

            // this thread's per-dimension offset inside the block window
            const auto thread_data_id_begin = thread_cluster_id * ThreadSliceLengths{};

            threadwise_read_.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
            threadwise_read_.SetDstSliceOrigin(make_zero_multi_index<nDim>());

            threadwise_write_.SetSrcSliceOrigin(make_zero_multi_index<nDim>());
            threadwise_write_.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin);
        }
    }

    // src -> private thread buffer
    __device__ void RunLoad(const BlockSrcData* p_block_src)
    {
        if(BlockSize == thread_cluster_desc_.GetElementSize() or
           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
        {
            threadwise_read_.Run(p_block_src, p_thread_buffer_);
        }
    }

    // private thread buffer -> dst
    __device__ void RunStore(BlockDstData* p_block_dst)
    {
        if(BlockSize == thread_cluster_desc_.GetElementSize() or
           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
        {
            threadwise_write_.Run(p_thread_buffer_, p_block_dst);
        }
    }

    // FIX: the original Run passed the private buffer explicitly
    // (RunLoad(p_block_src, p_thread_buffer_) / RunStore(p_thread_buffer_, p_block_dst)),
    // but RunLoad/RunStore each take a single pointer; it was also marked const while
    // calling non-const members that mutate p_thread_buffer_. Neither would compile.
    __device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst)
    {
        RunLoad(p_block_src);

        // if there is type conversion, it's done during store
        RunStore(p_block_dst);
    }

    __device__ void MoveSrcSliceWindow(const Index& step)
    {
        if(BlockSize == thread_cluster_desc_.GetElementSize() or
           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
        {
            threadwise_read_.MoveSrcSliceWindow(step);
        }
    }

    __device__ void MoveDstSliceWindow(const Index& step)
    {
        if(BlockSize == thread_cluster_desc_.GetElementSize() or
           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
        {
            threadwise_write_.MoveDstSliceWindow(step);
        }
    }

    private:
    // maps the flat thread id to its N-d position in the thread cluster
    static constexpr auto thread_cluster_desc_ =
        make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});

    // descriptor of the per-thread staging buffer (packed ThreadSliceLengths)
    static constexpr auto thread_buffer_desc_ =
        make_dynamic_native_tensor_descriptor_packed<nDim>(to_multi_index(ThreadSliceLengths{}));

    using ThreadwiseRead = ThreadwiseDynamicTensorSliceTransfer_v1<BlockSrcDesc,
                                                                   decltype(thread_buffer_desc_),
                                                                   ThreadSliceLengths,
                                                                   SrcDimAccessOrder,
                                                                   SrcVectoReadDim,
                                                                   SrcDataPerRead,
                                                                   1,
                                                                   SrcAddressSpace,
                                                                   AddressSpace::Vgpr,
                                                                   InMemoryDataOperation::Set,
                                                                   SrcDataStride,
                                                                   1>;

    using ThreadwiseWrite = ThreadwiseDynamicTensorSliceTransfer_v1<decltype(thread_buffer_desc_),
                                                                    BlockDstDesc,
                                                                    ThreadSliceLengths,
                                                                    DstDimAccessOrder,
                                                                    DstVectorWriteDim,
                                                                    1,
                                                                    DstDataPerWrite,
                                                                    AddressSpace::Vgpr,
                                                                    DstAddressSpace,
                                                                    DstInMemOp,
                                                                    1,
                                                                    DstDataStride>;

    // NOTE(review): these are default-constructed, so ThreadwiseDynamicTensorSliceTransfer_v1
    // must be default-constructible; their descriptors are then set only via the
    // Set*SliceOrigin calls in the constructor — confirm the intended initialization path.
    ThreadwiseRead threadwise_read_;
    ThreadwiseWrite threadwise_write_;

    static constexpr index_t thread_buffer_element_size_ = thread_buffer_desc_.GetElementSpace();

    // private staging buffer; holds source-typed data (conversion happens on store)
    BlockSrcData p_thread_buffer_[thread_buffer_element_size_];
};

} // namespace ck
#endif
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
View file @
711701f3
...
@@ -5,6 +5,7 @@
...
@@ -5,6 +5,7 @@
#include "tensor_descriptor.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "tensor_descriptor_helper.hpp"
#include "tensor_coordinate.hpp"
#include "tensor_coordinate.hpp"
#include "cluster_descriptor.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"
namespace
ck
{
namespace
ck
{
...
...
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp
0 → 100644
View file @
711701f3
#ifndef CK_THREADWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_HPP
#define CK_THREADWISE_DYNAMIC_TENSOR_SLICE_TRANSFER_HPP

#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"

namespace ck {

// Copies a SliceLengths window element-by-element from src to dst, both described by
// dynamic tensor descriptors. Traversal is a boustrophedon ("snake") walk: dim1 is
// swept alternately forward/backward for each dim0 step, so coordinates move by +/-1
// each iteration instead of being recomputed from scratch.
// NOTE(review): the body hardcodes a 2-d slice (make_multi_index(0,1) etc.) and uses
// only the SliceLengths / Src-Dst desc parameters; the vector/address-space/stride
// template parameters are currently unused — presumably reserved for a vectorized
// version. Confirm before relying on them.
template <typename SrcData,
          typename DstData,
          typename SrcDesc,
          typename DstDesc,
          typename SliceLengths,
          typename SrcDstDimAccessOrder,
          index_t SrcDstVectorAccessDim,
          index_t SrcScalarPerVector,
          index_t DstScalarPerVector,
          AddressSpace SrcAddressSpace,
          AddressSpace DstAddressSpace,
          InMemoryDataOperation DstInMemOp,
          index_t SrcScalarStrideInVector,
          index_t DstScalarStrideInVector>
__host__ __device__ constexpr void
threadwise_dynamic_tensor_slice_transfer_v1(const SrcDesc& src_desc,
                                            const DynamicTensorCoordinate_t<SrcDesc>& src_origin_coord,
                                            const SrcData* p_src,
                                            const DstDesc& dst_desc,
                                            const DynamicTensorCoordinate_t<DstDesc>& dst_origin_coord,
                                            DstData* p_dst)
{
    // working copies; the caller's origin coordinates are left untouched
    auto src_coord = src_origin_coord;
    auto dst_coord = dst_origin_coord;

    // TODO use constexpr for coordinate-step to make sure compiler behave correctly
    // precomputed +/-1 steps along each of the two dimensions
    const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
    const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1));

    const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
    const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));

    const auto dst_step_0_p1 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1));
    const auto dst_step_0_m1 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1));

    const auto dst_step_p1_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0));
    const auto dst_step_m1_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0));

    constexpr index_t J0 = SliceLengths{}[0];
    constexpr index_t J1 = SliceLengths{}[1];

    // forward_dim0 is never toggled (dim0 only advances forward), so the backward
    // dim0 branch below is currently dead; forward_dim1 alternates per row.
    bool forward_dim0 = true;
    bool forward_dim1 = true;

    // hardcoded for 2d loop for now
#pragma unroll 1
    for(int j0 = 0; j0 < J0; ++j0)
    {
#pragma unroll 1
        for(int j1 = 0; j1 < J1; ++j1)
        {
            // do work
            p_dst[dst_coord.GetOffset()] = p_src[src_coord.GetOffset()];

            // move dim1 iterator (not after the last element of the row)
            if(j1 < J1 - 1)
            {
                if(forward_dim1)
                {
                    move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_p1);
                    move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_p1);
                }
                else
                {
                    move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_m1);
                    move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_m1);
                }
            }
        }

        // switch dim1 iteration direction
        forward_dim1 = !forward_dim1;

        // move dim0 iterator (not after the last row)
        if(j0 < J0 - 1)
        {
            if(forward_dim0)
            {
                move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_p1_0);
                move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_p1_0);
            }
            else
            {
                move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_m1_0);
                move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_m1_0);
            }
        }
    }
}
// Per-thread slice copier: owns the src/dst descriptors and the current slice-origin
// coordinates, and forwards to threadwise_dynamic_tensor_slice_transfer_v1 for the
// actual element transfer.
template <typename SrcDesc,
          typename DstDesc,
          typename SliceLengths,
          typename SrcDstDimAccessOrder,
          index_t SrcDstVectorAccessDim,
          index_t SrcScalarPerVector,
          index_t DstScalarPerVector,
          AddressSpace SrcAddressSpace,
          AddressSpace DstAddressSpace,
          InMemoryDataOperation DstInMemOp,
          index_t SrcScalarStrideInVector,
          index_t DstScalarStrideInVector>
struct ThreadwiseDynamicTensorSliceTransfer_v1
{
    static constexpr index_t nDim = SliceLengths::Size();

    using Index = MultiIndex<nDim>;

    using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{}));
    using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{}));

    using SrcCoordStep = decltype(make_dynamic_tensor_coordinate_step(SrcDesc{}, Index{}));
    using DstCoordStep = decltype(make_dynamic_tensor_coordinate_step(DstDesc{}, Index{}));

    // Default construction is required by BlockwiseDynamicTensorSliceTransfer_v1,
    // which default-constructs its threadwise members and initializes them via
    // Set*SliceOrigin afterwards.
    __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1() = default;

    // FIX: the coordinate members are built from the origin indices via
    // make_dynamic_tensor_coordinate (consistent with Set*SliceOrigin); the old code
    // initialized a coordinate directly from a raw Index.
    __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1(const SrcDesc& src_desc,
                                                                 const Index& src_slice_origin,
                                                                 const DstDesc& dst_desc,
                                                                 const Index& dst_slice_origin)
        : src_desc_(src_desc),
          src_slice_origin_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin)),
          dst_desc_(dst_desc),
          dst_slice_origin_(make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin))
    {
    }

    template <typename SrcData, typename DstData>
    __device__ void Run(const SrcData* p_src, DstData* p_dst) const
    {
        threadwise_dynamic_tensor_slice_transfer_v1<SrcData,
                                                    DstData,
                                                    SrcDesc,
                                                    DstDesc,
                                                    SliceLengths,
                                                    SrcDstDimAccessOrder,
                                                    SrcDstVectorAccessDim,
                                                    SrcScalarPerVector,
                                                    DstScalarPerVector,
                                                    SrcAddressSpace,
                                                    DstAddressSpace,
                                                    DstInMemOp,
                                                    SrcScalarStrideInVector,
                                                    DstScalarStrideInVector>(
            src_desc_, src_slice_origin_, p_src, dst_desc_, dst_slice_origin_, p_dst);
    }

    __device__ void SetSrcSliceOrigin(const Index& src_slice_origin_idx)
    {
        src_slice_origin_ = make_dynamic_tensor_coordinate(src_desc_, src_slice_origin_idx);
    }

    __device__ void SetDstSliceOrigin(const Index& dst_slice_origin_idx)
    {
        dst_slice_origin_ = make_dynamic_tensor_coordinate(dst_desc_, dst_slice_origin_idx);
    }

    // src_slice_origin_step_idx need to be known at compile-time, for performance reason
    __device__ void MoveSrcSliceWindow(const Index& src_slice_origin_step_idx)
    {
        const auto src_slice_origin_step =
            make_dynamic_tensor_coordinate_step(src_desc_, src_slice_origin_step_idx);

        move_dynamic_tensor_coordinate(src_desc_, src_slice_origin_, src_slice_origin_step);
    }

    // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
    __device__ void MoveDstSliceWindow(const Index& dst_slice_origin_step_idx)
    {
        const auto dst_slice_origin_step =
            make_dynamic_tensor_coordinate_step(dst_desc_, dst_slice_origin_step_idx);

        move_dynamic_tensor_coordinate(dst_desc_, dst_slice_origin_, dst_slice_origin_step);
    }

    private:
    // FIX: descriptors are stored by value. The original held them as
    // "const SrcDesc&"/"const DstDesc&" reference members while also declaring a
    // defaulted default constructor — a defaulted default constructor is implicitly
    // deleted for a class with reference members, which broke the default-construction
    // path used by the blockwise transfer; the references would also dangle when bound
    // to caller temporaries.
    SrcDesc src_desc_;
    DstDesc dst_desc_;

    SrcCoord src_slice_origin_;
    DstCoord dst_slice_origin_;
};

} // namespace ck
#endif
driver/include/device_dynamic_col2im_eb_nchw.hpp
deleted
100644 → 0
View file @
176d9fc0
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "dynamic_gridwise_col2im_eb_nchw.hpp"
template
<
typename
T
,
typename
ColDesc
,
typename
ImgDesc
,
typename
FilterSizes
,
typename
OutputSizes
,
typename
ConvStrides
,
typename
ConvDilations
,
typename
LeftPads
,
typename
RightPads
>
void
device_dynamic_col2im_eb_nchw
(
ColDesc
,
const
Tensor
<
T
>&
col_eb
,
ImgDesc
,
Tensor
<
T
>&
img_nchw
,
FilterSizes
,
OutputSizes
,
ConvStrides
,
ConvDilations
,
LeftPads
,
RightPads
,
std
::
size_t
nrepeat
)
{
using
namespace
ck
;
constexpr
auto
col_eb_desc
=
ColDesc
{};
constexpr
auto
img_nchw_desc
=
ImgDesc
{};
constexpr
index_t
N
=
img_nchw_desc
.
GetLengths
()[
0
];
constexpr
index_t
C
=
img_nchw_desc
.
GetLengths
()[
1
];
constexpr
index_t
Hi
=
img_nchw_desc
.
GetLengths
()[
2
];
constexpr
index_t
Wi
=
img_nchw_desc
.
GetLengths
()[
3
];
constexpr
index_t
E
=
col_eb_desc
.
GetLengths
()[
0
];
constexpr
index_t
B
=
col_eb_desc
.
GetLengths
()[
1
];
std
::
size_t
data_sz
=
sizeof
(
T
);
DeviceMem
col_eb_device_buf
(
data_sz
*
col_eb
.
mDesc
.
GetElementSpace
());
DeviceMem
img_nchw_device_buf
(
data_sz
*
img_nchw
.
mDesc
.
GetElementSpace
());
col_eb_device_buf
.
ToDevice
(
col_eb
.
mData
.
data
());
img_nchw_device_buf
.
ToDevice
(
img_nchw
.
mData
.
data
());
#if 1
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
EPerBlock
=
128
;
constexpr
index_t
BPerBlock
=
128
;
using
BlockCopySubLengths_E_B
=
Sequence
<
8
,
8
>
;
using
BlockCopyClusterLengths_E_B
=
Sequence
<
16
,
16
>
;
using
BlockCopyThreadClusterArrangeOrder
=
Sequence
<
0
,
1
>
;
// [E, B]
using
BlockCopySrcAccessOrder
=
Sequence
<
0
,
1
>
;
// [E, B]
using
BlockCopyDstAccessOrder
=
Sequence
<
0
,
1
>
;
// [E, B]
constexpr
index_t
BlockCopyDataPerAccess_B
=
1
;
#endif
constexpr
index_t
GridSize
=
((
E
+
EPerBlock
-
1
)
/
EPerBlock
)
*
((
B
+
BPerBlock
-
1
)
/
BPerBlock
);
printf
(
"%s: BlockSize %u, GridSize %u
\n
"
,
__func__
,
BlockSize
,
GridSize
);
constexpr
auto
gridwise_col2im
=
DynamicGridwiseCol2Im_eb_nchw
<
GridSize
,
BlockSize
,
T
,
ColDesc
,
ImgDesc
,
FilterSizes
,
OutputSizes
,
ConvStrides
,
ConvDilations
,
LeftPads
,
RightPads
,
EPerBlock
,
BPerBlock
,
BlockCopySubLengths_E_B
,
BlockCopyClusterLengths_E_B
,
BlockCopyThreadClusterArrangeOrder
,
BlockCopySrcAccessOrder
,
BlockCopyDstAccessOrder
,
BlockCopyDataPerAccess_B
>
{};
for
(
index_t
i
=
0
;
i
<
1
;
++
i
)
{
std
::
cout
<<
"Start running "
<<
nrepeat
<<
" times..."
<<
std
::
endl
;
KernelTimer
timer
;
timer
.
Start
();
for
(
index_t
j
=
0
;
j
<
nrepeat
;
++
j
)
{
launch_kernel
(
run_gridwise_operation
<
decltype
(
gridwise_col2im
),
const
T
*
const
__restrict__
,
T
*
const
__restrict__
>
,
dim3
(
GridSize
),
dim3
(
BlockSize
),
0
,
0
,
const_cast
<
const
T
*
const
__restrict__
>
(
static_cast
<
T
*>
(
col_eb_device_buf
.
GetDeviceBuffer
())),
const_cast
<
T
*
const
__restrict__
>
(
static_cast
<
T
*>
(
img_nchw_device_buf
.
GetDeviceBuffer
())));
}
timer
.
End
();
float
ave_time
=
timer
.
GetElapsedTime
()
/
nrepeat
;
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms"
<<
std
::
endl
;
}
img_nchw_device_buf
.
FromDevice
(
img_nchw
.
mData
.
data
());
}
driver/include/device_dynamic_col2im_gemmkgemmn_nchw.hpp
0 → 100644
View file @
711701f3
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "dynamic_gridwise_col2im_gemmkgemmn_nchw.hpp"
/// Host-side driver for the dynamic col2im kernel in [GemmK, GemmN] layout.
/// Builds runtime (dynamic) tensor descriptors from the compile-time tag
/// types, maps the NCHW image into column space via map_img_into_col,
/// launches the gridwise kernel nrepeat times, reports the average launch
/// time, and copies the resulting image back into img_n_c_hi_wi.
template <typename T,
          typename ColDesc,
          typename ImgDesc,
          typename FilterSizes,
          typename OutputSizes,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void device_dynamic_col2im_gemmkgemmn_nchw(ColDesc,
                                           const Tensor<T>& col_gemmk_gemmn,
                                           ImgDesc,
                                           Tensor<T>& img_n_c_hi_wi,
                                           FilterSizes,
                                           OutputSizes,
                                           ConvStrides,
                                           ConvDilations,
                                           InLeftPads,
                                           InRightPads,
                                           std::size_t nrepeat)
{
    using namespace ck;

    // Only dimension 1 (GemmN) is queried below; the unused I0/I2/I3
    // constants from the original were dropped.
    constexpr auto I1 = Number<1>{};

    // Allocate device buffers and upload both tensors (the image is
    // uploaded too because col2im accumulates into it).
    std::size_t data_sz = sizeof(T);
    DeviceMem col_gemmk_gemmn_device_buf(data_sz *
                                         col_gemmk_gemmn.mDesc.GetElementSpace());
    DeviceMem img_n_c_hi_wi_device_buf(data_sz *
                                       img_n_c_hi_wi.mDesc.GetElementSpace());

    col_gemmk_gemmn_device_buf.ToDevice(col_gemmk_gemmn.mData.data());
    img_n_c_hi_wi_device_buf.ToDevice(img_n_c_hi_wi.mData.data());

    // Convert the compile-time descriptors into runtime (dynamic) ones.
    const auto col_gemmk_gemmn_desc = make_dynamic_native_tensor_descriptor<2>(
        to_multi_index(ColDesc::GetLengths()), to_multi_index(ColDesc::GetStrides()));

    const auto img_n_c_hi_wi_desc = make_dynamic_native_tensor_descriptor<4>(
        to_multi_index(ImgDesc::GetLengths()), to_multi_index(ImgDesc::GetStrides()));

    const auto filter_sizes   = to_multi_index(FilterSizes{});
    const auto out_sizes      = to_multi_index(OutputSizes{});
    const auto conv_strides   = to_multi_index(ConvStrides{});
    const auto conv_dilations = to_multi_index(ConvDilations{});
    const auto in_left_pads   = to_multi_index(InLeftPads{});
    const auto in_right_pads  = to_multi_index(InRightPads{});

    // View of the NCHW image in [GemmK, GemmN] column space, so both
    // tensors can be traversed with the same indexing.
    const auto img_gemmk_gemmn_desc = map_img_into_col(img_n_c_hi_wi_desc,
                                                       out_sizes,
                                                       filter_sizes,
                                                       conv_strides,
                                                       conv_dilations,
                                                       in_left_pads,
                                                       in_right_pads);

    const index_t GemmN = col_gemmk_gemmn_desc.GetLength(I1);

#if 1
    // Tuning parameters: 256 threads per block, each block covering a
    // 128x128 [GemmK, GemmN] tile as 8x8 sub-tiles on a 16x16 cluster.
    constexpr index_t BlockSize = 256;

    constexpr index_t GemmKPerBlock = 128;
    constexpr index_t GemmNPerBlock = 128;

    using BlockCopySubLengths_GemmK_GemmN     = Sequence<8, 8>;
    using BlockCopyClusterLengths_GemmK_GemmN = Sequence<16, 16>;

    using BlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [GemmK, GemmN]
    using BlockCopySrcAccessOrder            = Sequence<0, 1>; // [GemmK, GemmN]
    using BlockCopyDstAccessOrder            = Sequence<0, 1>; // [GemmK, GemmN]

    constexpr index_t BlockCopyDataPerAccess_GemmN = 1;
#endif

    // NOTE(review): truncating division — assumes GemmN % GemmNPerBlock == 0,
    // otherwise the ragged tail of GemmN gets no block (the eb variant uses
    // ceiling division instead). Confirm the kernel's divisibility
    // requirement before changing this.
    const index_t GridSize = GemmN / GemmNPerBlock;

    // index_t is not guaranteed to be unsigned; cast explicitly so the
    // "%u" conversion specifiers are matched by unsigned arguments.
    printf("%s: BlockSize %u, GridSize %u\n",
           __func__,
           static_cast<unsigned>(BlockSize),
           static_cast<unsigned>(GridSize));

    constexpr auto gridwise_col2im =
        DynamicGridwiseCol2Im_gemmkgemmn_nchw<BlockSize,
                                              GemmKPerBlock,
                                              GemmNPerBlock,
                                              BlockCopySubLengths_GemmK_GemmN,
                                              BlockCopyClusterLengths_GemmK_GemmN,
                                              BlockCopyThreadClusterArrangeOrder,
                                              BlockCopySrcAccessOrder,
                                              BlockCopyDstAccessOrder,
                                              BlockCopyDataPerAccess_GemmN>{};

    std::cout << "Start running " << nrepeat << " times..." << std::endl;

    KernelTimer timer;
    timer.Start();

    // std::size_t matches nrepeat's type (the original compared a signed
    // index_t counter against std::size_t).
    for(std::size_t j = 0; j < nrepeat; ++j)
    {
        launch_kernel(run_gridwise_operation<decltype(gridwise_col2im),
                                             const T* const __restrict__,
                                             T* const __restrict__,
                                             decltype(col_gemmk_gemmn_desc),
                                             decltype(img_gemmk_gemmn_desc)>,
                      dim3(GridSize),
                      dim3(BlockSize),
                      0,
                      0,
                      const_cast<const T* const __restrict__>(
                          static_cast<T*>(col_gemmk_gemmn_device_buf.GetDeviceBuffer())),
                      const_cast<T* const __restrict__>(
                          static_cast<T*>(img_n_c_hi_wi_device_buf.GetDeviceBuffer())),
                      col_gemmk_gemmn_desc,
                      img_gemmk_gemmn_desc);
    }

    timer.End();

    float ave_time = timer.GetElapsedTime() / nrepeat;
    std::cout << "Average time : " << ave_time << " ms" << std::endl;

    // Download the accumulated image for host-side verification.
    img_n_c_hi_wi_device_buf.FromDevice(img_n_c_hi_wi.mData.data());
}
driver/src/col2im_driver.cpp
View file @
711701f3
...
@@ -12,45 +12,30 @@
...
@@ -12,45 +12,30 @@
#include "device_tensor.hpp"
#include "device_tensor.hpp"
#include "host_col2im.hpp"
#include "host_col2im.hpp"
#include "device_col2im_eb_nchw.hpp"
#include "device_col2im_eb_nchw.hpp"
#include "device_dynamic_col2im_
eb
_nchw.hpp"
#include "device_dynamic_col2im_
gemmkgemmn
_nchw.hpp"
int
main
(
int
argc
,
char
*
argv
[])
int
main
(
int
argc
,
char
*
argv
[])
{
{
using
namespace
ck
;
using
namespace
ck
;
#if 1
#if 0
constexpr
index_t
N
=
2
;
// 3x3, 71x71
constexpr
index_t
C
=
8
;
constexpr index_t N = 128;
constexpr
index_t
HI
=
8
;
constexpr index_t C = 192;
constexpr
index_t
WI
=
8
;
constexpr index_t HI = 71;
constexpr
index_t
K
=
128
;
constexpr index_t WI = 71;
constexpr
index_t
Y
=
4
;
constexpr
index_t
X
=
4
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
1
,
1
>
;
using
RightPads
=
Sequence
<
2
,
2
>
;
#elif 0
// 3x3, 34x34
constexpr
index_t
N
=
64
;
constexpr
index_t
C
=
256
;
constexpr
index_t
HI
=
34
;
constexpr
index_t
WI
=
34
;
constexpr index_t K = 128;
constexpr index_t K = 128;
constexpr index_t Y = 3;
constexpr index_t Y = 3;
constexpr index_t X = 3;
constexpr index_t X = 3;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using ConvStrides = Sequence<
2
,
2
>;
using ConvDilations = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using LeftPads = Sequence<
1
,
1
>;
using
RightPads
=
Sequence
<
0
,
0
>
;
using RightPads = Sequence<
1
,
1
>;
#elif
0
#elif
0
// 1x1 filter, 8x8 image
// 1x1, 8x8
// cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
64
;
constexpr
index_t
C
=
1536
;
constexpr
index_t
C
=
1536
;
constexpr
index_t
HI
=
8
;
constexpr
index_t
HI
=
8
;
constexpr
index_t
WI
=
8
;
constexpr
index_t
WI
=
8
;
...
@@ -64,13 +49,12 @@ int main(int argc, char* argv[])
...
@@ -64,13 +49,12 @@ int main(int argc, char* argv[])
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
#elif 0
// 1x1 filter, 8x8 image
// 1x1, 73x73
// cudnn@V100 77%, ck@V100 76%, ck@P100 79%, ck@VII 51%
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
2048
;
constexpr
index_t
C
=
160
;
constexpr
index_t
HI
=
8
;
constexpr
index_t
HI
=
73
;
constexpr
index_t
WI
=
8
;
constexpr
index_t
WI
=
73
;
constexpr
index_t
K
=
38
4
;
constexpr
index_t
K
=
6
4
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
constexpr
index_t
X
=
1
;
...
@@ -80,31 +64,104 @@ int main(int argc, char* argv[])
...
@@ -80,31 +64,104 @@ int main(int argc, char* argv[])
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
#elif 0
// 1x1 filter, 7x7 image
// 3x3, 35x35
// cudnn@V100 82%, ck@V100 76%, ck@P100 67%, ck@VII 64%
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
832
;
constexpr
index_t
C
=
96
;
constexpr
index_t
HI
=
7
;
constexpr
index_t
HI
=
35
;
constexpr
index_t
WI
=
7
;
constexpr
index_t
WI
=
35
;
constexpr
index_t
K
=
384
;
constexpr
index_t
K
=
96
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
1
,
1
>
;
using
RightPads
=
Sequence
<
1
,
1
>
;
#elif 0
// 3x3, 71x71
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
192
;
constexpr
index_t
HI
=
71
;
constexpr
index_t
WI
=
71
;
constexpr
index_t
K
=
192
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
2
,
2
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
1
,
1
>
;
using
RightPads
=
Sequence
<
1
,
1
>
;
#elif 0
// 7x1, 17x17
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
128
;
constexpr
index_t
HI
=
17
;
constexpr
index_t
WI
=
17
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
7
;
constexpr
index_t
X
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
3
,
0
>
;
using
RightPads
=
Sequence
<
3
,
0
>
;
#elif 0
// 1x7, 17x17
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
128
;
constexpr
index_t
HI
=
17
;
constexpr
index_t
WI
=
17
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
7
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
3
>
;
using
RightPads
=
Sequence
<
0
,
3
>
;
#elif 0
// 3x3, 299x299 stride=2
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
3
;
constexpr
index_t
HI
=
299
;
constexpr
index_t
WI
=
299
;
constexpr
index_t
K
=
32
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
2
,
2
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
#elif 0
// 1x1 filter, 8x8 image
// 3x3, 147x147
// cudnn@V100 83%, ck@V100 75%, ck@P100 78%, ck@VII 65%
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
1280
;
constexpr
index_t
C
=
32
;
constexpr
index_t
HI
=
8
;
constexpr
index_t
HI
=
147
;
constexpr
index_t
WI
=
8
;
constexpr
index_t
WI
=
147
;
constexpr
index_t
K
=
384
;
constexpr
index_t
K
=
64
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
1
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
1
,
1
>
;
using
RightPads
=
Sequence
<
1
,
1
>
;
#elif 0
// 3x3, 149x149
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
32
;
constexpr
index_t
HI
=
149
;
constexpr
index_t
WI
=
149
;
constexpr
index_t
K
=
32
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
...
@@ -112,77 +169,147 @@ int main(int argc, char* argv[])
...
@@ -112,77 +169,147 @@ int main(int argc, char* argv[])
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
#elif 0
// 1x1 filter, 14x14 image
// 3x3, 17x17, stride 2
// cudnn@V100 62%, ck@V100 68%, ck@P100 70%, ck@VII 50%
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
512
;
constexpr
index_t
C
=
192
;
constexpr
index_t
HI
=
14
;
constexpr
index_t
HI
=
17
;
constexpr
index_t
WI
=
14
;
constexpr
index_t
WI
=
17
;
constexpr
index_t
K
=
128
;
constexpr
index_t
K
=
192
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
2
,
2
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
// 1x1, 35x35
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
384
;
constexpr
index_t
HI
=
35
;
constexpr
index_t
WI
=
35
;
constexpr
index_t
K
=
96
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 1
// 3x3, 35x35, stride 2
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
256
;
constexpr
index_t
HI
=
35
;
constexpr
index_t
WI
=
35
;
constexpr
index_t
K
=
384
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
2
,
2
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
#elif 0
// 1x1 filter, 8x8 image
// 1x3, 8x8
// cudnn@V100 74%, ck@V100 57%, ck@P100 78%, ck@VII 61%
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
64
;
constexpr
index_t
C
=
384
;
constexpr
index_t
C
=
1536
;
constexpr
index_t
HI
=
8
;
constexpr
index_t
HI
=
8
;
constexpr
index_t
WI
=
8
;
constexpr
index_t
WI
=
8
;
constexpr
index_t
K
=
384
;
constexpr
index_t
K
=
448
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
1
>
;
using
RightPads
=
Sequence
<
0
,
1
>
;
#elif 0
// 3x1, 8x8
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
448
;
constexpr
index_t
HI
=
8
;
constexpr
index_t
WI
=
8
;
constexpr
index_t
K
=
512
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
1
,
0
>
;
using
RightPads
=
Sequence
<
1
,
0
>
;
#elif 0
// 3x3, 147x147
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
64
;
constexpr
index_t
HI
=
147
;
constexpr
index_t
WI
=
147
;
constexpr
index_t
K
=
96
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
2
,
2
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
#elif 0
// 1x1 filter, 28x28 image
// 7x1, 73x73
// cudnn@V100 86%, ck@V100 84%, ck@P100 80%, ck@VII 69%
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
25
6
;
constexpr
index_t
C
=
6
4
;
constexpr
index_t
HI
=
28
;
constexpr
index_t
HI
=
73
;
constexpr
index_t
WI
=
28
;
constexpr
index_t
WI
=
73
;
constexpr
index_t
K
=
128
;
constexpr
index_t
K
=
64
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
Y
=
7
;
constexpr
index_t
X
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
3
,
0
>
;
using
RightPads
=
Sequence
<
3
,
0
>
;
#elif 0
// 3x3, 73x73
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
64
;
constexpr
index_t
HI
=
73
;
constexpr
index_t
WI
=
73
;
constexpr
index_t
K
=
96
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
#elif 0
// 1x1 filter, 7x7 image
// 1x1, 14x14, stride 2
// cudnn@V100 71%, ck@V100 55%, ck@P100 70%, ck@VII 62%
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
832
;
constexpr
index_t
C
=
1024
;
constexpr
index_t
HI
=
7
;
constexpr
index_t
HI
=
14
;
constexpr
index_t
WI
=
7
;
constexpr
index_t
WI
=
14
;
constexpr
index_t
K
=
2
56
;
constexpr
index_t
K
=
2
048
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvStrides
=
Sequence
<
2
,
2
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
#elif 0
// 1x1 filter, 17x17 input
// 1x1, 14x14
// cudnn@V100 81%, ck@V100 76%, ck@P100 70%, ck@VII 76%
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
768
;
constexpr
index_t
C
=
1024
;
constexpr
index_t
HI
=
1
7
;
constexpr
index_t
HI
=
1
4
;
constexpr
index_t
WI
=
1
7
;
constexpr
index_t
WI
=
1
4
;
constexpr
index_t
K
=
128
;
constexpr
index_t
K
=
256
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
constexpr
index_t
X
=
1
;
...
@@ -192,63 +319,104 @@ int main(int argc, char* argv[])
...
@@ -192,63 +319,104 @@ int main(int argc, char* argv[])
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
#elif 0
// 1x1 filter, 14x14 image
// 1x1, 14x14, stride 2
// cudnn@V100 73%, ck@V100 71%, ck@P100 70%, ck@VII 64%
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
528
;
constexpr
index_t
C
=
1024
;
constexpr
index_t
HI
=
14
;
constexpr
index_t
HI
=
14
;
constexpr
index_t
WI
=
14
;
constexpr
index_t
WI
=
14
;
constexpr
index_t
K
=
12
8
;
constexpr
index_t
K
=
5
12
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvStrides
=
Sequence
<
2
,
2
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 1
// 3x3, 28x28
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
192
;
constexpr
index_t
HI
=
28
;
constexpr
index_t
WI
=
28
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
1
,
1
>
;
using
RightPads
=
Sequence
<
1
,
1
>
;
#elif 0
#elif 0
// 1x1 filter, 14x14 image
// 3x3, 14x14
// cudnn@V100 73%, ck@V100 72%, ck@P100 79%, ck@VII 75%
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
528
;
constexpr
index_t
C
=
256
;
constexpr
index_t
HI
=
14
;
constexpr
index_t
HI
=
14
;
constexpr
index_t
WI
=
14
;
constexpr
index_t
WI
=
14
;
constexpr
index_t
K
=
256
;
constexpr
index_t
K
=
256
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
1
,
1
>
;
using
RightPads
=
Sequence
<
1
,
1
>
;
#elif 0
// 1x1, 56x56, stride 2
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
256
;
constexpr
index_t
HI
=
56
;
constexpr
index_t
WI
=
56
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvStrides
=
Sequence
<
2
,
2
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
#elif 0
// 1x1 filter, 7x7 image
// 7x7, 230x230 stride=2
// cudnn@V100 49%, ck@V100 50%, ck@P100 61%, ck@VII 52%
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
832
;
constexpr
index_t
C
=
3
;
constexpr
index_t
HI
=
7
;
constexpr
index_t
HI
=
230
;
constexpr
index_t
WI
=
7
;
constexpr
index_t
WI
=
230
;
constexpr
index_t
K
=
128
;
constexpr
index_t
K
=
64
;
constexpr
index_t
Y
=
7
;
constexpr
index_t
X
=
7
;
using
ConvStrides
=
Sequence
<
2
,
2
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
// 1x1, 28x28, stride = 2
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
512
;
constexpr
index_t
HI
=
28
;
constexpr
index_t
WI
=
28
;
constexpr
index_t
K
=
1024
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvStrides
=
Sequence
<
2
,
2
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
#elif 0
// 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
// 1x1, 28x28, stride 2
// cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
288
;
constexpr
index_t
C
=
512
;
constexpr
index_t
HI
=
35
;
constexpr
index_t
HI
=
28
;
constexpr
index_t
WI
=
35
;
constexpr
index_t
WI
=
28
;
constexpr
index_t
K
=
384
;
constexpr
index_t
K
=
256
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
3
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
2
,
2
>
;
using
ConvStrides
=
Sequence
<
2
,
2
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
...
@@ -256,50 +424,65 @@ int main(int argc, char* argv[])
...
@@ -256,50 +424,65 @@ int main(int argc, char* argv[])
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
#elif 0
//
5x5 filter, 2x2 pad, 7x7 input
//
1x1, 7x7
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
48
;
constexpr
index_t
C
=
512
;
constexpr
index_t
HI
=
7
;
constexpr
index_t
HI
=
7
;
constexpr
index_t
WI
=
7
;
constexpr
index_t
WI
=
7
;
constexpr
index_t
K
=
1
28
;
constexpr
index_t
K
=
2
04
8
;
constexpr
index_t
Y
=
5
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
5
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
2
,
2
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
2
,
2
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
#elif 0
//
7x1 filter, 3x0 pad
,
1
7x
17 input
//
3x3
, 7x
7
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
12
8
;
constexpr
index_t
C
=
5
12
;
constexpr
index_t
HI
=
1
7
;
constexpr
index_t
HI
=
7
;
constexpr
index_t
WI
=
1
7
;
constexpr
index_t
WI
=
7
;
constexpr
index_t
K
=
12
8
;
constexpr
index_t
K
=
5
12
;
constexpr
index_t
Y
=
7
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
1
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
3
,
0
>
;
using
LeftPads
=
Sequence
<
1
,
1
>
;
using
RightPads
=
Sequence
<
3
,
0
>
;
using
RightPads
=
Sequence
<
1
,
1
>
;
#elif
1
#elif
0
// 1x
7 filter, 0x3 pad, 17x17 input
// 1x
1, 56x56
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
128
;
constexpr
index_t
C
=
64
;
constexpr
index_t
HI
=
17
;
constexpr
index_t
HI
=
56
;
constexpr
index_t
WI
=
17
;
constexpr
index_t
WI
=
56
;
constexpr
index_t
K
=
128
;
constexpr
index_t
K
=
64
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
7
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
3
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
3
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
// 3x3, 56x56
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
64
;
constexpr
index_t
HI
=
56
;
constexpr
index_t
WI
=
56
;
constexpr
index_t
K
=
64
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
1
,
1
>
;
using
RightPads
=
Sequence
<
1
,
1
>
;
#endif
#endif
constexpr
auto
img_nchw_desc
=
make_native_tensor_descriptor_packed
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
constexpr
auto
img_nchw_desc
=
make_native_tensor_descriptor_packed
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
...
@@ -362,8 +545,8 @@ int main(int argc, char* argv[])
...
@@ -362,8 +545,8 @@ int main(int argc, char* argv[])
LeftPads{},
LeftPads{},
RightPads{},
RightPads{},
nrepeat);
nrepeat);
#el
se
#el
if
0
device_dynamic_col2im_
eb
_nchw
(
col_eb_desc
,
device_dynamic_col2im_
gemmkgemmn
_nchw
(
col_eb_desc
,
col_eb
,
col_eb
,
img_nchw_desc
,
img_nchw_desc
,
img_nchw_device
,
img_nchw_device
,
...
@@ -376,6 +559,34 @@ int main(int argc, char* argv[])
...
@@ -376,6 +559,34 @@ int main(int argc, char* argv[])
nrepeat
);
nrepeat
);
#endif
#endif
#if 0
constexpr auto lengths = to_multi_index(Sequence<1,2>{});
constexpr auto thread_buffer_desc_ =
make_dynamic_native_tensor_descriptor_packed<2>(lengths);
#elif
1
constexpr
auto
lengths
=
to_multi_index
(
Sequence
<
1
,
2
>
{});
constexpr
index_t
NDim
=
2
;
constexpr
auto
transforms
=
make_tuple
(
DynamicUnMerge
<
NDim
>
{
lengths
});
constexpr
auto
low_dim_hidden_idss
=
make_tuple
(
Sequence
<
0
>
{});
constexpr
auto
up_dim_hidden_idss
=
make_tuple
(
typename
arithmetic_sequence_gen
<
1
,
NDim
+
1
,
1
>::
type
{});
constexpr
auto
visible_dim_hidden_ids
=
typename
arithmetic_sequence_gen
<
1
,
NDim
+
1
,
1
>::
type
{};
constexpr
index_t
element_space_size
=
container_reduce
(
lengths
,
math
::
multiplies
<
index_t
>
{},
index_t
{
1
});
constexpr
auto
desc
=
DynamicTensorDescriptor
<
decltype
(
transforms
),
decltype
(
low_dim_hidden_idss
),
decltype
(
up_dim_hidden_idss
),
decltype
(
visible_dim_hidden_ids
)
>
{};
//{transforms, element_space_size};
#endif
if
(
do_verification
)
if
(
do_verification
)
{
{
host_col2im
(
col_eb
,
host_col2im
(
col_eb
,
...
...
driver/src/conv_driver.cpp
View file @
711701f3
...
@@ -549,7 +549,7 @@ int main(int argc, char* argv[])
...
@@ -549,7 +549,7 @@ int main(int argc, char* argv[])
#endif
#endif
}
}
#if
1
#if
0
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
in_nchw,
wei_kcyx_desc,
wei_kcyx_desc,
...
@@ -561,7 +561,7 @@ int main(int argc, char* argv[])
...
@@ -561,7 +561,7 @@ int main(int argc, char* argv[])
LeftPads{},
LeftPads{},
RightPads{},
RightPads{},
nrepeat);
nrepeat);
#elif
1
#elif
0
device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw
(
in_nchw_desc
,
device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw
(
in_nchw_desc
,
in_nchw
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx_desc
,
...
@@ -598,7 +598,7 @@ int main(int argc, char* argv[])
...
@@ -598,7 +598,7 @@ int main(int argc, char* argv[])
RightPads
{},
RightPads
{},
nrepeat
);
nrepeat
);
#elif 1
#elif 1
device_dummy_dynamic_transform
_v2
(
in_nchw_desc
,
device_dummy_dynamic_transform
(
in_nchw_desc
,
in_nchw
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx_desc
,
wei_kcyx
,
wei_kcyx
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment