Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
6ed96265
Commit
6ed96265
authored
Nov 17, 2019
by
Chao Liu
Browse files
added col2im
parent
ad3ac5cc
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
379 additions
and
103 deletions
+379
-103
composable_kernel/include/gridwise_operation_wrapper.hpp
composable_kernel/include/gridwise_operation_wrapper.hpp
+10
-0
composable_kernel/include/kernel_algorithm/gridwise_col2im_eb_nchw.hpp
...rnel/include/kernel_algorithm/gridwise_col2im_eb_nchw.hpp
+126
-0
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
.../tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+10
-0
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
...tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+22
-13
driver/include/device_col2im_eb_nchw.hpp
driver/include/device_col2im_eb_nchw.hpp
+108
-0
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
.../device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+12
-5
driver/include/host_col2im.hpp
driver/include/host_col2im.hpp
+2
-2
driver/include/tensor_generator.hpp
driver/include/tensor_generator.hpp
+3
-1
driver/src/col2im_driver.cpp
driver/src/col2im_driver.cpp
+39
-38
driver/src/col2im_driver.cu
driver/src/col2im_driver.cu
+43
-40
driver/src/conv_driver.cpp
driver/src/conv_driver.cpp
+4
-4
No files found.
composable_kernel/include/gridwise_operation_wrapper.hpp
0 → 100644
View file @
6ed96265
#ifndef CK_GRIDWISE_OPERATION_KERNEL_WRAPPER
#define CK_GRIDWISE_OPERATION_KERNEL_WRAPPER
template
<
typename
GridwiseOp
,
typename
...
Xs
>
__global__
void
run_gridwise_operation
(
GridwiseOp
&
gridwise_op
,
Xs
...
xs
)
{
gridwise_op
.
Run
(
xs
...);
}
#endif
composable_kernel/include/kernel_algorithm/gridwise_col2im_eb_nchw.hpp
0 → 100644
View file @
6ed96265
#ifndef CK_GRIDWISE_COL2IM_EB_NCHW_HPP
#define CK_GRIDWISE_COL2IM_EB_NCHW_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"
namespace
ck
{
// B = merge(N, Ho, Wo)
template
<
index_t
GridSize
,
index_t
BlockSize
,
typename
Float
,
typename
ColGlobalDesc
,
typename
ImgGlobalDesc
,
typename
FilterSizes
,
typename
OutputSizes
,
typename
ConvStrides
,
typename
ConvDilations
,
typename
LeftPads
,
typename
RightPads
,
index_t
EPerBlock
,
index_t
BPerBlock
,
typename
BlockCopySubLengths_E_B
,
typename
BlockCopyClusterLengths_E_B
,
typename
BlockCopyThreadClusterArrangeOrder
,
typename
BlockCopySrcAccessOrder
,
typename
BlockCopyDstAccessOrder
,
index_t
BlockCopyDataPerAccess_B
>
struct
GridwiseCol2Im_eb_nchw
{
__device__
void
Run
(
const
Float
*
const
__restrict__
p_col_global
,
Float
*
const
__restrict__
p_img_global
)
const
{
constexpr
auto
col_e_b_global_desc
=
ColGlobalDesc
{};
constexpr
auto
img_n_c_hi_wi_global_desc
=
ImgGlobalDesc
{};
constexpr
index_t
N
=
img_n_c_hi_wi_global_desc
.
GetLengths
()[
0
];
constexpr
index_t
C
=
img_n_c_hi_wi_global_desc
.
GetLengths
()[
1
];
constexpr
index_t
Hi
=
img_n_c_hi_wi_global_desc
.
GetLengths
()[
2
];
constexpr
index_t
Wi
=
img_n_c_hi_wi_global_desc
.
GetLengths
()[
3
];
constexpr
index_t
Ho
=
OutputSizes
{}[
0
];
constexpr
index_t
Wo
=
OutputSizes
{}[
1
];
constexpr
index_t
Y
=
FilterSizes
{}[
0
];
constexpr
index_t
X
=
FilterSizes
{}[
1
];
constexpr
index_t
ConvStrideH
=
ConvStrides
{}[
0
];
constexpr
index_t
ConvStrideW
=
ConvStrides
{}[
1
];
constexpr
index_t
ConvDilationH
=
ConvDilations
{}[
0
];
constexpr
index_t
ConvDilationW
=
ConvDilations
{}[
1
];
constexpr
index_t
E
=
C
*
Y
*
X
;
constexpr
index_t
B
=
N
*
Ho
*
Wo
;
// sanity-check for vectorized memory load
static_assert
((
Wo
==
1
||
(
ConvStrideW
==
1
||
BlockCopyDataPerAccess_B
==
1
))
&&
(
X
==
1
||
ConvDilationW
%
BlockCopyDataPerAccess_B
==
0
),
"wrong! aligment requirement for vectorized global load of input tensor will "
"be violated"
);
// divide block work by [E, B]
static_assert
(
E
%
EPerBlock
==
0
&&
B
%
BPerBlock
==
0
,
"wrong! cannot divide work evenly among block"
);
constexpr
index_t
EBlockWork
=
E
/
EPerBlock
;
constexpr
index_t
BBlockWork
=
B
/
BPerBlock
;
constexpr
auto
block_work_desc
=
make_cluster_descriptor
(
Sequence
<
EBlockWork
,
BBlockWork
>
{});
const
auto
block_work_id
=
block_work_desc
.
CalculateClusterIndex
(
get_block_1d_id
());
const
index_t
e_block_data_on_global
=
block_work_id
[
0
]
*
EPerBlock
;
const
index_t
b_block_data_on_global
=
block_work_id
[
1
]
*
BPerBlock
;
// construct img_eb_global_desc
constexpr
auto
img_n_c_hip_wip_global_desc
=
transform_tensor_descriptor
(
img_n_c_hi_wi_global_desc
,
make_tuple
(
PassThrough
<
N
>
{},
PassThrough
<
C
>
{},
Pad
<
Sequence
<
Hi
,
Wi
>
,
LeftPads
,
RightPads
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
,
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
,
3
>
{}));
constexpr
auto
img_n_c_y_ho_x_wo_global_desc
=
transform_tensor_descriptor
(
img_n_c_hip_wip_global_desc
,
make_tuple
(
PassThrough
<
N
>
{},
PassThrough
<
C
>
{},
Embed
<
Sequence
<
Y
,
Ho
>
,
Sequence
<
ConvDilationH
,
ConvStrideH
,
0
>>
{},
Embed
<
Sequence
<
X
,
Wo
>
,
Sequence
<
ConvDilationW
,
ConvStrideW
,
0
>>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
,
3
>
{},
Sequence
<
4
,
5
>
{}));
constexpr
auto
img_e_b_global_desc
=
transform_tensor_descriptor
(
img_n_c_y_ho_x_wo_global_desc
,
make_tuple
(
Merge
<
Sequence
<
C
,
Y
,
X
>>
{},
Merge
<
Sequence
<
N
,
Ho
,
Wo
>>
{}),
make_tuple
(
Sequence
<
1
,
2
,
4
>
{},
Sequence
<
0
,
3
,
5
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
// blockwise atomic accumulation
auto
blockwise_copy
=
BlockwiseGenericTensorSliceCopy_v4
<
BlockSize
,
decltype
(
col_e_b_global_desc
),
decltype
(
img_e_b_global_desc
),
Sequence
<
EPerBlock
,
BPerBlock
>
,
BlockCopySubLengths_E_B
,
BlockCopyClusterLengths_E_B
,
BlockCopyThreadClusterArrangeOrder
,
BlockCopySrcAccessOrder
,
BlockCopyDstAccessOrder
,
1
,
1
,
BlockCopyDataPerAccess_B
,
BlockCopyDataPerAccess_B
>
(
{
e_block_data_on_global
,
b_block_data_on_global
},
{
e_block_data_on_global
,
b_block_data_on_global
});
// blockwise copy
blockwise_copy
.
Run
(
p_col_global
,
p_img_global
);
}
};
}
// namespace ck
#endif
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
View file @
6ed96265
...
...
@@ -134,8 +134,18 @@ struct BlockwiseGenericTensorSliceCopy_v4
}
else
{
#if 0 // debug
mThreadwiseStore.Run(
p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space);
#else
constexpr
auto
True
=
integral_constant
<
bool
,
true
>
{};
mThreadwiseStore
.
Run
(
p_thread_buffer
,
p_block_dst
,
thread_buffer_address_space
,
block_dst_address_space
,
True
);
#endif
}
}
...
...
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
View file @
6ed96265
...
...
@@ -69,11 +69,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
template
<
typename
SrcData
,
typename
DstData
,
AddressSpace
SrcAddressSpace
,
AddressSpace
DstAddressSpace
>
AddressSpace
DstAddressSpace
,
bool
DoAtomicAdd
=
false
>
__device__
void
Run
(
const
SrcData
*
p_src
,
DstData
*
p_dst
,
integral_constant
<
AddressSpace
,
SrcAddressSpace
>
,
integral_constant
<
AddressSpace
,
DstAddressSpace
>
)
const
integral_constant
<
AddressSpace
,
DstAddressSpace
>
,
integral_constant
<
bool
,
DoAtomicAdd
>
do_atomic_add
=
integral_constant
<
bool
,
DoAtomicAdd
>
{})
const
{
using
src_vector_t
=
typename
vector_type
<
SrcData
,
SrcDataPerAccess
>::
MemoryType
;
using
dst_vector_t
=
typename
vector_type
<
DstData
,
DstDataPerAccess
>::
MemoryType
;
...
...
@@ -160,21 +163,27 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// has the same padding situation
if
(
dst_coord
.
IsUpperIndexMappedToValidOffset
())
{
static_if
<
DstAddressSpace
==
AddressSpace
::
global
>
{}([
&
](
auto
fwd
)
{
static_if
<!
DoAtomicAdd
>
{}([
&
](
auto
)
{
static_if
<
DstAddressSpace
==
AddressSpace
::
global
>
{}([
&
](
auto
fwd
)
{
#if CK_USE_AMD_BUFFER_ADDRESSING
amd_intrinsic_buffer_store
<
DstData
,
DstDataPerAccess
>
(
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_dst_long_vector
[
buffer_offset
]),
fwd
(
p_dst
),
dst_coord
.
GetOffset
(),
0
);
amd_intrinsic_buffer_store
<
DstData
,
DstDataPerAccess
>
(
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_dst_long_vector
[
buffer_offset
]),
fwd
(
p_dst
),
dst_coord
.
GetOffset
(),
0
);
#else
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_dst
[
dst_coord
.
GetOffset
()])
=
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_dst_long_vector
[
buffer_offset
]);
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_dst
[
dst_coord
.
GetOffset
()])
=
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_dst_long_vector
[
buffer_offset
]);
#endif
}).
Else
([
&
](
auto
)
{
// dst can be all kinds of memory-space
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_dst
[
dst_coord
.
GetOffset
()])
=
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_dst_long_vector
[
buffer_offset
]);
});
}).
Else
([
&
](
auto
)
{
// dst can be all kinds of memory-space
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_dst
[
dst_coord
.
GetOffset
()])
=
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_dst_long_vector
[
buffer_offset
]);
atomicAdd
(
reinterpret_cast
<
dst_vector_t
*>
(
&
p_dst
[
dst_coord
.
GetOffset
()])
,
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_dst_long_vector
[
buffer_offset
])
)
;
});
}
}
...
...
driver/include/device_col2im_eb_nchw.hpp
0 → 100644
View file @
6ed96265
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_col2im_eb_nchw.hpp"
template
<
typename
T
,
typename
ColDesc
,
typename
ImgDesc
,
typename
FilterSizes
,
typename
OutputSizes
,
typename
ConvStrides
,
typename
ConvDilations
,
typename
LeftPads
,
typename
RightPads
>
void
device_col2im_eb_nchw
(
ColDesc
,
const
Tensor
<
T
>&
col_eb
,
ImgDesc
,
Tensor
<
T
>&
img_nchw
,
FilterSizes
,
OutputSizes
,
ConvStrides
,
ConvDilations
,
LeftPads
,
RightPads
,
std
::
size_t
nrepeat
)
{
using
namespace
ck
;
constexpr
auto
col_eb_desc
=
ColDesc
{};
constexpr
auto
img_nchw_desc
=
ImgDesc
{};
constexpr
index_t
N
=
img_nchw_desc
.
GetLengths
()[
0
];
constexpr
index_t
C
=
img_nchw_desc
.
GetLengths
()[
1
];
constexpr
index_t
Hi
=
img_nchw_desc
.
GetLengths
()[
2
];
constexpr
index_t
Wi
=
img_nchw_desc
.
GetLengths
()[
3
];
constexpr
index_t
E
=
col_eb_desc
.
GetLengths
()[
0
];
constexpr
index_t
B
=
col_eb_desc
.
GetLengths
()[
1
];
std
::
size_t
data_sz
=
sizeof
(
T
);
DeviceMem
col_eb_device_buf
(
data_sz
*
col_eb
.
mDesc
.
GetElementSpace
());
DeviceMem
img_nchw_device_buf
(
data_sz
*
img_nchw
.
mDesc
.
GetElementSpace
());
col_eb_device_buf
.
ToDevice
(
col_eb
.
mData
.
data
());
img_nchw_device_buf
.
ToDevice
(
img_nchw
.
mData
.
data
());
#if 1
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
EPerBlock
=
128
;
constexpr
index_t
BPerBlock
=
128
;
using
BlockCopySubLengths_E_B
=
Sequence
<
8
,
8
>
;
using
BlockCopyClusterLengths_E_B
=
Sequence
<
16
,
16
>
;
using
BlockCopyThreadClusterArrangeOrder
=
Sequence
<
0
,
1
>
;
// [E, B]
using
BlockCopySrcAccessOrder
=
Sequence
<
0
,
1
>
;
// [E, B]
using
BlockCopyDstAccessOrder
=
Sequence
<
0
,
1
>
;
// [E, B]
constexpr
index_t
BlockCopyDataPerAccess_B
=
1
;
#endif
constexpr
index_t
GridSize
=
((
E
+
EPerBlock
-
1
)
/
EPerBlock
)
*
((
B
+
BPerBlock
-
1
)
/
BPerBlock
);
printf
(
"%s: BlockSize %u, GridSize %u
\n
"
,
__func__
,
BlockSize
,
GridSize
);
constexpr
auto
gridwise_col2im
=
GridwiseCol2Im_eb_nchw
<
GridSize
,
BlockSize
,
T
,
ColDesc
,
ImgDesc
,
FilterSizes
,
OutputSizes
,
ConvStrides
,
ConvDilations
,
LeftPads
,
RightPads
,
EPerBlock
,
BPerBlock
,
BlockCopySubLengths_E_B
,
BlockCopyClusterLengths_E_B
,
BlockCopyThreadClusterArrangeOrder
,
BlockCopySrcAccessOrder
,
BlockCopyDstAccessOrder
,
BlockCopyDataPerAccess_B
>
{};
for
(
index_t
i
=
0
;
i
<
nrepeat
;
++
i
)
{
float
time
=
launch_kernel
(
run_gridwise_operation
<
decltype
(
gridwise_col2im
),
const
T
*
const
__restrict__
,
T
*
const
__restrict__
>
,
dim3
(
GridSize
),
dim3
(
BlockSize
),
0
,
gridwise_col2im
,
const_cast
<
const
T
*
const
__restrict__
>
(
static_cast
<
T
*>
(
col_eb_device_buf
.
GetDeviceBuffer
())),
const_cast
<
T
*
const
__restrict__
>
(
static_cast
<
T
*>
(
img_nchw_device_buf
.
GetDeviceBuffer
())));
printf
(
"Elapsed time : %f ms
\n
"
,
time
);
usleep
(
std
::
min
(
time
*
1000
,
float
(
10000
)));
}
img_nchw_device_buf
.
FromDevice
(
img_nchw
.
mData
.
data
());
}
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
View file @
6ed96265
...
...
@@ -2,7 +2,7 @@
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_
convolution_kernel
_wrapper.hpp"
#include "gridwise_
operation
_wrapper.hpp"
#include "convolution_common.hpp"
#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp"
...
...
@@ -221,13 +221,20 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
for
(
index_t
i
=
0
;
i
<
nrepeat
;
++
i
)
{
float
time
=
launch_kernel
(
run_gridwise_convolution_kernel
<
decltype
(
gridwise_conv
),
T
>
,
float
time
=
launch_kernel
(
run_gridwise_operation
<
decltype
(
gridwise_conv
),
const
T
*
const
__restrict__
,
const
T
*
const
__restrict__
,
T
*
const
__restrict__
>
,
dim3
(
GridSize
),
dim3
(
BlockSize
),
0
,
static_cast
<
T
*>
(
in_nchw_device_buf
.
GetDeviceBuffer
()),
static_cast
<
T
*>
(
wei_kcyx_device_buf
.
GetDeviceBuffer
()),
static_cast
<
T
*>
(
out_nkhw_device_buf
.
GetDeviceBuffer
()));
gridwise_conv
,
const_cast
<
const
T
*
const
__restrict__
>
(
static_cast
<
T
*>
(
in_nchw_device_buf
.
GetDeviceBuffer
())),
const_cast
<
const
T
*
const
__restrict__
>
(
static_cast
<
T
*>
(
wei_kcyx_device_buf
.
GetDeviceBuffer
())),
const_cast
<
T
*
const
__restrict__
>
(
static_cast
<
T
*>
(
out_nkhw_device_buf
.
GetDeviceBuffer
())));
printf
(
"Elapsed time : %f ms, %f TFlop/s
\n
"
,
time
,
...
...
driver/include/host_col2im.hpp
View file @
6ed96265
...
...
@@ -37,7 +37,7 @@ void host_col2im(const Tensor<T>& in_eb,
{
int
h_tmp
=
hi
+
LeftPads
{}[
0
]
-
y
*
ConvDilations
{}[
0
];
if
(
h_tmp
%
ConvStrides
{}[
0
]
==
0
)
if
(
h_tmp
>=
0
&&
h_tmp
<
HI
&&
h_tmp
%
ConvStrides
{}[
0
]
==
0
)
{
int
ho
=
h_tmp
/
ConvStrides
{}[
0
];
...
...
@@ -45,7 +45,7 @@ void host_col2im(const Tensor<T>& in_eb,
{
int
w_tmp
=
wi
+
LeftPads
{}[
1
]
-
x
*
ConvDilations
{}[
1
];
if
(
w_tmp
%
ConvStrides
{}[
1
]
==
0
)
if
(
w_tmp
>=
0
&&
w_tmp
<
WI
&&
w_tmp
%
ConvStrides
{}[
1
]
==
0
)
{
int
wo
=
w_tmp
/
ConvStrides
{}[
1
];
...
...
driver/include/tensor_generator.hpp
View file @
6ed96265
...
...
@@ -5,10 +5,12 @@
struct
GeneratorTensor_1
{
int
value
=
1
;
template
<
class
...
Is
>
double
operator
()(
Is
...
is
)
{
return
1
;
return
value
;
}
};
...
...
driver/src/col2im_driver.cpp
View file @
6ed96265
...
...
@@ -13,26 +13,26 @@
#include "device_tensor.hpp"
#include "conv_common.hpp"
#include "host_col2im.hpp"
//
#include "device_col2im.hpp"
#include "device_col2im
_eb_nchw
.hpp"
int
main
(
int
argc
,
char
*
argv
[])
{
using
namespace
ck
;
#if 1
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
1
;
constexpr
index_t
HI
=
17
;
constexpr
index_t
WI
=
17
;
constexpr
index_t
K
=
1
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
constexpr
index_t
N
=
2
;
constexpr
index_t
C
=
8
;
constexpr
index_t
HI
=
8
;
constexpr
index_t
WI
=
8
;
constexpr
index_t
K
=
1
28
;
constexpr
index_t
Y
=
4
;
constexpr
index_t
X
=
4
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
1
,
1
>
;
using
RightPads
=
Sequence
<
1
,
1
>
;
using
RightPads
=
Sequence
<
2
,
2
>
;
#elif 0
// 3x3, 34x34
constexpr
index_t
N
=
64
;
...
...
@@ -303,21 +303,22 @@ int main(int argc, char* argv[])
using
RightPads
=
Sequence
<
0
,
3
>
;
#endif
constexpr
auto
i
n
_nchw_desc
=
make_native_tensor_descriptor_packed
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
constexpr
auto
i
mg
_nchw_desc
=
make_native_tensor_descriptor_packed
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
constexpr
auto
wei_kcyx_desc
=
make_native_tensor_descriptor_packed
(
Sequence
<
K
,
C
,
Y
,
X
>
{});
constexpr
auto
out_nkhw_desc
=
get_convolution_output_default_4d_tensor_descriptor
(
i
n
_nchw_desc
,
wei_kcyx_desc
,
ConvStrides
{},
ConvDilations
{},
LeftPads
{},
RightPads
{});
i
mg
_nchw_desc
,
wei_kcyx_desc
,
ConvStrides
{},
ConvDilations
{},
LeftPads
{},
RightPads
{});
constexpr
index_t
HO
=
out_nkhw_desc
.
GetLengths
()[
2
];
constexpr
index_t
WO
=
out_nkhw_desc
.
GetLengths
()[
3
];
auto
in_eb_desc
=
make_native_tensor_descriptor_packed
(
Sequence
<
C
*
Y
*
X
,
N
*
HO
*
WO
>
{});
constexpr
auto
col_eb_desc
=
make_native_tensor_descriptor_packed
(
Sequence
<
C
*
Y
*
X
,
N
*
HO
*
WO
>
{});
using
FilterSizes
=
Sequence
<
Y
,
X
>
;
using
OutputSizes
=
Sequence
<
HO
,
WO
>
;
ostream_ConstantTensorDescriptor
(
in_nchw
_desc
,
std
::
cout
<<
"
in_nchw
_desc: "
);
ostream_ConstantTensorDescriptor
(
i
n_eb
_desc
,
std
::
cout
<<
"i
n_eb
_desc: "
);
ostream_ConstantTensorDescriptor
(
col_eb
_desc
,
std
::
cout
<<
"
col_eb
_desc: "
);
ostream_ConstantTensorDescriptor
(
i
mg_nchw
_desc
,
std
::
cout
<<
"i
mg_nchw
_desc: "
);
print_sequence
(
"FilterSizes"
,
FilterSizes
{});
print_sequence
(
"OutputSizes"
,
OutputSizes
{});
print_sequence
(
"LeftPads"
,
LeftPads
{});
...
...
@@ -326,9 +327,9 @@ int main(int argc, char* argv[])
print_sequence
(
"ConvStrides"
,
ConvStrides
{});
print_sequence
(
"ConvDilations"
,
ConvDilations
{});
Tensor
<
float
>
in
_eb
(
make_TensorDescriptor
(
in
_eb_desc
));
Tensor
<
float
>
i
n
_nchw_host
(
make_TensorDescriptor
(
i
n
_nchw_desc
));
Tensor
<
float
>
i
n
_nchw_device
(
make_TensorDescriptor
(
i
n
_nchw_desc
));
Tensor
<
float
>
col
_eb
(
make_TensorDescriptor
(
col
_eb_desc
));
Tensor
<
float
>
i
mg
_nchw_host
(
make_TensorDescriptor
(
i
mg
_nchw_desc
));
Tensor
<
float
>
i
mg
_nchw_device
(
make_TensorDescriptor
(
i
mg
_nchw_desc
));
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
();
...
...
@@ -339,35 +340,35 @@ int main(int argc, char* argv[])
}
bool
do_verification
=
atoi
(
argv
[
1
]);
index
_t
nrepeat
=
atoi
(
argv
[
2
]);
std
::
size
_t
nrepeat
=
atoi
(
argv
[
2
]);
if
(
do_verification
)
{
#if 1
in
_eb
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
col
_eb
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
#else
in
_eb
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
col
_eb
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
#endif
}
#if
0
device_col2im
(in
_eb_desc,
in
_eb,
in
_nchw_desc,
in
_nchw_device,
FilterSizes{},
OutputSizes{},
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
#if
1
device_col2im
_eb_nchw
(
col
_eb_desc
,
col
_eb
,
img
_nchw_desc
,
img
_nchw_device
,
FilterSizes
{},
OutputSizes
{},
ConvStrides
{},
ConvDilations
{},
LeftPads
{},
RightPads
{},
nrepeat
);
#endif
if
(
do_verification
)
{
host_col2im
(
in
_eb
,
i
n
_nchw_host
,
host_col2im
(
col
_eb
,
i
mg
_nchw_host
,
FilterSizes
{},
OutputSizes
{},
ConvStrides
{},
...
...
@@ -375,12 +376,12 @@ int main(int argc, char* argv[])
LeftPads
{},
RightPads
{});
check_error
(
i
n
_nchw_host
,
i
n
_nchw_device
);
check_error
(
i
mg
_nchw_host
,
i
mg
_nchw_device
);
#if 1
LogRange
(
std
::
cout
<<
"
in
_eb : "
,
in
_eb
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
<<
"i
n
_nchw_host : "
,
i
n
_nchw_host
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
<<
"i
n
_nchw_device : "
,
i
n
_nchw_device
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
<<
"
col
_eb : "
,
col
_eb
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
<<
"i
mg
_nchw_host : "
,
i
mg
_nchw_host
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
<<
"i
mg
_nchw_device : "
,
i
mg
_nchw_device
.
mData
,
","
)
<<
std
::
endl
;
#endif
}
}
driver/src/col2im_driver.cu
View file @
6ed96265
...
...
@@ -13,26 +13,26 @@
#include "device_tensor.hpp"
#include "conv_common.hpp"
#include "host_col2im.hpp"
//
#include "device_col2im.hpp"
#include "device_col2im
_eb_nchw
.hpp"
int
main
(
int
argc
,
char
*
argv
[])
{
using
namespace
ck
;
#if 1
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
1
;
constexpr
index_t
HI
=
17
;
constexpr
index_t
WI
=
17
;
constexpr
index_t
K
=
1
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
constexpr
index_t
N
=
2
;
constexpr
index_t
C
=
8
;
constexpr
index_t
HI
=
8
;
constexpr
index_t
WI
=
8
;
constexpr
index_t
K
=
1
28
;
constexpr
index_t
Y
=
4
;
constexpr
index_t
X
=
4
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
1
,
1
>
;
using
RightPads
=
Sequence
<
1
,
1
>
;
using
RightPads
=
Sequence
<
2
,
2
>
;
#elif 0
// 3x3, 34x34
constexpr
index_t
N
=
64
;
...
...
@@ -303,21 +303,22 @@ int main(int argc, char* argv[])
using
RightPads
=
Sequence
<
0
,
3
>
;
#endif
constexpr
auto
i
n
_nchw_desc
=
make_native_tensor_descriptor_packed
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
constexpr
auto
i
mg
_nchw_desc
=
make_native_tensor_descriptor_packed
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
constexpr
auto
wei_kcyx_desc
=
make_native_tensor_descriptor_packed
(
Sequence
<
K
,
C
,
Y
,
X
>
{});
constexpr
auto
out_nkhw_desc
=
get_convolution_output_default_4d_tensor_descriptor
(
i
n
_nchw_desc
,
wei_kcyx_desc
,
ConvStrides
{},
ConvDilations
{},
LeftPads
{},
RightPads
{});
i
mg
_nchw_desc
,
wei_kcyx_desc
,
ConvStrides
{},
ConvDilations
{},
LeftPads
{},
RightPads
{});
constexpr
index_t
HO
=
out_nkhw_desc
.
GetLengths
()[
2
];
constexpr
index_t
WO
=
out_nkhw_desc
.
GetLengths
()[
3
];
auto
in_eb_desc
=
make_native_tensor_descriptor_packed
(
Sequence
<
C
*
Y
*
X
,
N
*
HO
*
WO
>
{});
constexpr
auto
col_eb_desc
=
make_native_tensor_descriptor_packed
(
Sequence
<
C
*
Y
*
X
,
N
*
HO
*
WO
>
{});
using
FilterSizes
=
Sequence
<
Y
,
X
>
;
using
OutputSizes
=
Sequence
<
HO
,
WO
>
;
ostream_ConstantTensorDescriptor
(
in_nchw
_desc
,
std
::
cout
<<
"
in_nchw
_desc: "
);
ostream_ConstantTensorDescriptor
(
i
n_eb
_desc
,
std
::
cout
<<
"i
n_eb
_desc: "
);
ostream_ConstantTensorDescriptor
(
col_eb
_desc
,
std
::
cout
<<
"
col_eb
_desc: "
);
ostream_ConstantTensorDescriptor
(
i
mg_nchw
_desc
,
std
::
cout
<<
"i
mg_nchw
_desc: "
);
print_sequence
(
"FilterSizes"
,
FilterSizes
{});
print_sequence
(
"OutputSizes"
,
OutputSizes
{});
print_sequence
(
"LeftPads"
,
LeftPads
{});
...
...
@@ -326,9 +327,9 @@ int main(int argc, char* argv[])
print_sequence
(
"ConvStrides"
,
ConvStrides
{});
print_sequence
(
"ConvDilations"
,
ConvDilations
{});
Tensor
<
float
>
in
_eb
(
make_TensorDescriptor
(
in
_eb_desc
));
Tensor
<
float
>
i
n
_nchw_host
(
make_TensorDescriptor
(
i
n
_nchw_desc
));
Tensor
<
float
>
i
n
_nchw_device
(
make_TensorDescriptor
(
i
n
_nchw_desc
));
Tensor
<
float
>
col
_eb
(
make_TensorDescriptor
(
col
_eb_desc
));
Tensor
<
float
>
i
mg
_nchw_host
(
make_TensorDescriptor
(
i
mg
_nchw_desc
));
Tensor
<
float
>
i
mg
_nchw_device
(
make_TensorDescriptor
(
i
mg
_nchw_desc
));
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
();
...
...
@@ -339,35 +340,37 @@ int main(int argc, char* argv[])
}
bool
do_verification
=
atoi
(
argv
[
1
]);
index
_t
nrepeat
=
atoi
(
argv
[
2
]);
std
::
size
_t
nrepeat
=
atoi
(
argv
[
2
]);
if
(
do_verification
)
{
#if 1
in_eb
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
#if 0
col_eb.GenerateTensorValue(GeneratorTensor_1{1}, num_thread);
img_nchw_device.GenerateTensorValue(GeneratorTensor_1{0}, num_thread);
#else
in_eb
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
col_eb
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
img_nchw_device
.
GenerateTensorValue
(
GeneratorTensor_1
{
0
},
num_thread
);
#endif
}
#if
0
device_col2im
(in
_eb_desc,
in
_eb,
in
_nchw_desc,
in
_nchw_device,
FilterSizes{},
OutputSizes{},
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
#if
1
device_col2im
_eb_nchw
(
col
_eb_desc
,
col
_eb
,
img
_nchw_desc
,
img
_nchw_device
,
FilterSizes
{},
OutputSizes
{},
ConvStrides
{},
ConvDilations
{},
LeftPads
{},
RightPads
{},
nrepeat
);
#endif
if
(
do_verification
)
{
host_col2im
(
in
_eb
,
i
n
_nchw_host
,
host_col2im
(
col
_eb
,
i
mg
_nchw_host
,
FilterSizes
{},
OutputSizes
{},
ConvStrides
{},
...
...
@@ -375,12 +378,12 @@ int main(int argc, char* argv[])
LeftPads
{},
RightPads
{});
check_error
(
i
n
_nchw_host
,
i
n
_nchw_device
);
check_error
(
i
mg
_nchw_host
,
i
mg
_nchw_device
);
#if
1
LogRange
(
std
::
cout
<<
"
in
_eb : "
,
in
_eb
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
<<
"i
n
_nchw_host : "
,
i
n
_nchw_host
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
<<
"i
n
_nchw_device : "
,
i
n
_nchw_device
.
mData
,
","
)
<<
std
::
endl
;
#if
0
LogRange(std::cout << "
col
_eb : ",
col
_eb.mData, ",") << std::endl;
LogRange(std::cout << "i
mg
_nchw_host : ", i
mg
_nchw_host.mData, ",") << std::endl;
LogRange(std::cout << "i
mg
_nchw_device : ", i
mg
_nchw_device.mData, ",") << std::endl;
#endif
}
}
driver/src/conv_driver.cpp
View file @
6ed96265
...
...
@@ -29,7 +29,7 @@ int main(int argc, char* argv[])
{
using
namespace
ck
;
#if
1
#if
0
constexpr index_t N = 8;
constexpr index_t C = 32;
constexpr index_t HI = 28;
...
...
@@ -393,7 +393,7 @@ int main(int argc, char* argv[])
#elif 0
device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
(
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
nrepeat
);
#elif
1
#elif
0
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
...
...
@@ -403,7 +403,7 @@ int main(int argc, char* argv[])
ConvStrides
{},
ConvDilations
{},
nrepeat
);
#elif
1
#elif
0
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
...
...
@@ -445,7 +445,7 @@ int main(int argc, char* argv[])
ConvStrides
{},
ConvDilations
{},
nrepeat
);
#elif
0
#elif
1
device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment