gaoqiong / composable_kernel

Commit 506a823a
Authored May 30, 2020 by Chao Liu

    clean up

Parent: 80901f59
Showing 15 changed files with 6094 additions and 5897 deletions.
Changed files:

    driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp              +44    -43
    driver/include/device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp               +0   -220
    driver/include/device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp               +0   -178
    driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp               +9     -6
    driver/include/device_convolution_implicit_gemm_v4r4_xdlops_fp16_nchw_kcyx_nkhw.hpp   +0   -181
    driver/include/device_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp        +0   -178
    driver/include/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp              +0   -214
    driver/include/host_col2im.hpp                                                        +0    -71
    driver/src/col2im_driver.cpp                                                          +0   -385
    driver/src/col2im_driver.cu                                                           +0     -1
    driver/src/conv_bwd_data_driver.cpp                                                  +13    -13
    driver/src/conv_bwd_data_driver.cu (deleted symlink)                                +290     -1
    driver/src/conv_bwd_data_driver.cu (new file)                                       +290     -1
    driver/src/conv_driver.cpp                                                            +0    -52
    external/include/half.hpp                                                          +5448  -4353
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp

@@ -2,7 +2,7 @@
 #include <unistd.h>
 #include "device.hpp"
 #include "tensor.hpp"
-#include "gridwise_convolution_kernel_wrapper.hpp"
+#include "gridwise_operation_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp"

 template <typename T,

@@ -770,47 +770,45 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

-    constexpr auto gridwise_conv =
+    using gridwise_conv =
         GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer<
             GridSize, BlockSize, T, T,
             decltype(in_nchw_desc), decltype(wei_kcyx_desc), decltype(out_nkhw_desc),
             ConvStrides, ConvDilations, LeftPads, RightPads,
+            ConvolutionDirection::Forward,
             BPerBlock, KPerBlock, EPerBlock,
             GemmNRepeat, GemmMPerThread, GemmNPerThread, GemmKPerThread,
             GemmMLevel0Cluster, GemmNLevel0Cluster, GemmMLevel1Cluster, GemmNLevel1Cluster,
             GemmDataPerReadA, GemmDataPerReadB,
             InBlockCopySubLengths_E_N1_B_N2, InBlockCopyClusterLengths_E_N1_B_N2,
             InBlockCopyThreadClusterArrangeOrder, InBlockCopySrcAccessOrder, InBlockCopyDstAccessOrder,
             InBlockCopySrcDataPerRead_B, InBlockCopyDstDataPerWrite_N2,
             WeiBlockCopySubLengths_E_K, WeiBlockCopyClusterLengths_E_K,
             WeiBlockCopyThreadClusterArrangeOrder, WeiBlockCopySrcAccessOrder, WeiBlockCopyDstAccessOrder,
-            WeiBlockCopySrcDataPerRead_E, WeiBlockCopyDstDataPerWrite_K>{};
+            WeiBlockCopySrcDataPerRead_E, WeiBlockCopyDstDataPerWrite_K>;

     for(index_t i = 0; i < 5; ++i)
     {

@@ -821,7 +819,10 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
         for(index_t j = 0; j < nrepeat; ++j)
         {
-            launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), TDevice>,
+            launch_kernel(run_gridwise_operation<gridwise_conv,
+                                                 const TDevice* const __restrict__,
+                                                 const TDevice* const __restrict__,
+                                                 TDevice* const __restrict__>,
                           dim3(GridSize),
                           dim3(BlockSize),
                           0,
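The same two changes recur in every driver header this commit keeps: the gridwise convolution is now named as a type (using gridwise_conv = ...;) rather than instantiated as a constexpr object, and it is launched through the generic run_gridwise_operation wrapper with the kernel argument types spelled out explicitly. The sketch below is only an illustration of what such a generic wrapper kernel could look like; it is an assumption for the reader, not the contents of the repository's gridwise_operation_wrapper.hpp.

    // Hypothetical sketch, not the repository's gridwise_operation_wrapper.hpp:
    // a generic wrapper kernel that default-constructs the gridwise operation
    // type and forwards the raw pointer arguments to its Run() method.
    template <typename GridwiseOp, typename... Xs>
    __global__ void run_gridwise_operation(Xs... xs)
    {
        GridwiseOp{}.Run(xs...);
    }

    // Call pattern matching the hunk above (p_in / p_wei / p_out are the device buffers):
    //   launch_kernel(run_gridwise_operation<gridwise_conv,
    //                                        const TDevice* const __restrict__,
    //                                        const TDevice* const __restrict__,
    //                                        TDevice* const __restrict__>,
    //                 dim3(GridSize), dim3(BlockSize), 0, 0, p_in, p_wei, p_out);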
driver/include/device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp
deleted, 100644 → 0

    #pragma once
    #include <unistd.h>
    #include "device.hpp"
    #include "tensor.hpp"
    #include "gridwise_convolution_kernel_wrapper.hpp"
    #include "gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp"

    using namespace ck;

    template <class T, class InDesc, class WeiDesc, class OutDesc, class ConvStrides, class ConvDilations>
    void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc,
                                                              const Tensor<T>& in_nchw,
                                                              WeiDesc,
                                                              const Tensor<T>& wei_kcyx,
                                                              OutDesc,
                                                              Tensor<T>& out_nkhw,
                                                              ConvStrides,
                                                              ConvDilations,
                                                              index_t nrepeat)
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        constexpr auto in_nchw_desc  = InDesc{};
        constexpr auto wei_kcyx_desc = WeiDesc{};
        constexpr auto out_nkhw_desc = OutDesc{};

        constexpr index_t Hi = in_nchw_desc.GetLength(I2);
        constexpr index_t Wi = in_nchw_desc.GetLength(I3);
        constexpr index_t N  = out_nkhw_desc.GetLength(I0);
        constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
        constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
        constexpr index_t K  = wei_kcyx_desc.GetLength(I0);
        constexpr index_t C  = wei_kcyx_desc.GetLength(I1);
        constexpr index_t Y  = wei_kcyx_desc.GetLength(I2);
        constexpr index_t X  = wei_kcyx_desc.GetLength(I3);

        std::size_t data_sz = sizeof(T);
        DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
        DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
        DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());

        in_nchw_device_buf.ToDevice(in_nchw.mData.data());
        wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
        out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

    #if 0
        // 1x1 filter, 8x8 image
        constexpr index_t N0 = 1, Ho0 = 2, Wo0 = 1, N2 = 4, Ho2 = 1, Wo2 = 1;
        constexpr index_t BlockSize = 256;
        constexpr index_t BPerBlock = 16, KPerBlock = 128, EPerBlock = 8;
        constexpr index_t GemmMPerThreadSubC = 4, GemmNPerThreadSubC = 4;
        constexpr index_t GemmMLevel0Cluster = 4, GemmNLevel0Cluster = 4;
        constexpr index_t GemmMLevel1Cluster = 4, GemmNLevel1Cluster = 4;
        constexpr index_t GemmKPerThreadLoop = 1, GemmDataPerReadA = 4, GemmDataPerReadB = 4;

        using InBlockCopySubLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2     = Sequence<1, 1, 1, 1, 1, 4, 1, 1>;
        using InBlockCopyClusterLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2 = Sequence<8, 1, 2, 1, 16, 1, 1, 1>;
        using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 5, 2, 6, 3, 4, 7>; // [E, N0, N2, Ho0, Ho2, Wo0, B, Wo2]
        using InBlockCopySrcAccessOrder            = Sequence<0, 1, 5, 2, 6, 3, 4, 7>; // [E, N0, N2, Ho0, Ho2, Wo0, B, Wo2]
        using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2, 3, 4, 5, 6, 7>; // [E, N0, Ho0, Wo0, B, N2, Ho2, Wo2]
        constexpr index_t InBlockCopyDataPerAccess_W2 = 1;

        using WeiBlockCopySubLengths_E_K            = Sequence<4, 1>;
        using WeiBlockCopyClusterLengths_E_K        = Sequence<2, 128>;
        using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
        using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
        using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]
        constexpr index_t WeiBlockCopySrcDataPerRead_E = 4, WeiBlockCopyDstDataPerWrite_K = 1;
    #elif 1
        // 1x1 filter, 8x8 image
        constexpr index_t N0 = 1, Ho0 = 2, Wo0 = 1, N2 = 2, Ho2 = 2, Wo2 = 1;
        constexpr index_t BlockSize = 256;
        constexpr index_t BPerBlock = 16, KPerBlock = 128, EPerBlock = 8;
        constexpr index_t GemmMPerThreadSubC = 4, GemmNPerThreadSubC = 4;
        constexpr index_t GemmMLevel0Cluster = 4, GemmNLevel0Cluster = 4;
        constexpr index_t GemmMLevel1Cluster = 4, GemmNLevel1Cluster = 4;
        constexpr index_t GemmKPerThreadLoop = 1, GemmDataPerReadA = 4, GemmDataPerReadB = 4;

        using InBlockCopySubLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2     = Sequence<1, 1, 2, 1, 1, 2, 1, 1>;
        using InBlockCopyClusterLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2 = Sequence<8, 1, 1, 1, 16, 1, 2, 1>;
        using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 5, 2, 6, 3, 4, 7>; // [E, N0, N2, Ho0, Ho2, Wo0, B, Wo2]
        using InBlockCopySrcAccessOrder            = Sequence<0, 1, 5, 2, 6, 3, 4, 7>; // [E, N0, N2, Ho0, Ho2, Wo0, B, Wo2]
        using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2, 3, 4, 5, 6, 7>; // [E, N0, Ho0, Wo0, B, N2, Ho2, Wo2]
        constexpr index_t InBlockCopyDataPerAccess_W2 = 1;

        using WeiBlockCopySubLengths_E_K            = Sequence<4, 1>;
        using WeiBlockCopyClusterLengths_E_K        = Sequence<2, 128>;
        using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
        using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
        using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]
        constexpr index_t WeiBlockCopySrcDataPerRead_E = 4, WeiBlockCopyDstDataPerWrite_K = 1;
    #endif

        constexpr index_t N1  = N / (N0 * N2);
        constexpr index_t Ho1 = Ho / (Ho0 * Ho2);
        constexpr index_t Wo1 = Wo / (Wo0 * Wo2);
        constexpr index_t B   = N1 * Ho1 * Wo1;

        constexpr index_t GridSize =
            ((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);

        printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

        for(index_t i = 0; i < nrepeat; ++i)
        {
            constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer<
                GridSize, BlockSize, T,
                decltype(in_nchw_desc), decltype(wei_kcyx_desc), decltype(out_nkhw_desc),
                ConvStrides, ConvDilations,
                N1, N2, Ho1, Ho2, Wo1, Wo2,
                BPerBlock, KPerBlock, EPerBlock,
                GemmMPerThreadSubC, GemmNPerThreadSubC,
                GemmMLevel0Cluster, GemmNLevel0Cluster, GemmMLevel1Cluster, GemmNLevel1Cluster,
                GemmKPerThreadLoop, GemmDataPerReadA, GemmDataPerReadB,
                InBlockCopySubLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2,
                InBlockCopyClusterLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2,
                InBlockCopyThreadClusterArrangeOrder, InBlockCopySrcAccessOrder, InBlockCopyDstAccessOrder,
                InBlockCopyDataPerAccess_W2,
                WeiBlockCopySubLengths_E_K, WeiBlockCopyClusterLengths_E_K,
                WeiBlockCopyThreadClusterArrangeOrder, WeiBlockCopySrcAccessOrder, WeiBlockCopyDstAccessOrder,
                WeiBlockCopySrcDataPerRead_E, WeiBlockCopyDstDataPerWrite_K>{};

            float time = launch_and_time_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                                dim3(GridSize),
                                                dim3(BlockSize),
                                                0,
                                                static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
                                                static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
                                                static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));

            printf("Elapsed time : %f ms, %f TFlop/s\n",
                   time,
                   (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
                       (std::size_t(1000) * 1000 * 1000) / time);

            usleep(std::min(time * 1000, float(10000)));
        }

        out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
    }
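For orientation, the grid size in the deleted v4r2 driver above is a plain ceiling division of the block-tiled B and K dimensions. A small constexpr walk-through using the tuning values from its "#elif 1" block and an assumed problem size (N, Ho, Wo, K below are illustrative, not taken from the commit):

    // Tuning values from the deleted driver's "#elif 1" block; N/Ho/Wo/K are assumed.
    constexpr int N0 = 1, N2 = 2, Ho0 = 2, Ho2 = 2, Wo0 = 1, Wo2 = 1;
    constexpr int BPerBlock = 16, KPerBlock = 128;
    constexpr int N = 128, Ho = 8, Wo = 8, K = 256; // assumed problem size

    constexpr int N1  = N / (N0 * N2);    // 64
    constexpr int Ho1 = Ho / (Ho0 * Ho2); // 2
    constexpr int Wo1 = Wo / (Wo0 * Wo2); // 8
    constexpr int B   = N1 * Ho1 * Wo1;   // 1024

    constexpr int GridSize = ((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);
    static_assert(GridSize == 64 * 2, "64 B-tiles times 2 K-tiles = 128 workgroups");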
driver/include/device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp
deleted, 100644 → 0

    #pragma once
    #include <unistd.h>
    #include "device.hpp"
    #include "tensor.hpp"
    #include "gridwise_convolution_kernel_wrapper.hpp"
    #include "gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp"

    using namespace ck;

    template <class T, class InDesc, class WeiDesc, class OutDesc, class ConvStrides, class ConvDilations>
    void device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw(InDesc,
                                                              const Tensor<T>& in_nchw,
                                                              WeiDesc,
                                                              const Tensor<T>& wei_kcyx,
                                                              OutDesc,
                                                              Tensor<T>& out_nkhw,
                                                              ConvStrides,
                                                              ConvDilations,
                                                              index_t nrepeat)
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        constexpr auto in_nchw_desc  = InDesc{};
        constexpr auto wei_kcyx_desc = WeiDesc{};
        constexpr auto out_nkhw_desc = OutDesc{};

        constexpr index_t Hi = in_nchw_desc.GetLength(I2);
        constexpr index_t Wi = in_nchw_desc.GetLength(I3);
        constexpr index_t N  = out_nkhw_desc.GetLength(I0);
        constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
        constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
        constexpr index_t K  = wei_kcyx_desc.GetLength(I0);
        constexpr index_t C  = wei_kcyx_desc.GetLength(I1);
        constexpr index_t Y  = wei_kcyx_desc.GetLength(I2);
        constexpr index_t X  = wei_kcyx_desc.GetLength(I3);

        std::size_t data_sz = sizeof(T);
        DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
        DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
        DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());

        in_nchw_device_buf.ToDevice(in_nchw.mData.data());
        wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
        out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

    #if 1
        // 1x1 filter, 8x8 image
        constexpr index_t N1 = 2, Ho1 = 1, Wo1 = 1, N2 = 1, Ho2 = 1, Wo2 = 4;
        constexpr index_t BlockSize = 256;
        constexpr index_t BPerBlock = 16, KPerBlock = 128, EPerBlock = 8;
        constexpr index_t GemmMPerThreadSubC = 4, GemmNPerThreadSubC = 4;
        constexpr index_t GemmMLevel0Cluster = 4, GemmNLevel0Cluster = 4;
        constexpr index_t GemmMLevel1Cluster = 4, GemmNLevel1Cluster = 4;
        constexpr index_t GemmKPerThreadLoop = 1, GemmDataPerReadA = 4, GemmDataPerReadB = 4;

        using InBlockCopySubLengths_E_N1_Ho1_Wo1_B_N2_Ho2_Wo2     = Sequence<1, 1, 1, 1, 1, 1, 1, 4>;
        using InBlockCopyClusterLengths_E_N1_Ho1_Wo1_B_N2_Ho2_Wo2 = Sequence<8, 2, 1, 1, 16, 1, 1, 1>;
        using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 5, 2, 6, 3, 4, 7>; // [E, N1, N2, Ho1, Ho2, Wo1, B, Wo2]
        using InBlockCopySrcAccessOrder            = Sequence<0, 1, 5, 2, 6, 3, 4, 7>; // [E, N1, N2, Ho1, Ho2, Wo1, B, Wo2]
        using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2, 3, 4, 5, 6, 7>; // [E, N1, Ho1, Wo1, B, N2, Ho2, Wo2]
        constexpr index_t InBlockCopyDataPerAccess_W2 = 4;

        using WeiBlockCopySubLengths_E_K            = Sequence<4, 1>;
        using WeiBlockCopyClusterLengths_E_K        = Sequence<2, 128>;
        using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
        using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
        using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]
        constexpr index_t WeiBlockCopySrcDataPerRead_E = 4, WeiBlockCopyDstDataPerWrite_K = 1;
    #endif

        constexpr index_t N0  = N / (N1 * N2);
        constexpr index_t Ho0 = Ho / (Ho1 * Ho2);
        constexpr index_t Wo0 = Wo / (Wo1 * Wo2);
        constexpr index_t B   = N0 * Ho0 * Wo0;

        constexpr index_t GridSize =
            ((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);

        printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

        for(index_t i = 0; i < nrepeat; ++i)
        {
            constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer<
                GridSize, BlockSize, T,
                decltype(in_nchw_desc), decltype(wei_kcyx_desc), decltype(out_nkhw_desc),
                ConvStrides, ConvDilations,
                N0, N1, N2, Ho0, Ho1, Ho2, Wo0, Wo1, Wo2,
                BPerBlock, KPerBlock, EPerBlock,
                GemmMPerThreadSubC, GemmNPerThreadSubC,
                GemmMLevel0Cluster, GemmNLevel0Cluster, GemmMLevel1Cluster, GemmNLevel1Cluster,
                GemmKPerThreadLoop, GemmDataPerReadA, GemmDataPerReadB,
                InBlockCopySubLengths_E_N1_Ho1_Wo1_B_N2_Ho2_Wo2,
                InBlockCopyClusterLengths_E_N1_Ho1_Wo1_B_N2_Ho2_Wo2,
                InBlockCopyThreadClusterArrangeOrder, InBlockCopySrcAccessOrder, InBlockCopyDstAccessOrder,
                InBlockCopyDataPerAccess_W2,
                WeiBlockCopySubLengths_E_K, WeiBlockCopyClusterLengths_E_K,
                WeiBlockCopyThreadClusterArrangeOrder, WeiBlockCopySrcAccessOrder, WeiBlockCopyDstAccessOrder,
                WeiBlockCopySrcDataPerRead_E, WeiBlockCopyDstDataPerWrite_K>{};

            float time = launch_and_time_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                                dim3(GridSize),
                                                dim3(BlockSize),
                                                0,
                                                static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
                                                static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
                                                static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));

            printf("Elapsed time : %f ms, %f TFlop/s\n",
                   time,
                   (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
                       (std::size_t(1000) * 1000 * 1000) / time);

            usleep(std::min(time * 1000, float(10000)));
        }

        out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
    }
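A quick unit check on the reporting line shared by these drivers: time is measured in milliseconds, so dividing the flop count by 1000 * 1000 * 1000 and then by the elapsed milliseconds really does print TFlop/s. The numbers below are assumed for illustration, not measurements from this commit.

    #include <cstdio>

    int main()
    {
        const double flops   = 1.0e11; // assumed work: 100 Gflop
        const double time_ms = 10.0;   // assumed elapsed time in milliseconds

        // Same shape as the drivers' expression: flops / (1000 * 1000 * 1000) / time.
        const double tflops = flops / (1000.0 * 1000.0 * 1000.0) / time_ms;

        // 1.0e11 flop in 0.01 s is 1.0e13 flop/s, i.e. 10 TFlop/s.
        std::printf("%.1f TFlop/s\n", tflops); // prints 10.0
    }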
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp

 #include <unistd.h>
 #include "device.hpp"
 #include "tensor.hpp"
-#include "gridwise_convolution_kernel_wrapper.hpp"
+#include "gridwise_operation_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"

 template <class T,

@@ -120,7 +120,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN   = 1;
     constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
-#elif 0
+#elif 1
     // cdata = 64, BlockSize = 256, 128x128x8
     constexpr index_t BlockSize = 256;

@@ -793,7 +793,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN   = 2;
     constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
-#elif 1
+#elif 0
     // cdata = 64, BlockSize = 64, 32x128x3
     constexpr index_t BlockSize = 64;

@@ -968,7 +968,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

-    constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw<
+    using gridwise_conv = GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw<
         GridSize,
         BlockSize,
         TDevice,

@@ -1000,7 +1000,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
         GemmBBlockCopyThreadClusterLengths_GemmK_GemmN,
         GemmBBlockCopySrcDataPerRead_GemmN,
         GemmBBlockCopyDstDataPerWrite_GemmN,
-        GemmCThreadCopyDstDataPerWrite_GemmN1>{};
+        GemmCThreadCopyDstDataPerWrite_GemmN1>;

     for(index_t i = 0; i < 5; ++i)
     {

@@ -1011,7 +1011,10 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
         for(index_t j = 0; j < nrepeat; ++j)
         {
-            launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), TDevice>,
+            launch_kernel(run_gridwise_operation<gridwise_conv,
+                                                 const TDevice* const __restrict__,
+                                                 const TDevice* const __restrict__,
+                                                 TDevice* const __restrict__>,
                           dim3(GridSize),
                           dim3(BlockSize),
                           0,
driver/include/device_convolution_implicit_gemm_v4r4_xdlops_fp16_nchw_kcyx_nkhw.hpp
deleted, 100644 → 0

    #include <unistd.h>
    #include "device.hpp"
    #include "tensor.hpp"
    #include "gridwise_convolution_kernel_wrapper.hpp"
    #include "gridwise_convolution_implicit_gemm_v4r4_xdlops_fp16_nchw_kcyx_nkhw.hpp"

    template <class T, class InDesc, class WeiDesc, class OutDesc,
              class ConvStrides, class ConvDilations, class InLeftPads, class InRightPads>
    void device_convolution_implicit_gemm_v4r4_xdlops_fp16_nchw_kcyx_nkhw(
        InDesc, const Tensor<T>& in_nchw,
        WeiDesc, const Tensor<T>& wei_kcyx,
        OutDesc, Tensor<T>& out_nkhw,
        ConvStrides, ConvDilations, InLeftPads, InRightPads, ck::index_t nrepeat)
    {
        using namespace ck;

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        constexpr auto in_nchw_desc  = make_native_tensor_descriptor(InDesc::GetLengths(), InDesc::GetStrides());
        constexpr auto wei_kcyx_desc = make_native_tensor_descriptor(WeiDesc::GetLengths(), WeiDesc::GetStrides());
        constexpr auto out_nkhw_desc = make_native_tensor_descriptor(OutDesc::GetLengths(), OutDesc::GetStrides());

        constexpr index_t N  = out_nkhw_desc.GetLength(I0);
        constexpr index_t K  = out_nkhw_desc.GetLength(I1);
        constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
        constexpr index_t Wo = out_nkhw_desc.GetLength(I3);

        std::size_t data_sz = sizeof(T);
        DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
        DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
        DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());

        in_nchw_device_buf.ToDevice(in_nchw.mData.data());
        wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
        out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

        // cdata = 64, BlockSize = 256, 128x128x16
        constexpr index_t BlockSize = 128;
        constexpr index_t GemmMPerBlock = 128, GemmNPerBlock = 64, GemmKPerBlock = 4, GemmKPACK = 4;
        constexpr index_t GemmMPerWave = 64, GemmNPerWave = 64;
        constexpr index_t ThreadGemmDataPerReadM = 1, ThreadGemmDataPerReadN = 1;

        using GemmABlockCopyThreadSliceLengths_GemmK_GemmM_GemmKPACK   = Sequence<1, 4, 4>;
        using GemmABlockCopyThreadClusterLengths_GemmK_GemmM_GemmKPACK = Sequence<4, 32, 1>;
        constexpr index_t GemmABlockCopySrcDataPerRead_GemmKPACK  = 1;
        constexpr index_t GemmABlockCopyDstDataPerWrite_GemmKPACK = 1;

        using GemmBBlockCopyThreadSliceLengths_GemmK_GemmN_GemmKPACK   = Sequence<1, 2, 4>;
        using GemmBBlockCopyThreadClusterLengths_GemmK_GemmN_GemmKPACK = Sequence<4, 32, 1>;
        constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN      = 1;
        constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmKPACK = 1;

        constexpr index_t GemmM = K;
        constexpr index_t GemmN = N * Ho * Wo;

        constexpr index_t GridSize = math::integer_divide_ceil(GemmM, GemmMPerBlock) *
                                     math::integer_divide_ceil(GemmN, GemmNPerBlock);

        printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

        constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v4r4_xdlops_fwd_fp16_nchw_kcyx_nkhw<
            GridSize, BlockSize, half, float,
            decltype(in_nchw_desc), decltype(wei_kcyx_desc), decltype(out_nkhw_desc),
            ConvStrides, ConvDilations, InLeftPads, InRightPads,
            GemmMPerBlock, GemmNPerBlock, GemmKPerBlock, GemmKPACK, GemmMPerWave, GemmNPerWave,
            ThreadGemmDataPerReadM, ThreadGemmDataPerReadN,
            GemmABlockCopyThreadSliceLengths_GemmK_GemmM_GemmKPACK,
            GemmABlockCopyThreadClusterLengths_GemmK_GemmM_GemmKPACK,
            GemmABlockCopySrcDataPerRead_GemmKPACK,
            GemmABlockCopyDstDataPerWrite_GemmKPACK,
            GemmBBlockCopyThreadSliceLengths_GemmK_GemmN_GemmKPACK,
            GemmBBlockCopyThreadClusterLengths_GemmK_GemmN_GemmKPACK,
            GemmBBlockCopySrcDataPerRead_GemmN,
            GemmBBlockCopyDstDataPerWrite_GemmKPACK>{};

        for(index_t i = 0; i < 10; ++i)
        {
            float time = launch_and_time_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                                dim3(GridSize), dim3(BlockSize), 0, 0,
                                                static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
                                                static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
                                                static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));

            printf("Elapsed time : %f ms, %f TFlop/s\n",
                   time,
                   (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
                       (std::size_t(1000) * 1000 * 1000) / time);
        }

        // warm up
        printf("Warn up running %d times...\n", nrepeat);
        for(index_t i = 0; i < nrepeat; ++i)
        {
            launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                          dim3(GridSize), dim3(BlockSize), 0, 0,
                          static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
                          static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
                          static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
        }

        printf("Start running %d times...\n", nrepeat);

        cudaDeviceSynchronize();
        auto start = std::chrono::steady_clock::now();

        for(index_t i = 0; i < nrepeat; ++i)
        {
            launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                          dim3(GridSize), dim3(BlockSize), 0, 0,
                          static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
                          static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
                          static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
        }

        cudaDeviceSynchronize();
        auto end = std::chrono::steady_clock::now();

        float ave_time = std::chrono::duration<float, std::milli>(end - start).count() / nrepeat;

        printf("Average elapsed time : %f ms, %f TFlop/s\n",
               ave_time,
               (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
                   (std::size_t(1000) * 1000 * 1000) / ave_time);

        out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
    }
driver/include/device_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp
deleted, 100644 → 0

    #include <unistd.h>
    #include "device.hpp"
    #include "tensor.hpp"
    #include "gridwise_convolution_kernel_wrapper.hpp"
    #include "gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp"

    template <class T, class InDesc, class WeiDesc, class OutDesc,
              class ConvStrides, class ConvDilations, class InLeftPads, class InRightPads>
    void device_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
        InDesc, const Tensor<T>& in_nchw,
        WeiDesc, const Tensor<T>& wei_kcyx,
        OutDesc, Tensor<T>& out_nkhw,
        ConvStrides, ConvDilations, InLeftPads, InRightPads, ck::index_t nrepeat)
    {
        using namespace ck;

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        constexpr auto in_nchw_desc  = make_native_tensor_descriptor(InDesc::GetLengths(), InDesc::GetStrides());
        constexpr auto wei_kcyx_desc = make_native_tensor_descriptor(WeiDesc::GetLengths(), WeiDesc::GetStrides());
        constexpr auto out_nkhw_desc = make_native_tensor_descriptor(OutDesc::GetLengths(), OutDesc::GetStrides());

        constexpr index_t N  = out_nkhw_desc.GetLength(I0);
        constexpr index_t K  = out_nkhw_desc.GetLength(I1);
        constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
        constexpr index_t Wo = out_nkhw_desc.GetLength(I3);

        std::size_t data_sz = sizeof(T);
        DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
        DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
        DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());

        in_nchw_device_buf.ToDevice(in_nchw.mData.data());
        wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
        out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

        // cdata = 64, BlockSize = 256, 128x128x16
        constexpr index_t BlockSize = 256;
        constexpr index_t GemmMPerBlock = 128, GemmNPerBlock = 128, GemmKPerBlock = 16;
        constexpr index_t GemmMPerWave = 64, GemmNPerWave = 64;
        constexpr index_t ThreadGemmDataPerReadM = 1, ThreadGemmDataPerReadN = 1;

        using GemmABlockCopyThreadSliceLengths_GemmK_GemmM   = Sequence<4, 2>;
        using GemmABlockCopyThreadClusterLengths_GemmK_GemmM = Sequence<4, 64>;
        constexpr index_t GemmABlockCopySrcDataPerRead_GemmK  = 4;
        constexpr index_t GemmABlockCopyDstDataPerWrite_GemmM = 1;

        using GemmBBlockCopyThreadSliceLengths_GemmK_GemmN   = Sequence<4, 2>;
        using GemmBBlockCopyThreadClusterLengths_GemmK_GemmN = Sequence<4, 64>;
        constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN  = 1;
        constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;

        constexpr index_t GemmM = K;
        constexpr index_t GemmN = N * Ho * Wo;

        constexpr index_t GridSize = math::integer_divide_ceil(GemmM, GemmMPerBlock) *
                                     math::integer_divide_ceil(GemmN, GemmNPerBlock);

        printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

        constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v4r4_xdlops_fwd_fp32_nchw_kcyx_nkhw<
            GridSize, BlockSize, T, T,
            decltype(in_nchw_desc), decltype(wei_kcyx_desc), decltype(out_nkhw_desc),
            ConvStrides, ConvDilations, InLeftPads, InRightPads,
            GemmMPerBlock, GemmNPerBlock, GemmKPerBlock, GemmMPerWave, GemmNPerWave,
            ThreadGemmDataPerReadM, ThreadGemmDataPerReadN,
            GemmABlockCopyThreadSliceLengths_GemmK_GemmM,
            GemmABlockCopyThreadClusterLengths_GemmK_GemmM,
            GemmABlockCopySrcDataPerRead_GemmK,
            GemmABlockCopyDstDataPerWrite_GemmM,
            GemmBBlockCopyThreadSliceLengths_GemmK_GemmN,
            GemmBBlockCopyThreadClusterLengths_GemmK_GemmN,
            GemmBBlockCopySrcDataPerRead_GemmN,
            GemmBBlockCopyDstDataPerWrite_GemmN>{};

        for(index_t i = 0; i < 10; ++i)
        {
            float time = launch_and_time_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                                dim3(GridSize), dim3(BlockSize), 0, 0,
                                                static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
                                                static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
                                                static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));

            printf("Elapsed time : %f ms, %f TFlop/s\n",
                   time,
                   (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
                       (std::size_t(1000) * 1000 * 1000) / time);
        }

        // warm up
        printf("Warn up running %d times...\n", nrepeat);
        for(index_t i = 0; i < nrepeat; ++i)
        {
            launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                          dim3(GridSize), dim3(BlockSize), 0, 0,
                          static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
                          static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
                          static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
        }

        printf("Start running %d times...\n", nrepeat);

        cudaDeviceSynchronize();
        auto start = std::chrono::steady_clock::now();

        for(index_t i = 0; i < nrepeat; ++i)
        {
            launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                          dim3(GridSize), dim3(BlockSize), 0, 0,
                          static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
                          static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
                          static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
        }

        cudaDeviceSynchronize();
        auto end = std::chrono::steady_clock::now();

        float ave_time = std::chrono::duration<float, std::milli>(end - start).count() / nrepeat;

        printf("Average elapsed time : %f ms, %f TFlop/s\n",
               ave_time,
               (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
                   (std::size_t(1000) * 1000 * 1000) / ave_time);

        out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
    }
driver/include/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
deleted, 100644 → 0

    #pragma once
    #include <unistd.h>
    #include "device.hpp"
    #include "tensor.hpp"
    #include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"

    using namespace ck;

    template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
    void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
                                                               const Tensor<TInWei>& in_nchw,
                                                               WeiDesc,
                                                               const Tensor<TInWei>& wei_kcyx,
                                                               OutDesc,
                                                               Tensor<TOut>& out_nkhw,
                                                               index_t nrepeat)
    {
        // this suppose in / wei data type is int8x4
        constexpr index_t NVector = 4;

        using accum_t      = int32_t;
        using vector_t     = vector_type<TInWei, NVector>;
        using vector_mem_t = typename vector_t::MemoryType;

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        constexpr auto in_nchw_desc  = InDesc{};
        constexpr auto wei_kcyx_desc = WeiDesc{};
        constexpr auto out_nkhw_desc = OutDesc{};

        constexpr index_t Hi = in_nchw_desc.GetLength(I2);
        constexpr index_t Wi = in_nchw_desc.GetLength(I3);
        constexpr index_t N  = out_nkhw_desc.GetLength(I0);
        constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
        constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
        constexpr index_t K  = wei_kcyx_desc.GetLength(I0);
        constexpr index_t C  = wei_kcyx_desc.GetLength(I1);
        constexpr index_t Y  = wei_kcyx_desc.GetLength(I2);
        constexpr index_t X  = wei_kcyx_desc.GetLength(I3);

        // vectorized input
        auto in_nchw_vec_desc = make_ConstantTensorDescriptor(Sequence<N, C / NVector, Hi, Wi>{});
        ostream_ConstantTensorDescriptor(in_nchw_vec_desc, std::cout << "in_nchw_vec_desc: ");
        Tensor<vector_mem_t> in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc));

        auto f_vectorized_nchw = [&](auto n, auto c, auto h, auto w) {
    #if 0
            in_nchw_vec(n, c, h, w) = in_nchw(n, c, h, w);
    #elif 0
            in_nchw_vec(n, c, h, w) =
                vector_t::Pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w));
    #elif 1
            in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 4 * c, h, w),
                                                     in_nchw(n, 4 * c + 1, h, w),
                                                     in_nchw(n, 4 * c + 2, h, w),
                                                     in_nchw(n, 4 * c + 3, h, w));
    #endif
        };

        make_ParallelTensorFunctor(f_vectorized_nchw, N, C / NVector, Hi, Wi)(
            std::thread::hardware_concurrency());

        // vectorize weight
        auto wei_kcyx_vec_desc = make_ConstantTensorDescriptor(Sequence<K, C / NVector, Y, X>{});
        ostream_ConstantTensorDescriptor(wei_kcyx_vec_desc, std::cout << "wei_kcyx_vec_desc: ");
        Tensor<vector_mem_t> wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc));

        auto f_vectorized_kcyx = [&](auto k, auto c, auto y, auto x) {
    #if 0
            wei_kcyx_vec(k, c, y, x) = wei_kcyx(k, c, y, x);
    #elif 0
            wei_kcyx_vec(k, c, y, x) =
                vector_t::Pack(wei_kcyx(k, 2 * c, y, x), wei_kcyx(k, 2 * c + 1, y, x));
    #elif 1
            wei_kcyx_vec(k, c, y, x) = vector_t::Pack(wei_kcyx(k, 4 * c, y, x),
                                                      wei_kcyx(k, 4 * c + 1, y, x),
                                                      wei_kcyx(k, 4 * c + 2, y, x),
                                                      wei_kcyx(k, 4 * c + 3, y, x));
    #endif
        };

        make_ParallelTensorFunctor(f_vectorized_kcyx, K, C / NVector, Y, X)(
            std::thread::hardware_concurrency());

        //
        DeviceMem in_nchw_vec_device_buf(sizeof(vector_mem_t) * in_nchw_vec.mDesc.GetElementSpace());
        DeviceMem wei_kcyx_vec_device_buf(sizeof(vector_mem_t) * wei_kcyx_vec.mDesc.GetElementSpace());
        DeviceMem out_nkhw_device_buf(sizeof(TOut) * out_nkhw.mDesc.GetElementSpace());

        in_nchw_vec_device_buf.ToDevice(in_nchw_vec.mData.data());
        wei_kcyx_vec_device_buf.ToDevice(wei_kcyx_vec.mData.data());
        out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

    #if 0
        // 3x3, 34x34, 128 thread, fp32, vector = 1
        constexpr index_t NPerBlock = 2, KPerBlock = 32, CPerBlock = 4, HoPerBlock = 2, WoPerBlock = 32;
        constexpr index_t NPerThread = 2, KPerThread = 4, CPerThread = 2, HoPerThread = 2, WoPerThread = 2;
        constexpr index_t InBlockCopyDataPerRead = 2, WeiBlockCopyDataPerRead = 2, BlockSize = 128;
    #elif 0
        // 3x3, 34x34, 128 thread, fp32, vector = 2
        constexpr index_t NPerBlock = 2, KPerBlock = 32, CPerBlock = 2, HoPerBlock = 2, WoPerBlock = 32;
        constexpr index_t NPerThread = 2, KPerThread = 4, CPerThread = 1, HoPerThread = 2, WoPerThread = 2;
        constexpr index_t InBlockCopyDataPerRead = 2, WeiBlockCopyDataPerRead = 2, BlockSize = 128;
    #elif 0
        // 3x3, 34x34, 128 thread, int8, vector = 4
        constexpr index_t NPerBlock = 2, KPerBlock = 32, CPerBlock = 8, HoPerBlock = 4, WoPerBlock = 32;
        constexpr index_t NPerThread = 1, KPerThread = 8, CPerThread = 2, HoPerThread = 4, WoPerThread = 2;
        constexpr index_t InBlockCopyDataPerRead = 2, WeiBlockCopyDataPerRead = 2, BlockSize = 128;
    #elif 1
        // 1x1, 32x32, 128 thread, int8, vector = 4
        constexpr index_t NPerBlock = 1, KPerBlock = 64, CPerBlock = 16, HoPerBlock = 4, WoPerBlock = 32;
        constexpr index_t NPerThread = 1, KPerThread = 8, CPerThread = 2, HoPerThread = 4, WoPerThread = 2;
        constexpr index_t InBlockCopyDataPerRead = 2, WeiBlockCopyDataPerRead = 2, BlockSize = 128;
    #endif

        constexpr index_t GridSize =
            (N / NPerBlock) * (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock);

        printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

        for(index_t i = 0; i < nrepeat; ++i)
        {
            float time = launch_and_time_kernel(
                gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw<
                    TInWei, TOut, accum_t,
                    decltype(in_nchw_vec_desc), decltype(wei_kcyx_vec_desc), decltype(out_nkhw_desc),
                    NVector,
                    NPerBlock, KPerBlock, CPerBlock, HoPerBlock, WoPerBlock,
                    NPerThread, KPerThread, CPerThread, HoPerThread, WoPerThread,
                    InBlockCopyDataPerRead, WeiBlockCopyDataPerRead,
                    BlockSize, GridSize>,
                dim3(GridSize),
                dim3(BlockSize),
                static_cast<TInWei*>(in_nchw_vec_device_buf.GetDeviceBuffer()),
                static_cast<TInWei*>(wei_kcyx_vec_device_buf.GetDeviceBuffer()),
                static_cast<TInWei*>(out_nkhw_device_buf.GetDeviceBuffer()));

            printf("Elapsed time : %f ms\n", time);

            usleep(std::min(time * 1000, float(10000)));
        }

        out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
    }
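The deleted driver above packs four consecutive input channels into one 32-bit lane before copying to the device (the int8x4 path selected by "#elif 1"). The standalone sketch below mimics that packing with plain integer shifts; it is an illustration only and stands in for vector_type<TInWei, 4>::Pack, whose actual implementation is not shown in this page.

    #include <cstdint>
    #include <cstdio>

    // Illustrative stand-in for vector_type<int8_t, 4>::Pack: four int8 channel
    // values packed little-endian into one 32-bit word, as an int8x4 kernel expects.
    uint32_t pack_int8x4(int8_t a, int8_t b, int8_t c, int8_t d)
    {
        return uint32_t(uint8_t(a)) | (uint32_t(uint8_t(b)) << 8) |
               (uint32_t(uint8_t(c)) << 16) | (uint32_t(uint8_t(d)) << 24);
    }

    int main()
    {
        std::printf("0x%08x\n", pack_int8x4(1, 2, 3, 4)); // prints 0x04030201
    }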
driver/include/host_col2im.hpp
deleted, 100644 → 0

    #pragma once
    #include "tensor.hpp"

    template <typename T,
              typename FilterSizes,
              typename OutputSizes,
              typename ConvStrides,
              typename ConvDilations,
              typename LeftPads,
              typename RightPads>
    void host_col2im(const Tensor<T>& in_eb,
                     Tensor<T>& in_nchw,
                     FilterSizes,
                     OutputSizes,
                     ConvStrides,
                     ConvDilations,
                     LeftPads,
                     RightPads)
    {
        using namespace ck;

        int N  = in_nchw.mDesc.GetLengths()[0];
        int C  = in_nchw.mDesc.GetLengths()[1];
        int HI = in_nchw.mDesc.GetLengths()[2];
        int WI = in_nchw.mDesc.GetLengths()[3];

        int Y = FilterSizes{}[0];
        int X = FilterSizes{}[1];

        int HO = OutputSizes{}[0];
        int WO = OutputSizes{}[1];

        auto f = [&](auto n, auto c, auto hi, auto wi) {
            double v = 0;

            for(int y = 0; y < Y; ++y)
            {
                int h_tmp = hi + LeftPads{}[0] - y * ConvDilations{}[0];

                if(h_tmp >= 0 && h_tmp < HI && h_tmp % ConvStrides{}[0] == 0)
                {
                    int ho = h_tmp / ConvStrides{}[0];

                    for(int x = 0; x < X; ++x)
                    {
                        int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1];

                        if(w_tmp >= 0 && w_tmp < WI && w_tmp % ConvStrides{}[1] == 0)
                        {
                            int wo = w_tmp / ConvStrides{}[1];

                            int e = c * (Y * X) + y * X + x;
                            int b = n * (HO * WO) + ho * WO + wo;

                            v += in_eb(e, b);
                        }
                    }
                }
            }

            in_nchw(n, c, hi, wi) = v;
        };

        auto f_par = make_ParallelTensorFunctor(f,
                                                in_nchw.mDesc.GetLengths()[0],
                                                in_nchw.mDesc.GetLengths()[1],
                                                in_nchw.mDesc.GetLengths()[2],
                                                in_nchw.mDesc.GetLengths()[3]);

        f_par(std::thread::hardware_concurrency());
    }
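The deleted host reference walks every image pixel and sums the column-matrix entries that an im2col pass would have produced from it; row index e encodes (c, y, x) and column index b encodes (n, ho, wo). A minimal standalone sketch of just that index mapping (the helper functions are hypothetical, written here for illustration):

    #include <cstdio>

    // Index math mirroring host_col2im above: e runs over (c, y, x), b over (n, ho, wo).
    int col2im_e(int c, int y, int x, int Y, int X)     { return c * (Y * X) + y * X + x; }
    int col2im_b(int n, int ho, int wo, int HO, int WO) { return n * (HO * WO) + ho * WO + wo; }

    int main()
    {
        const int Y = 3, X = 3, HO = 4, WO = 4; // 3x3 filter, 4x4 output (example sizes)
        std::printf("e=%d b=%d\n",
                    col2im_e(/*c=*/1, /*y=*/2, /*x=*/0, Y, X),      // 1*9 + 2*3 + 0 = 15
                    col2im_b(/*n=*/0, /*ho=*/3, /*wo=*/1, HO, WO)); // 0*16 + 3*4 + 1 = 13
    }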
driver/src/col2im_driver.cpp
deleted, 100644 → 0

    #include <iostream>
    #include <numeric>
    #include <initializer_list>
    #include <cstdlib>
    #include <stdlib.h>
    #include "config.hpp"
    #include "tensor_descriptor.hpp"
    #include "tensor_descriptor_helper.hpp"
    #include "print_array.hpp"
    #include "print_sequence.hpp"
    #include "device.hpp"
    #include "tensor_generator.hpp"
    #include "device_tensor.hpp"
    #include "conv_common.hpp"
    #include "host_col2im.hpp"
    #include "device_col2im_eb_nchw.hpp"

    int main(int argc, char* argv[])
    {
        using namespace ck;

    #if 1
        constexpr index_t N = 2, C = 8, HI = 8, WI = 8, K = 128, Y = 4, X = 4;
        using ConvStrides = Sequence<1, 1>;   using ConvDilations = Sequence<1, 1>;
        using LeftPads    = Sequence<1, 1>;   using RightPads     = Sequence<2, 2>;
    #elif 0
        // 3x3, 34x34
        constexpr index_t N = 64, C = 256, HI = 34, WI = 34, K = 128, Y = 3, X = 3;
        using ConvStrides = Sequence<1, 1>;   using ConvDilations = Sequence<1, 1>;
        using LeftPads    = Sequence<0, 0>;   using RightPads     = Sequence<0, 0>;
    #elif 0
        // 1x1 filter, 8x8 image
        // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
        constexpr index_t N = 64, C = 1536, HI = 8, WI = 8, K = 256, Y = 1, X = 1;
        using ConvStrides = Sequence<1, 1>;   using ConvDilations = Sequence<1, 1>;
        using LeftPads    = Sequence<0, 0>;   using RightPads     = Sequence<0, 0>;
    #elif 0
        // 1x1 filter, 8x8 image
        // cudnn@V100 77%, ck@V100 76%, ck@P100 79%, ck@VII 51%
        constexpr index_t N = 128, C = 2048, HI = 8, WI = 8, K = 384, Y = 1, X = 1;
        using ConvStrides = Sequence<1, 1>;   using ConvDilations = Sequence<1, 1>;
        using LeftPads    = Sequence<0, 0>;   using RightPads     = Sequence<0, 0>;
    #elif 0
        // 1x1 filter, 7x7 image
        // cudnn@V100 82%, ck@V100 76%, ck@P100 67%, ck@VII 64%
        constexpr index_t N = 128, C = 832, HI = 7, WI = 7, K = 384, Y = 1, X = 1;
        using ConvStrides = Sequence<1, 1>;   using ConvDilations = Sequence<1, 1>;
        using LeftPads    = Sequence<0, 0>;   using RightPads     = Sequence<0, 0>;
    #elif 0
        // 1x1 filter, 8x8 image
        // cudnn@V100 83%, ck@V100 75%, ck@P100 78%, ck@VII 65%
        constexpr index_t N = 128, C = 1280, HI = 8, WI = 8, K = 384, Y = 1, X = 1;
        using ConvStrides = Sequence<1, 1>;   using ConvDilations = Sequence<1, 1>;
        using LeftPads    = Sequence<0, 0>;   using RightPads     = Sequence<0, 0>;
    #elif 0
        // 1x1 filter, 14x14 image
        // cudnn@V100 62%, ck@V100 68%, ck@P100 70%, ck@VII 50%
        constexpr index_t N = 128, C = 512, HI = 14, WI = 14, K = 128, Y = 1, X = 1;
        using ConvStrides = Sequence<1, 1>;   using ConvDilations = Sequence<1, 1>;
        using LeftPads    = Sequence<0, 0>;   using RightPads     = Sequence<0, 0>;
    #elif 0
        // 1x1 filter, 8x8 image
        // cudnn@V100 74%, ck@V100 57%, ck@P100 78%, ck@VII 61%
        constexpr index_t N = 64, C = 1536, HI = 8, WI = 8, K = 384, Y = 1, X = 1;
        using ConvStrides = Sequence<1, 1>;   using ConvDilations = Sequence<1, 1>;
        using LeftPads    = Sequence<0, 0>;   using RightPads     = Sequence<0, 0>;
    #elif 0
        // 1x1 filter, 28x28 image
        // cudnn@V100 86%, ck@V100 84%, ck@P100 80%, ck@VII 69%
        constexpr index_t N = 128, C = 256, HI = 28, WI = 28, K = 128, Y = 1, X = 1;
        using ConvStrides = Sequence<1, 1>;   using ConvDilations = Sequence<1, 1>;
        using LeftPads    = Sequence<0, 0>;   using RightPads     = Sequence<0, 0>;
    #elif 0
        // 1x1 filter, 7x7 image
        // cudnn@V100 71%, ck@V100 55%, ck@P100 70%, ck@VII 62%
        constexpr index_t N = 128, C = 832, HI = 7, WI = 7, K = 256, Y = 1, X = 1;
        using ConvStrides = Sequence<1, 1>;   using ConvDilations = Sequence<1, 1>;
        using LeftPads    = Sequence<0, 0>;   using RightPads     = Sequence<0, 0>;
    #elif 0
        // 1x1 filter, 17x17 input
        // cudnn@V100 81%, ck@V100 76%, ck@P100 70%, ck@VII 76%
        constexpr index_t N = 128, C = 768, HI = 17, WI = 17, K = 128, Y = 1, X = 1;
        using ConvStrides = Sequence<1, 1>;   using ConvDilations = Sequence<1, 1>;
        using LeftPads    = Sequence<0, 0>;   using RightPads     = Sequence<0, 0>;
    #elif 0
        // 1x1 filter, 14x14 image
        // cudnn@V100 73%, ck@V100 71%, ck@P100 70%, ck@VII 64%
        constexpr index_t N = 128, C = 528, HI = 14, WI = 14, K = 128, Y = 1, X = 1;
        using ConvStrides = Sequence<1, 1>;   using ConvDilations = Sequence<1, 1>;
        using LeftPads    = Sequence<0, 0>;   using RightPads     = Sequence<0, 0>;
    #elif 0
        // 1x1 filter, 14x14 image
        // cudnn@V100 73%, ck@V100 72%, ck@P100 79%, ck@VII 75%
        constexpr index_t N = 128, C = 528, HI = 14, WI = 14, K = 256, Y = 1, X = 1;
        using ConvStrides = Sequence<1, 1>;   using ConvDilations = Sequence<1, 1>;
        using LeftPads    = Sequence<0, 0>;   using RightPads     = Sequence<0, 0>;
    #elif 0
        // 1x1 filter, 7x7 image
        // cudnn@V100 49%, ck@V100 50%, ck@P100 61%, ck@VII 52%
        constexpr index_t N = 128, C = 832, HI = 7, WI = 7, K = 128, Y = 1, X = 1;
        using ConvStrides = Sequence<1, 1>;   using ConvDilations = Sequence<1, 1>;
        using LeftPads    = Sequence<0, 0>;   using RightPads     = Sequence<0, 0>;
    #elif 0
        // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
        // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
        constexpr index_t N = 128, C = 288, HI = 35, WI = 35, K = 384, Y = 3, X = 3;
        using ConvStrides = Sequence<2, 2>;   using ConvDilations = Sequence<1, 1>;
        using LeftPads    = Sequence<0, 0>;   using RightPads     = Sequence<0, 0>;
    #elif 0
        // 5x5 filter, 2x2 pad, 7x7 input
        constexpr index_t N = 128, C = 48, HI = 7, WI = 7, K = 128, Y = 5, X = 5;
        using ConvStrides = Sequence<1, 1>;   using ConvDilations = Sequence<1, 1>;
        using LeftPads    = Sequence<2, 2>;   using RightPads     = Sequence<2, 2>;
    #elif 0
        // 7x1 filter, 3x0 pad, 17x17 input
        constexpr index_t N = 128, C = 128, HI = 17, WI = 17, K = 128, Y = 7, X = 1;
        using ConvStrides = Sequence<1, 1>;   using ConvDilations = Sequence<1, 1>;
        using LeftPads    = Sequence<3, 0>;   using RightPads     = Sequence<3, 0>;
    #elif 1
        // 1x7 filter, 0x3 pad, 17x17 input
        constexpr index_t N = 128, C = 128, HI = 17, WI = 17, K = 128, Y = 1, X = 7;
        using ConvStrides = Sequence<1, 1>;   using ConvDilations = Sequence<1, 1>;
        using LeftPads    = Sequence<0, 3>;   using RightPads     = Sequence<0, 3>;
    #endif

        constexpr auto img_nchw_desc = make_native_tensor_descriptor_packed(Sequence<N, C, HI, WI>{});
        constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence<K, C, Y, X>{});
        constexpr auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor(
            img_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});

        constexpr index_t HO = out_nkhw_desc.GetLengths()[2];
        constexpr index_t WO = out_nkhw_desc.GetLengths()[3];

        constexpr auto col_eb_desc = make_native_tensor_descriptor_packed(Sequence<C * Y * X, N * HO * WO>{});

        using FilterSizes = Sequence<Y, X>;
        using OutputSizes = Sequence<HO, WO>;

        ostream_ConstantTensorDescriptor(col_eb_desc, std::cout << "col_eb_desc: ");
        ostream_ConstantTensorDescriptor(img_nchw_desc, std::cout << "img_nchw_desc: ");
        print_sequence("FilterSizes", FilterSizes{});
        print_sequence("OutputSizes", OutputSizes{});
        print_sequence("LeftPads", LeftPads{});
        print_sequence("LeftPads", LeftPads{});
        print_sequence("RightPads", RightPads{});
        print_sequence("ConvStrides", ConvStrides{});
        print_sequence("ConvDilations", ConvDilations{});

        Tensor<float> col_eb(make_TensorDescriptor(col_eb_desc));
        Tensor<float> img_nchw_host(make_TensorDescriptor(img_nchw_desc));
        Tensor<float> img_nchw_device(make_TensorDescriptor(img_nchw_desc));

        std::size_t num_thread = std::thread::hardware_concurrency();

        if(argc != 3)
        {
            printf("arg1: do_verification, arg2: nrepeat\n");
            exit(1);
        }

        bool do_verification = atoi(argv[1]);
        std::size_t nrepeat  = atoi(argv[2]);

        if(do_verification)
        {
    #if 0
            col_eb.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
    #else
            col_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
    #endif
        }

        device_col2im_eb_nchw(col_eb_desc, col_eb, img_nchw_desc, img_nchw_device,
                              FilterSizes{}, OutputSizes{}, ConvStrides{}, ConvDilations{},
                              LeftPads{}, RightPads{}, nrepeat);

        if(do_verification)
        {
            host_col2im(col_eb, img_nchw_host, FilterSizes{}, OutputSizes{},
                        ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});

            check_error(img_nchw_host, img_nchw_device);

    #if 0
            LogRange(std::cout << "col_eb : ", col_eb.mData, ",") << std::endl;
            LogRange(std::cout << "img_nchw_host : ", img_nchw_host.mData, ",") << std::endl;
            LogRange(std::cout << "img_nchw_device : ", img_nchw_device.mData, ",") << std::endl;
    #endif
        }
    }
driver/src/col2im_driver.cu
deleted, 120000 (symlink) → 0

    col2im_driver.cpp
    \ No newline at end of file
driver/src/conv_bwd_data_driver.cpp

@@ -245,9 +245,9 @@ int main(int argc, char* argv[])
 #endif
 }

-#if 1
+#if 0
     device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw
-#elif 1
+#elif 0
     device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw
 #elif 0
     device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw

@@ -256,17 +256,17 @@ int main(int argc, char* argv[])
 #elif 1
     device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw
 #endif
     (in_nchw_desc,
      in_nchw_device,
      wei_kcyx_desc,
      wei_kcyx,
      out_nkhw_desc,
      out_nkhw,
      ConvStrides{},
      ConvDilations{},
      LeftPads{},
      RightPads{},
      nrepeat);

     if(do_verification)
     {
driver/src/conv_bwd_data_driver.cu
deleted, 120000 (symlink) → 0

    conv_bwd_data_driver.cpp
    \ No newline at end of file
driver/src/conv_bwd_data_driver.cu
0 → 100644
View file @
506a823a
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include "config.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "print_array.hpp"
#include "print_sequence.hpp"
#include "device.hpp"
#include "tensor_generator.hpp"
#include "device_tensor.hpp"
#include "conv_common.hpp"
#include "host_conv_bwd_data.hpp"
#include "device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw.hpp"
#include "device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_backward_data_implicit_gemm_v3r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
int main(int argc, char* argv[])
{
    using namespace launcher;

#if 0
    constexpr index_t N  = 64;
    constexpr index_t C  = 256;
    constexpr index_t HI = 56;
    constexpr index_t WI = 56;
    constexpr index_t K  = 256;
    constexpr index_t Y  = 1;
    constexpr index_t X  = 1;

    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
#elif 0
    // 3x3, 34x34
    constexpr index_t N  = 64;
    constexpr index_t C  = 256;
    constexpr index_t HI = 34;
    constexpr index_t WI = 34;
    constexpr index_t K  = 256;
    constexpr index_t Y  = 3;
    constexpr index_t X  = 3;

    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
#elif 0
    // 3x3, 28x28
    constexpr index_t N  = 128;
    constexpr index_t C  = 1024;
    constexpr index_t HI = 28;
    constexpr index_t WI = 28;
    constexpr index_t K  = 1024;
    constexpr index_t Y  = 3;
    constexpr index_t X  = 3;

    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<1, 1>;
    using RightPads = Sequence<1, 1>;
#elif 0
    // 1x1 filter, 8x8 image
    constexpr index_t N  = 256;
    constexpr index_t C  = 1024;
    constexpr index_t HI = 8;
    constexpr index_t WI = 8;
    constexpr index_t K  = 1024;
    constexpr index_t Y  = 1;
    constexpr index_t X  = 1;

    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
#elif 0
    // 1x1 filter, 7x7 image
    constexpr index_t N  = 128;
    constexpr index_t C  = 1024;
    constexpr index_t HI = 7;
    constexpr index_t WI = 7;
    constexpr index_t K  = 1024;
    constexpr index_t Y  = 1;
    constexpr index_t X  = 1;

    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
#elif 0
    // 1x1 filter, 14x14 image
    constexpr index_t N  = 128;
    constexpr index_t C  = 512;
    constexpr index_t HI = 14;
    constexpr index_t WI = 14;
    constexpr index_t K  = 128;
    constexpr index_t Y  = 1;
    constexpr index_t X  = 1;

    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
#elif 0
    // 1x1 filter, 28x28 image
    constexpr index_t N  = 128;
    constexpr index_t C  = 128;
    constexpr index_t HI = 28;
    constexpr index_t WI = 28;
    constexpr index_t K  = 128;
    constexpr index_t Y  = 1;
    constexpr index_t X  = 1;

    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
#elif 0
    // 1x1 filter, 17x17 input
    constexpr index_t N  = 128;
    constexpr index_t C  = 1024;
    constexpr index_t HI = 17;
    constexpr index_t WI = 17;
    constexpr index_t K  = 1024;
    constexpr index_t Y  = 1;
    constexpr index_t X  = 1;

    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
#elif 0
    // 5x5 filter, 2x2 pad, 7x7 input
    constexpr index_t N  = 128;
    constexpr index_t C  = 1024;
    constexpr index_t HI = 7;
    constexpr index_t WI = 7;
    constexpr index_t K  = 1024;
    constexpr index_t Y  = 5;
    constexpr index_t X  = 5;

    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<2, 2>;
    using RightPads = Sequence<2, 2>;
#elif 0
    // 1x7 filter, 0x3 pad, 17x17 input
    constexpr index_t N  = 128;
    constexpr index_t C  = 128;
    constexpr index_t HI = 17;
    constexpr index_t WI = 17;
    constexpr index_t K  = 128;
    constexpr index_t Y  = 1;
    constexpr index_t X  = 7;

    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<0, 3>;
    using RightPads = Sequence<0, 3>;
#elif 0
    // 7x1 filter, 3x0 pad, 17x17 input
    constexpr index_t N  = 128;
    constexpr index_t C  = 1024;
    constexpr index_t HI = 17;
    constexpr index_t WI = 17;
    constexpr index_t K  = 1024;
    constexpr index_t Y  = 7;
    constexpr index_t X  = 1;

    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<3, 0>;
    using RightPads = Sequence<3, 0>;
#elif 1
    // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
    constexpr index_t N  = 128;
    constexpr index_t C  = 128;
    constexpr index_t HI = 35;
    constexpr index_t WI = 35;
    constexpr index_t K  = 1024;
    constexpr index_t Y  = 3;
    constexpr index_t X  = 3;

    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
#endif

    constexpr auto in_nchw_desc  = make_native_tensor_descriptor_packed(Sequence<N, C, HI, WI>{});
    constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence<K, C, Y, X>{});
    constexpr auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor(
        in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});

    ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
    ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
    ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
    print_sequence("LeftPads", LeftPads{});
    print_sequence("LeftPads", LeftPads{});
    print_sequence("RightPads", RightPads{});
    print_sequence("ConvStrides", ConvStrides{});
    print_sequence("ConvDilations", ConvDilations{});

    Tensor<float> in_nchw_device(make_TensorDescriptor(in_nchw_desc));
    Tensor<float> in_nchw_host(make_TensorDescriptor(in_nchw_desc));
    Tensor<float> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
    Tensor<float> out_nkhw(make_TensorDescriptor(out_nkhw_desc));

    std::size_t num_thread = std::thread::hardware_concurrency();

    if(argc != 3)
    {
        printf("arg1: do_verification, arg2: nrepeat\n");
        exit(1);
    }

    bool do_verification = atoi(argv[1]);
    std::size_t nrepeat  = atoi(argv[2]);

    if(do_verification)
    {
#if 0
        wei_kcyx.GenerateTensorValue(GeneratorTensor_1{1}, num_thread);
        out_nkhw.GenerateTensorValue(GeneratorTensor_1{1}, num_thread);
#else
        wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        out_nkhw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
#endif
    }

#if 0
    device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw
#elif 0
    device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw
#elif 0
    device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw
#elif 0
    device_convolution_backward_data_implicit_gemm_v3r1_nchw_kcyx_nkhw
#elif 1
    device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw
#endif
    (in_nchw_desc,
     in_nchw_device,
     wei_kcyx_desc,
     wei_kcyx,
     out_nkhw_desc,
     out_nkhw,
     ConvStrides{},
     ConvDilations{},
     LeftPads{},
     RightPads{},
     nrepeat);

    if(do_verification)
    {
        host_direct_convolution_backward_data(in_nchw_host,
                                              wei_kcyx,
                                              out_nkhw,
                                              ConvStrides{},
                                              ConvDilations{},
                                              LeftPads{},
                                              RightPads{});

        check_error(in_nchw_host, in_nchw_device);

#if 0
        LogRange(std::cout << "out_nkhw : ", out_nkhw.mData, ",") << std::endl;
        LogRange(std::cout << "wei_kcyx : ", wei_kcyx.mData, ",") << std::endl;
        LogRange(std::cout << "in_nchw_host : ", in_nchw_host.mData, ",") << std::endl;
        LogRange(std::cout << "in_nchw_device : ", in_nchw_device.mData, ",") << std::endl;
#endif
    }
}
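For context on the verification path in the driver above: when do_verification is set, the result written by the device kernel is compared against host_direct_convolution_backward_data via check_error. The sketch below is only a mental model of what such a host reference computes, written as plain nested loops over NCHW / KCYX / NKHW buffers under the usual stride, dilation and padding conventions; it is not the implementation shipped in host_conv_bwd_data.hpp, and all names in it are chosen for this note. The implicit-GEMM kernels selected by the #if/#elif chain reorganize this same computation for the GPU.

#include <vector>

// Naive backward-data convolution sketch (assumed semantics, not repository code).
// in_grad is N*C*HI*WI and must be zero-initialized by the caller; wei is K*C*Y*X;
// out_grad is N*K*HO*WO. Each output-gradient element is scattered back through the
// filter taps, and taps that fall outside the (padded) input are skipped.
void naive_conv_bwd_data(std::vector<float>& in_grad,
                         const std::vector<float>& wei,
                         const std::vector<float>& out_grad,
                         int N, int C, int HI, int WI,
                         int K, int Y, int X, int HO, int WO,
                         int stride_h, int stride_w,
                         int dilation_h, int dilation_w,
                         int left_pad_h, int left_pad_w)
{
    for(int n = 0; n < N; ++n)
        for(int k = 0; k < K; ++k)
            for(int ho = 0; ho < HO; ++ho)
                for(int wo = 0; wo < WO; ++wo)
                    for(int c = 0; c < C; ++c)
                        for(int y = 0; y < Y; ++y)
                            for(int x = 0; x < X; ++x)
                            {
                                const int hi = ho * stride_h + y * dilation_h - left_pad_h;
                                const int wi = wo * stride_w + x * dilation_w - left_pad_w;

                                if(hi < 0 || hi >= HI || wi < 0 || wi >= WI)
                                    continue;

                                in_grad[((n * C + c) * HI + hi) * WI + wi] +=
                                    wei[((k * C + c) * Y + y) * X + x] *
                                    out_grad[((n * K + k) * HO + ho) * WO + wo];
                            }
}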
driver/src/conv_driver.cpp
View file @ 506a823a
...
@@ -15,9 +15,6 @@
 #include "device_tensor.hpp"
 #include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
-//#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_fp16.hpp"
-//#include "device_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
-//#include "device_convolution_implicit_gemm_v4r4_xdlops_fp16_nchw_kcyx_nkhw.hpp"

 int main(int argc, char* argv[])
 {
...
@@ -570,31 +567,6 @@ int main(int argc, char* argv[])
     }

 #if 0
-    device_convolution_direct_v2_nchw_kcyx_nkhw
-    (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
-    (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded
-    (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, LeftPads{}, RightPads{}, nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
-    (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
-    (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
-    (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 1
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                          in_nchw,
                                                          wei_kcyx_desc,
...
@@ -618,30 +590,6 @@ int main(int argc, char* argv[])
                                                          LeftPads{},
                                                          RightPads{},
                                                          nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_fp16
-    (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device,
-     ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{}, nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v4r4_xdlops_fp16_nchw_kcyx_nkhw
-    (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device,
-     ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{}, nrepeat);
 #endif

     if(do_verification)
...
external/include/half.hpp
View file @ 506a823a
This source diff could not be displayed because it is too large. You can view the blob instead.