Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
e80fbbdd
Commit
e80fbbdd
authored
Feb 14, 2019
by
Chao Liu
Browse files
refactor build, clean up
parent
28354a0f
Changes
29
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
486 additions
and
396 deletions
+486
-396
CMakeLists.txt
CMakeLists.txt
+6
-13
driver/CMakeLists.txt
driver/CMakeLists.txt
+2
-2
driver/conv.cu
driver/conv.cu
+4
-5
driver/device_direct_convolution_1.cuh
driver/device_direct_convolution_1.cuh
+32
-53
driver/device_direct_convolution_2.cuh
driver/device_direct_convolution_2.cuh
+33
-53
driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
+37
-58
driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh
...ice_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh
+18
-23
driver/device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh
driver/device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh
+94
-0
driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh
driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh
+33
-53
driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
+45
-50
driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
+36
-42
src/CMakeLists.txt
src/CMakeLists.txt
+15
-11
src/device.cu
src/device.cu
+79
-0
src/include/blockwise_2d_tensor_op.cuh
src/include/blockwise_2d_tensor_op.cuh
+0
-2
src/include/blockwise_4d_tensor_op.cuh
src/include/blockwise_4d_tensor_op.cuh
+6
-6
src/include/device.hpp
src/include/device.hpp
+30
-0
src/include/gridwise_direct_convolution_1.cuh
src/include/gridwise_direct_convolution_1.cuh
+3
-6
src/include/gridwise_direct_convolution_2.cuh
src/include/gridwise_direct_convolution_2.cuh
+3
-6
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
...e/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
+3
-6
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh
...ise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh
+7
-7
No files found.
CMakeLists.txt
View file @
e80fbbdd
cmake_minimum_required
(
VERSION
3.9
)
cmake_minimum_required
(
VERSION
2.8.3
)
project
(
convolution
LANGUAGES CXX CUDA
)
project
(
modular_
convolution
)
#c++
#c++
enable_language
(
CXX
)
set
(
CMAKE_CXX_STANDARD 14
)
set
(
CMAKE_CXX_STANDARD_REQUIRED ON
)
message
(
"CMAKE_CXX_COMPILER_ID:
${
CMAKE_CXX_COMPILER_ID
}
"
)
message
(
"CMAKE_CXX_COMPILER_ID:
${
CMAKE_CXX_COMPILER_ID
}
"
)
add_compile_options
(
-std=c++14
)
#boost
#boost
find_package
(
Boost REQUIRED
)
find_package
(
Boost REQUIRED
)
...
@@ -28,20 +30,11 @@ if( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") )
...
@@ -28,20 +30,11 @@ if( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") )
link_libraries
(
${
OpenMP_pthread_LIBRARY
}
)
link_libraries
(
${
OpenMP_pthread_LIBRARY
}
)
endif
(
NOT
(
${
CMAKE_CXX_COMPILER_ID
}
STREQUAL
"AppleClang"
)
)
endif
(
NOT
(
${
CMAKE_CXX_COMPILER_ID
}
STREQUAL
"AppleClang"
)
)
#python
find_package
(
PythonLibs 3 REQUIRED
)
message
(
"PYTHON_INCLUDE_DIRS:
${
PYTHON_INCLUDE_DIRS
}
"
)
message
(
"PYTHON_LIBRARIES:
${
PYTHON_LIBRARIES
}
"
)
include_directories
(
BEFORE
${
PYTHON_INCLUDE_DIRS
}
)
link_libraries
(
${
PYTHON_LIBRARIES
}
)
#cuda
#cuda
enable_language
(
CUDA
)
include_directories
(
BEFORE
${
CUDA_COMMON_INCLUDE_DIR
}
)
include_directories
(
BEFORE
${
CUDA_COMMON_INCLUDE_DIR
}
)
#
#
include_directories
(
BEFORE src/include
)
include_directories
(
BEFORE src/include
)
add_subdirectory
(
src
)
add_subdirectory
(
src
)
add_subdirectory
(
driver
)
add_subdirectory
(
driver
)
driver/CMakeLists.txt
View file @
e80fbbdd
add_executable
(
conv
EXCLUDE_FROM_ALL
conv.cu
)
add_executable
(
conv conv.cu
)
target_link_libraries
(
conv
convolution
)
target_link_libraries
(
conv
tensor device
)
driver/conv.cu
View file @
e80fbbdd
...
@@ -2,13 +2,12 @@
...
@@ -2,13 +2,12 @@
#include <numeric>
#include <numeric>
#include <initializer_list>
#include <initializer_list>
#include <cstdlib>
#include <cstdlib>
#include "nvToolsExt.h"
#include "tensor.hpp"
#include "tensor.hpp"
#include "ConstantTensorDescriptor.cuh"
#include "ConstantTensorDescriptor.cuh"
#include "conv_common.cuh"
#include "conv_common.cuh"
#include "device_direct_convolution_1.cuh"
#include "device_direct_convolution_1.cuh"
#include "device_direct_convolution_2.cuh"
#include "device_direct_convolution_2.cuh"
#include "device_implicit_gemm_convolution_1_nchw_kcsr.cuh"
#include "device_implicit_gemm_convolution_1_nchw_kcsr
_nkhw
.cuh"
#include "device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh"
#include "device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh"
#include "device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh"
#include "device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh"
#include "device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh"
#include "device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh"
...
@@ -590,7 +589,7 @@ int main()
...
@@ -590,7 +589,7 @@ int main()
#elif
0
#elif
0
device_direct_convolution_2
device_direct_convolution_2
#elif 0
#elif 0
device_implicit_gemm_convolution_1_nchw_kcsr
device_implicit_gemm_convolution_1_nchw_kcsr
_nkhw
#elif 0
#elif 0
device_implicit_gemm_convolution_1_nchw_srck_nkhw
device_implicit_gemm_convolution_1_nchw_srck_nkhw
#elif 0
#elif 0
...
@@ -602,7 +601,7 @@ int main()
...
@@ -602,7 +601,7 @@ int main()
#endif
#endif
(
in_nchw_desc
,
in_nchw
,
wei_kcsr_desc
,
wei_kcsr
,
out_nkhw_desc
,
out_nkhw_device
,
nrepeat
);
(
in_nchw_desc
,
in_nchw
,
wei_kcsr_desc
,
wei_kcsr
,
out_nkhw_desc
,
out_nkhw_device
,
nrepeat
);
#elif
1
#elif
0
device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded
(
in_nchw_desc
,
device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded
(
in_nchw_desc
,
in_nchw
,
in_nchw
,
wei_kcsr_desc
,
wei_kcsr_desc
,
...
@@ -614,7 +613,7 @@ int main()
...
@@ -614,7 +613,7 @@ int main()
nrepeat
);
nrepeat
);
#endif
#endif
#if
0
#if
1
if
(
S
==
3
&&
R
==
3
)
if
(
S
==
3
&&
R
==
3
)
{
{
host_winograd_3x3_convolution
(
in_nchw
,
wei_kcsr
,
out_nkhw_host
,
lower_pads
,
upper_pads
);
host_winograd_3x3_convolution
(
in_nchw
,
wei_kcsr
,
out_nkhw_host
,
lower_pads
,
upper_pads
);
...
...
driver/device_direct_convolution_1.cuh
View file @
e80fbbdd
#pragma once
#pragma once
#include "gridwise_direct_convolution_1.cuh"
#include <unistd.h>
#include <unistd.h>
#include "device.hpp"
#include "gridwise_direct_convolution_1.cuh"
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_direct_convolution_1
(
InDesc
,
void
device_direct_convolution_1
(
InDesc
,
...
@@ -32,6 +33,7 @@ void device_direct_convolution_1(InDesc,
...
@@ -32,6 +33,7 @@ void device_direct_convolution_1(InDesc,
constexpr
auto
out_desc
=
OutDesc
{};
constexpr
auto
out_desc
=
OutDesc
{};
#if 1
#if 1
// 3x3, 34x34
constexpr
unsigned
OutTileSizeH
=
2
;
constexpr
unsigned
OutTileSizeH
=
2
;
constexpr
unsigned
OutTileSizeW
=
2
;
constexpr
unsigned
OutTileSizeW
=
2
;
constexpr
unsigned
NPerBlock
=
2
;
constexpr
unsigned
NPerBlock
=
2
;
...
@@ -45,20 +47,6 @@ void device_direct_convolution_1(InDesc,
...
@@ -45,20 +47,6 @@ void device_direct_convolution_1(InDesc,
constexpr
unsigned
CPerThread
=
2
;
constexpr
unsigned
CPerThread
=
2
;
constexpr
unsigned
BlockSize
=
128
;
constexpr
unsigned
BlockSize
=
128
;
#elif 1
constexpr
unsigned
OutTileSizeH
=
2
;
constexpr
unsigned
OutTileSizeW
=
2
;
constexpr
unsigned
NPerBlock
=
2
;
constexpr
unsigned
KPerBlock
=
16
;
constexpr
unsigned
CPerBlock
=
2
;
constexpr
unsigned
YPerBlock
=
2
;
constexpr
unsigned
XPerBlock
=
27
;
constexpr
unsigned
NPerThread
=
2
;
constexpr
unsigned
KPerThread
=
4
;
constexpr
unsigned
CPerThread
=
2
;
constexpr
unsigned
BlockSize
=
216
;
#endif
#endif
constexpr
unsigned
GridSize
=
(
out_desc
.
GetLength
(
I0
)
/
NPerBlock
)
*
constexpr
unsigned
GridSize
=
(
out_desc
.
GetLength
(
I0
)
/
NPerBlock
)
*
...
@@ -73,13 +61,7 @@ void device_direct_convolution_1(InDesc,
...
@@ -73,13 +61,7 @@ void device_direct_convolution_1(InDesc,
for
(
unsigned
i
=
0
;
i
<
nrepeat
;
++
i
)
for
(
unsigned
i
=
0
;
i
<
nrepeat
;
++
i
)
{
{
cudaEvent_t
start
,
stop
;
const
void
*
f
=
reinterpret_cast
<
const
void
*>
(
gridwise_direct_convolution_1
<
T
,
float
elapsedTime
;
cudaEventCreate
(
&
start
);
cudaEventRecord
(
start
,
0
);
gridwise_direct_convolution_1
<
T
,
InDesc
,
InDesc
,
WeiDesc
,
WeiDesc
,
OutDesc
,
OutDesc
,
...
@@ -94,24 +76,21 @@ void device_direct_convolution_1(InDesc,
...
@@ -94,24 +76,21 @@ void device_direct_convolution_1(InDesc,
KPerThread
,
KPerThread
,
CPerThread
,
CPerThread
,
BlockSize
,
BlockSize
,
GridSize
>
GridSize
>
);
<<<
grid_dim
,
block_dim
>>>
(
InDesc
{},
static_cast
<
T
*>
(
in_device_buf
.
GetDeviceBuffer
()),
T
*
in_dev_ptr
=
static_cast
<
T
*>
(
in_device_buf
.
GetDeviceBuffer
());
WeiDesc
{},
T
*
wei_dev_ptr
=
static_cast
<
T
*>
(
wei_device_buf
.
GetDeviceBuffer
());
static_cast
<
T
*>
(
wei_device_buf
.
GetDeviceBuffer
()),
T
*
out_dev_ptr
=
static_cast
<
T
*>
(
out_device_buf
.
GetDeviceBuffer
());
OutDesc
{},
static_cast
<
T
*>
(
out_device_buf
.
GetDeviceBuffer
()));
void
*
args
[]
=
{
&
in_dev_ptr
,
&
wei_dev_ptr
,
&
out_dev_ptr
};
cudaEventCreate
(
&
stop
);
float
time
=
0
;
cudaEventRecord
(
stop
,
0
);
cudaEventSynchronize
(
stop
);
launch_kernel
(
f
,
grid_dim
,
block_dim
,
args
,
time
);
cudaEventElapsedTime
(
&
elapsedTime
,
start
,
stop
);
printf
(
"Elapsed time : %f ms
\n
"
,
time
);
printf
(
"Elapsed time : %f ms
\n
"
,
elapsedTime
);
usleep
(
std
::
min
(
time
*
1000
,
float
(
10000
)));
usleep
(
10000
);
}
}
checkCudaErrors
(
cudaGetLastError
());
out_device_buf
.
FromDevice
(
out
.
mData
.
data
());
out_device_buf
.
FromDevice
(
out
.
mData
.
data
());
}
}
driver/device_direct_convolution_2.cuh
View file @
e80fbbdd
#pragma once
#pragma once
#include "gridwise_direct_convolution_2.cuh"
#include <unistd.h>
#include <unistd.h>
#include "device.hpp"
#include "gridwise_direct_convolution_2.cuh"
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_direct_convolution_2
(
InDesc
,
void
device_direct_convolution_2
(
InDesc
,
...
@@ -32,6 +33,7 @@ void device_direct_convolution_2(InDesc,
...
@@ -32,6 +33,7 @@ void device_direct_convolution_2(InDesc,
constexpr
auto
out_desc
=
OutDesc
{};
constexpr
auto
out_desc
=
OutDesc
{};
#if 1
#if 1
// 3x3, 34x34, 128 thread
constexpr
unsigned
OutTileSizeH
=
2
;
constexpr
unsigned
OutTileSizeH
=
2
;
constexpr
unsigned
OutTileSizeW
=
2
;
constexpr
unsigned
OutTileSizeW
=
2
;
constexpr
unsigned
NPerBlock
=
2
;
constexpr
unsigned
NPerBlock
=
2
;
...
@@ -46,20 +48,7 @@ void device_direct_convolution_2(InDesc,
...
@@ -46,20 +48,7 @@ void device_direct_convolution_2(InDesc,
constexpr
unsigned
BlockSize
=
128
;
constexpr
unsigned
BlockSize
=
128
;
#elif 0
#elif 0
constexpr
unsigned
OutTileSizeH
=
2
;
// 3x3, 34x34, 256 thread
constexpr
unsigned
OutTileSizeW
=
2
;
constexpr
unsigned
NPerBlock
=
2
;
constexpr
unsigned
KPerBlock
=
32
;
constexpr
unsigned
CPerBlock
=
4
;
constexpr
unsigned
YPerBlock
=
1
;
constexpr
unsigned
XPerBlock
=
27
;
constexpr
unsigned
NPerThread
=
2
;
constexpr
unsigned
KPerThread
=
4
;
constexpr
unsigned
CPerThread
=
2
;
constexpr
unsigned
BlockSize
=
216
;
#elif 0
constexpr
unsigned
OutTileSizeH
=
2
;
constexpr
unsigned
OutTileSizeH
=
2
;
constexpr
unsigned
OutTileSizeW
=
2
;
constexpr
unsigned
OutTileSizeW
=
2
;
constexpr
unsigned
NPerBlock
=
2
;
constexpr
unsigned
NPerBlock
=
2
;
...
@@ -87,13 +76,7 @@ void device_direct_convolution_2(InDesc,
...
@@ -87,13 +76,7 @@ void device_direct_convolution_2(InDesc,
for
(
unsigned
i
=
0
;
i
<
nrepeat
;
++
i
)
for
(
unsigned
i
=
0
;
i
<
nrepeat
;
++
i
)
{
{
cudaEvent_t
start
,
stop
;
const
void
*
f
=
reinterpret_cast
<
const
void
*>
(
gridwise_direct_convolution_2
<
T
,
float
elapsedTime
;
cudaEventCreate
(
&
start
);
cudaEventRecord
(
start
,
0
);
gridwise_direct_convolution_2
<
T
,
InDesc
,
InDesc
,
WeiDesc
,
WeiDesc
,
OutDesc
,
OutDesc
,
...
@@ -108,24 +91,21 @@ void device_direct_convolution_2(InDesc,
...
@@ -108,24 +91,21 @@ void device_direct_convolution_2(InDesc,
KPerThread
,
KPerThread
,
CPerThread
,
CPerThread
,
BlockSize
,
BlockSize
,
GridSize
>
GridSize
>
);
<<<
grid_dim
,
block_dim
>>>
(
InDesc
{},
static_cast
<
T
*>
(
in_device_buf
.
GetDeviceBuffer
()),
T
*
in_dev_ptr
=
static_cast
<
T
*>
(
in_device_buf
.
GetDeviceBuffer
());
WeiDesc
{},
T
*
wei_dev_ptr
=
static_cast
<
T
*>
(
wei_device_buf
.
GetDeviceBuffer
());
static_cast
<
T
*>
(
wei_device_buf
.
GetDeviceBuffer
()),
T
*
out_dev_ptr
=
static_cast
<
T
*>
(
out_device_buf
.
GetDeviceBuffer
());
OutDesc
{},
static_cast
<
T
*>
(
out_device_buf
.
GetDeviceBuffer
()));
void
*
args
[]
=
{
&
in_dev_ptr
,
&
wei_dev_ptr
,
&
out_dev_ptr
};
cudaEventCreate
(
&
stop
);
float
time
=
0
;
cudaEventRecord
(
stop
,
0
);
cudaEventSynchronize
(
stop
);
launch_kernel
(
f
,
grid_dim
,
block_dim
,
args
,
time
);
cudaEventElapsedTime
(
&
elapsedTime
,
start
,
stop
);
printf
(
"Elapsed time : %f ms
\n
"
,
time
);
printf
(
"Elapsed time : %f ms
\n
"
,
elapsedTime
);
usleep
(
std
::
min
(
time
*
1000
,
float
(
10000
)));
usleep
(
10000
);
}
}
checkCudaErrors
(
cudaGetLastError
());
out_device_buf
.
FromDevice
(
out
.
mData
.
data
());
out_device_buf
.
FromDevice
(
out
.
mData
.
data
());
}
}
driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
View file @
e80fbbdd
#pragma once
#pragma once
#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh"
#include <unistd.h>
#include <unistd.h>
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh"
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_implicit_gemm_convolution_1_chwn_csrk_khwn
(
InDesc
,
void
device_implicit_gemm_convolution_1_chwn_csrk_khwn
(
InDesc
,
...
@@ -73,21 +74,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
...
@@ -73,21 +74,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
wei_csrk_device_buf
.
ToDevice
(
wei_csrk
.
mData
.
data
());
wei_csrk_device_buf
.
ToDevice
(
wei_csrk
.
mData
.
data
());
out_khwn_device_buf
.
ToDevice
(
out_khwn
.
mData
.
data
());
out_khwn_device_buf
.
ToDevice
(
out_khwn
.
mData
.
data
());
#if 0
#if 1
constexpr unsigned NPerBlock = 1;
constexpr unsigned KPerBlock = 1;
constexpr unsigned CPerBlock = 1;
constexpr unsigned HoPerBlock = 2;
constexpr unsigned WoPerBlock = 4;
constexpr unsigned NPerThread = 1;
constexpr unsigned KPerThread = 1;
constexpr unsigned CPerThread = 1;
constexpr unsigned HoPerThread = 1;
constexpr unsigned WoPerThread = 1;
constexpr unsigned BlockSize = 8;
#elif
0
// for 3x3, 34x34 | 3x3 58x58, NKC = 64, 64, 256
// for 3x3, 34x34 | 3x3 58x58, NKC = 64, 64, 256
constexpr
unsigned
NPerBlock
=
16
;
constexpr
unsigned
NPerBlock
=
16
;
constexpr
unsigned
KPerBlock
=
64
;
constexpr
unsigned
KPerBlock
=
64
;
...
@@ -214,12 +201,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
...
@@ -214,12 +201,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
for
(
unsigned
i
=
0
;
i
<
nrepeat
;
++
i
)
for
(
unsigned
i
=
0
;
i
<
nrepeat
;
++
i
)
{
{
cudaEvent_t
start
,
stop
;
const
void
*
f
=
reinterpret_cast
<
const
void
*>
(
float
elapsedTime
;
cudaEventCreate
(
&
start
);
cudaEventRecord
(
start
,
0
);
gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn
<
GridSize
,
gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn
<
GridSize
,
BlockSize
,
BlockSize
,
T
,
T
,
...
@@ -239,25 +221,22 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
...
@@ -239,25 +221,22 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
WeiBlockCopyThreadPerDim0
,
WeiBlockCopyThreadPerDim0
,
WeiBlockCopyThreadPerDim1
,
WeiBlockCopyThreadPerDim1
,
InBlockCopyDataPerRead
,
InBlockCopyDataPerRead
,
WeiBlockCopyDataPerRead
>
WeiBlockCopyDataPerRead
>
);
<<<
grid_dim
,
block_dim
>>>
(
in_chwn_desc
,
static_cast
<
T
*>
(
in_chwn_device_buf
.
GetDeviceBuffer
()),
T
*
in_dev_ptr
=
static_cast
<
T
*>
(
in_chwn_device_buf
.
GetDeviceBuffer
());
wei_csrk_desc
,
T
*
wei_dev_ptr
=
static_cast
<
T
*>
(
wei_csrk_device_buf
.
GetDeviceBuffer
());
static_cast
<
T
*>
(
wei_csrk_device_buf
.
GetDeviceBuffer
()),
T
*
out_dev_ptr
=
static_cast
<
T
*>
(
out_khwn_device_buf
.
GetDeviceBuffer
());
out_khwn_desc
,
static_cast
<
T
*>
(
out_khwn_device_buf
.
GetDeviceBuffer
()));
void
*
args
[]
=
{
&
in_dev_ptr
,
&
wei_dev_ptr
,
&
out_dev_ptr
};
cudaEventCreate
(
&
stop
);
float
time
=
0
;
cudaEventRecord
(
stop
,
0
);
cudaEventSynchronize
(
stop
);
launch_kernel
(
f
,
grid_dim
,
block_dim
,
args
,
time
);
cudaEventElapsedTime
(
&
elapsedTime
,
start
,
stop
);
printf
(
"Elapsed time : %f ms
\n
"
,
time
);
printf
(
"Elapsed time : %f ms
\n
"
,
elapsedTime
);
usleep
(
std
::
min
(
time
*
1000
,
float
(
10000
)));
usleep
(
std
::
min
(
elapsedTime
*
1000
,
float
(
10000
)));
}
}
checkCudaErrors
(
cudaGetLastError
());
out_khwn_device_buf
.
FromDevice
(
out_khwn
.
mData
.
data
());
out_khwn_device_buf
.
FromDevice
(
out_khwn
.
mData
.
data
());
// reorder output
// reorder output
...
...
driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh
View file @
e80fbbdd
#pragma once
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh"
#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh"
#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh"
#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh"
#include <unistd.h>
#include <algorithm>
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
,
class
LowerPads
,
class
UpperPads
>
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
,
class
LowerPads
,
class
UpperPads
>
void
device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded
(
InDesc
,
void
device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded
(
InDesc
,
...
@@ -172,7 +172,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
...
@@ -172,7 +172,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
constexpr
unsigned
WoPerThread
=
1
;
constexpr
unsigned
WoPerThread
=
1
;
constexpr
unsigned
BlockSize
=
128
;
constexpr
unsigned
BlockSize
=
128
;
#elif
0
#elif
1
// 3x3 56x56, NKC = 16,256,128, with padding
// 3x3 56x56, NKC = 16,256,128, with padding
// 3x3 28x28, NKC = 16,512,256, with padding
// 3x3 28x28, NKC = 16,512,256, with padding
// 3x3 20x84, NKC = 16,256,256, with padding
// 3x3 20x84, NKC = 16,256,256, with padding
...
@@ -222,7 +222,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
...
@@ -222,7 +222,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
constexpr
unsigned
WoPerThread
=
1
;
constexpr
unsigned
WoPerThread
=
1
;
constexpr
unsigned
BlockSize
=
128
;
constexpr
unsigned
BlockSize
=
128
;
#elif
1
#elif
0
// for 1x1, 28x28
// for 1x1, 28x28
constexpr
unsigned
NPerBlock
=
16
;
constexpr
unsigned
NPerBlock
=
16
;
constexpr
unsigned
KPerBlock
=
128
;
constexpr
unsigned
KPerBlock
=
128
;
...
@@ -253,13 +253,8 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
...
@@ -253,13 +253,8 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
for
(
unsigned
i
=
0
;
i
<
nrepeat
;
++
i
)
for
(
unsigned
i
=
0
;
i
<
nrepeat
;
++
i
)
{
{
cudaEvent_t
start
,
stop
;
const
void
*
f
=
reinterpret_cast
<
const
void
*>
(
float
elapsedTime
;
#if 0
cudaEventCreate
(
&
start
);
cudaEventRecord
(
start
,
0
);
#if 1
gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded
gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded
#elif
1
#elif
1
gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline
gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline
...
@@ -283,22 +278,22 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
...
@@ -283,22 +278,22 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
HoPerThread
,
HoPerThread
,
WoPerThread
,
WoPerThread
,
WeiBlockCopyThreadPerDim0
,
WeiBlockCopyThreadPerDim0
,
WeiBlockCopyThreadPerDim1
>
WeiBlockCopyThreadPerDim1
>
);
<<<
grid_dim
,
block_dim
>>>
(
static_cast
<
T
*>
(
in_chwn_device_buf
.
GetDeviceBuffer
()),
static_cast
<
T
*>
(
wei_csrk_device_buf
.
GetDeviceBuffer
()),
T
*
in_dev_ptr
=
static_cast
<
T
*>
(
in_chwn_device_buf
.
GetDeviceBuffer
());
static_cast
<
T
*>
(
out_khwn_device_buf
.
GetDeviceBuffer
()));
T
*
wei_dev_ptr
=
static_cast
<
T
*>
(
wei_csrk_device_buf
.
GetDeviceBuffer
());
T
*
out_dev_ptr
=
static_cast
<
T
*>
(
out_khwn_device_buf
.
GetDeviceBuffer
());
void
*
args
[]
=
{
&
in_dev_ptr
,
&
wei_dev_ptr
,
&
out_dev_ptr
};
cudaEventCreate
(
&
stop
);
float
time
=
0
;
cudaEventRecord
(
stop
,
0
);
cudaEventSynchronize
(
stop
);
cudaEventElapsedTime
(
&
elapsedTime
,
start
,
stop
);
launch_kernel
(
f
,
grid_dim
,
block_dim
,
args
,
time
);
printf
(
"Elapsed time : %f ms
\n
"
,
elapsedTime
);
usleep
(
std
::
min
(
elapsedTime
*
1000
,
float
(
10000
)));
printf
(
"Elapsed time : %f ms
\n
"
,
time
);
usleep
(
std
::
min
(
time
*
1000
,
float
(
10000
)));
}
}
checkCudaErrors
(
cudaGetLastError
());
out_khwn_device_buf
.
FromDevice
(
out_khwn
.
mData
.
data
());
out_khwn_device_buf
.
FromDevice
(
out_khwn
.
mData
.
data
());
// reorder output
// reorder output
...
...
driver/device_implicit_gemm_convolution_1_nchw_kcsr.cuh
→
driver/device_implicit_gemm_convolution_1_nchw_kcsr
_nkhw
.cuh
View file @
e80fbbdd
#pragma once
#pragma once
#include "gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh"
#include <unistd.h>
#include <unistd.h>
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh"
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_implicit_gemm_convolution_1_nchw_kcsr
(
InDesc
,
void
device_implicit_gemm_convolution_1_nchw_kcsr
_nkhw
(
InDesc
,
const
Tensor
<
T
>&
in
,
const
Tensor
<
T
>&
in
,
WeiDesc
,
WeiDesc
,
const
Tensor
<
T
>&
wei
,
const
Tensor
<
T
>&
wei
,
...
@@ -31,20 +32,8 @@ void device_implicit_gemm_convolution_1_nchw_kcsr(InDesc,
...
@@ -31,20 +32,8 @@ void device_implicit_gemm_convolution_1_nchw_kcsr(InDesc,
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
out_desc
=
OutDesc
{};
constexpr
auto
out_desc
=
OutDesc
{};
#if 0
#if 1
constexpr unsigned NPerBlock = 1;
// 3x3, 34x34
constexpr unsigned KPerBlock = 1;
constexpr unsigned CPerBlock = 1;
constexpr unsigned HoPerBlock = 2;
constexpr unsigned WoPerBlock = 32;
constexpr unsigned KPerThread = 1;
constexpr unsigned CPerThread = 1;
constexpr unsigned HoPerThread = 2;
constexpr unsigned WoPerThread = 2;
constexpr unsigned BlockSize = 16;
#elif
1
constexpr
unsigned
NPerBlock
=
1
;
constexpr
unsigned
NPerBlock
=
1
;
constexpr
unsigned
KPerBlock
=
64
;
constexpr
unsigned
KPerBlock
=
64
;
constexpr
unsigned
CPerBlock
=
2
;
constexpr
unsigned
CPerBlock
=
2
;
...
@@ -56,19 +45,6 @@ void device_implicit_gemm_convolution_1_nchw_kcsr(InDesc,
...
@@ -56,19 +45,6 @@ void device_implicit_gemm_convolution_1_nchw_kcsr(InDesc,
constexpr
unsigned
HoPerThread
=
2
;
constexpr
unsigned
HoPerThread
=
2
;
constexpr
unsigned
WoPerThread
=
2
;
constexpr
unsigned
WoPerThread
=
2
;
constexpr
unsigned
BlockSize
=
128
;
#elif 0
constexpr
unsigned
NPerBlock
=
1
;
constexpr
unsigned
KPerBlock
=
64
;
constexpr
unsigned
CPerBlock
=
4
;
constexpr
unsigned
HoPerBlock
=
4
;
constexpr
unsigned
WoPerBlock
=
32
;
constexpr
unsigned
KPerThread
=
8
;
constexpr
unsigned
CPerThread
=
2
;
constexpr
unsigned
HoPerThread
=
2
;
constexpr
unsigned
WoPerThread
=
4
;
constexpr
unsigned
BlockSize
=
128
;
constexpr
unsigned
BlockSize
=
128
;
#endif
#endif
...
@@ -83,13 +59,8 @@ void device_implicit_gemm_convolution_1_nchw_kcsr(InDesc,
...
@@ -83,13 +59,8 @@ void device_implicit_gemm_convolution_1_nchw_kcsr(InDesc,
for
(
unsigned
i
=
0
;
i
<
nrepeat
;
++
i
)
for
(
unsigned
i
=
0
;
i
<
nrepeat
;
++
i
)
{
{
cudaEvent_t
start
,
stop
;
const
void
*
f
=
reinterpret_cast
<
const
void
*>
(
float
elapsedTime
;
gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw
<
GridSize
,
cudaEventCreate
(
&
start
);
cudaEventRecord
(
start
,
0
);
gridwise_implicit_gemm_convolution_1_nchw_kcsr
<
GridSize
,
BlockSize
,
BlockSize
,
T
,
T
,
InDesc
,
InDesc
,
...
@@ -103,24 +74,21 @@ void device_implicit_gemm_convolution_1_nchw_kcsr(InDesc,
...
@@ -103,24 +74,21 @@ void device_implicit_gemm_convolution_1_nchw_kcsr(InDesc,
KPerThread
,
KPerThread
,
CPerThread
,
CPerThread
,
HoPerThread
,
HoPerThread
,
WoPerThread
>
WoPerThread
>
);
<<<
grid_dim
,
block_dim
>>>
(
InDesc
{},
static_cast
<
T
*>
(
in_device_buf
.
GetDeviceBuffer
()),
T
*
in_dev_ptr
=
static_cast
<
T
*>
(
in_device_buf
.
GetDeviceBuffer
());
WeiDesc
{},
T
*
wei_dev_ptr
=
static_cast
<
T
*>
(
wei_device_buf
.
GetDeviceBuffer
());
static_cast
<
T
*>
(
wei_device_buf
.
GetDeviceBuffer
()),
T
*
out_dev_ptr
=
static_cast
<
T
*>
(
out_device_buf
.
GetDeviceBuffer
());
OutDesc
{},
static_cast
<
T
*>
(
out_device_buf
.
GetDeviceBuffer
()));
void
*
args
[]
=
{
&
in_dev_ptr
,
&
wei_dev_ptr
,
&
out_dev_ptr
};
cudaEventCreate
(
&
stop
);
float
time
=
0
;
cudaEventRecord
(
stop
,
0
);
cudaEventSynchronize
(
stop
);
launch_kernel
(
f
,
grid_dim
,
block_dim
,
args
,
time
);
cudaEventElapsedTime
(
&
elapsedTime
,
start
,
stop
);
printf
(
"Elapsed time : %f ms
\n
"
,
time
);
printf
(
"Elapsed time : %f ms
\n
"
,
elapsedTime
);
usleep
(
std
::
min
(
time
*
1000
,
float
(
10000
)));
usleep
(
10000
);
}
}
checkCudaErrors
(
cudaGetLastError
());
out_device_buf
.
FromDevice
(
out
.
mData
.
data
());
out_device_buf
.
FromDevice
(
out
.
mData
.
data
());
}
}
driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh
View file @
e80fbbdd
#pragma once
#pragma once
#include "gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh"
#include <unistd.h>
#include <unistd.h>
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh"
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_implicit_gemm_convolution_1_nchw_srck_nkhw
(
InDesc
,
void
device_implicit_gemm_convolution_1_nchw_srck_nkhw
(
InDesc
,
...
@@ -52,20 +53,7 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
...
@@ -52,20 +53,7 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
wei_srck_device_buf
.
ToDevice
(
wei_srck
.
mData
.
data
());
wei_srck_device_buf
.
ToDevice
(
wei_srck
.
mData
.
data
());
out_nkhw_device_buf
.
ToDevice
(
out_nkhw
.
mData
.
data
());
out_nkhw_device_buf
.
ToDevice
(
out_nkhw
.
mData
.
data
());
#if 0
#if 1
constexpr unsigned NPerBlock = 1;
constexpr unsigned KPerBlock = 1;
constexpr unsigned CPerBlock = 1;
constexpr unsigned HoPerBlock = 2;
constexpr unsigned WoPerBlock = 32;
constexpr unsigned KPerThread = 1;
constexpr unsigned CPerThread = 1;
constexpr unsigned HoPerThread = 2;
constexpr unsigned WoPerThread = 2;
constexpr unsigned BlockSize = 16;
#elif
0
// for 3x3, 34x34
// for 3x3, 34x34
constexpr
unsigned
NPerBlock
=
1
;
constexpr
unsigned
NPerBlock
=
1
;
constexpr
unsigned
KPerBlock
=
64
;
constexpr
unsigned
KPerBlock
=
64
;
...
@@ -123,12 +111,7 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
...
@@ -123,12 +111,7 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
for
(
unsigned
i
=
0
;
i
<
nrepeat
;
++
i
)
for
(
unsigned
i
=
0
;
i
<
nrepeat
;
++
i
)
{
{
cudaEvent_t
start
,
stop
;
const
void
*
f
=
reinterpret_cast
<
const
void
*>
(
float
elapsedTime
;
cudaEventCreate
(
&
start
);
cudaEventRecord
(
start
,
0
);
gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw
<
GridSize
,
gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw
<
GridSize
,
BlockSize
,
BlockSize
,
T
,
T
,
...
@@ -144,24 +127,21 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
...
@@ -144,24 +127,21 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
KPerThread
,
KPerThread
,
CPerThread
,
CPerThread
,
HoPerThread
,
HoPerThread
,
WoPerThread
>
WoPerThread
>
);
<<<
grid_dim
,
block_dim
>>>
(
in_nchw_desc
,
static_cast
<
T
*>
(
in_nchw_device_buf
.
GetDeviceBuffer
()),
T
*
in_dev_ptr
=
static_cast
<
T
*>
(
in_nchw_device_buf
.
GetDeviceBuffer
());
wei_srck_desc
,
T
*
wei_dev_ptr
=
static_cast
<
T
*>
(
wei_srck_device_buf
.
GetDeviceBuffer
());
static_cast
<
T
*>
(
wei_srck_device_buf
.
GetDeviceBuffer
()),
T
*
out_dev_ptr
=
static_cast
<
T
*>
(
out_nkhw_device_buf
.
GetDeviceBuffer
());
out_nkhw_desc
,
static_cast
<
T
*>
(
out_nkhw_device_buf
.
GetDeviceBuffer
()));
void
*
args
[]
=
{
&
in_dev_ptr
,
&
wei_dev_ptr
,
&
out_dev_ptr
};
cudaEventCreate
(
&
stop
);
float
time
=
0
;
cudaEventRecord
(
stop
,
0
);
cudaEventSynchronize
(
stop
);
launch_kernel
(
f
,
grid_dim
,
block_dim
,
args
,
time
);
cudaEventElapsedTime
(
&
elapsedTime
,
start
,
stop
);
printf
(
"Elapsed time : %f ms
\n
"
,
time
);
printf
(
"Elapsed time : %f ms
\n
"
,
elapsedTime
);
usleep
(
std
::
min
(
time
*
1000
,
float
(
10000
)));
usleep
(
10000
);
}
}
checkCudaErrors
(
cudaGetLastError
());
out_nkhw_device_buf
.
FromDevice
(
out_nkhw
.
mData
.
data
());
out_nkhw_device_buf
.
FromDevice
(
out_nkhw
.
mData
.
data
());
}
}
driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
View file @
e80fbbdd
#pragma once
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh"
#include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh"
#include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh"
#include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh"
#include <unistd.h>
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_implicit_gemm_convolution_2_cnhw_csrk_knhw
(
InDesc
,
void
device_implicit_gemm_convolution_2_cnhw_csrk_knhw
(
InDesc
,
...
@@ -69,6 +70,7 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
...
@@ -69,6 +70,7 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
#if 0
#if 0
// 3x3, 34x34
// 3x3, 34x34
// need to use register double buffer for GEMM
constexpr unsigned BPerBlock = 128;
constexpr unsigned BPerBlock = 128;
constexpr unsigned KPerBlock = 64;
constexpr unsigned KPerBlock = 64;
constexpr unsigned CPerBlock = 4;
constexpr unsigned CPerBlock = 4;
...
@@ -211,11 +213,7 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
...
@@ -211,11 +213,7 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
for
(
unsigned
i
=
0
;
i
<
nrepeat
;
++
i
)
for
(
unsigned
i
=
0
;
i
<
nrepeat
;
++
i
)
{
{
cudaEvent_t
start
,
stop
;
const
void
*
f
=
reinterpret_cast
<
const
void
*>
(
float
elapsedTime
;
cudaEventCreate
(
&
start
);
cudaEventRecord
(
start
,
0
);
#if 0
#if 0
gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw
gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw
#else
#else
...
@@ -246,25 +244,22 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
...
@@ -246,25 +244,22 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
WeiBlockCopyThreadPerDim0
,
WeiBlockCopyThreadPerDim0
,
WeiBlockCopyThreadPerDim1
,
WeiBlockCopyThreadPerDim1
,
InBlockCopyDataPerRead
,
InBlockCopyDataPerRead
,
WeiBlockCopyDataPerRead
>
WeiBlockCopyDataPerRead
>
);
<<<
grid_dim
,
block_dim
>>>
(
in_cnhw_desc
,
static_cast
<
T
*>
(
in_cnhw_device_buf
.
GetDeviceBuffer
()),
T
*
in_dev_ptr
=
static_cast
<
T
*>
(
in_cnhw_device_buf
.
GetDeviceBuffer
());
wei_csrk_desc
,
T
*
wei_dev_ptr
=
static_cast
<
T
*>
(
wei_csrk_device_buf
.
GetDeviceBuffer
());
static_cast
<
T
*>
(
wei_csrk_device_buf
.
GetDeviceBuffer
()),
T
*
out_dev_ptr
=
static_cast
<
T
*>
(
out_knhw_device_buf
.
GetDeviceBuffer
());
out_knhw_desc
,
static_cast
<
T
*>
(
out_knhw_device_buf
.
GetDeviceBuffer
()));
void
*
args
[]
=
{
&
in_dev_ptr
,
&
wei_dev_ptr
,
&
out_dev_ptr
};
cudaEventCreate
(
&
stop
);
float
time
;
cudaEventRecord
(
stop
,
0
);
cudaEventSynchronize
(
stop
);
launch_kernel
(
f
,
grid_dim
,
block_dim
,
args
,
time
);
cudaEventElapsedTime
(
&
elapsedTime
,
start
,
stop
);
printf
(
"Elapsed time : %f ms
\n
"
,
time
);
printf
(
"Elapsed time : %f ms
\n
"
,
elapsedTime
);
usleep
(
std
::
min
(
time
*
1000
,
float
(
10000
)));
usleep
(
std
::
min
(
elapsedTime
*
1000
,
float
(
10000
)));
}
}
checkCudaErrors
(
cudaGetLastError
());
out_knhw_device_buf
.
FromDevice
(
out_knhw
.
mData
.
data
());
out_knhw_device_buf
.
FromDevice
(
out_knhw
.
mData
.
data
());
// convert out_knhw to out_nkhw
// convert out_knhw to out_nkhw
...
...
driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
View file @
e80fbbdd
#pragma once
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh"
#include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh"
#include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh"
#include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh"
#include <unistd.h>
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_implicit_gemm_convolution_2_cnhw_srck_knhw
(
InDesc
,
void
device_implicit_gemm_convolution_2_cnhw_srck_knhw
(
InDesc
,
...
@@ -100,7 +101,7 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
...
@@ -100,7 +101,7 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
constexpr
unsigned
InBlockCopyThreadPerDim1
=
16
;
constexpr
unsigned
InBlockCopyThreadPerDim1
=
16
;
constexpr
unsigned
BlockSize
=
128
;
constexpr
unsigned
BlockSize
=
128
;
#elif
1
#elif
0
// 1x1, 28x28
// 1x1, 28x28
constexpr
unsigned
BPerBlock
=
64
;
constexpr
unsigned
BPerBlock
=
64
;
constexpr
unsigned
KPerBlock
=
64
;
constexpr
unsigned
KPerBlock
=
64
;
...
@@ -140,12 +141,8 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
...
@@ -140,12 +141,8 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
for
(
unsigned
i
=
0
;
i
<
nrepeat
;
++
i
)
for
(
unsigned
i
=
0
;
i
<
nrepeat
;
++
i
)
{
{
cudaEvent_t
start
,
stop
;
const
void
*
f
=
reinterpret_cast
<
const
void
*>
(
float
elapsedTime
;
#if 1
cudaEventCreate
(
&
start
);
cudaEventRecord
(
start
,
0
);
#if 0
gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw
gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw
#else
#else
gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
...
@@ -165,25 +162,22 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
...
@@ -165,25 +162,22 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
GemmThreadPerColumnPerCluster
,
GemmThreadPerColumnPerCluster
,
GemmThreadPerRowPerCluster
,
GemmThreadPerRowPerCluster
,
InBlockCopyThreadPerDim0
,
InBlockCopyThreadPerDim0
,
InBlockCopyThreadPerDim1
>
InBlockCopyThreadPerDim1
>
);
<<<
grid_dim
,
block_dim
>>>
(
in_cnhw_desc
,
static_cast
<
T
*>
(
in_cnhw_device_buf
.
GetDeviceBuffer
()),
T
*
in_dev_ptr
=
static_cast
<
T
*>
(
in_cnhw_device_buf
.
GetDeviceBuffer
());
wei_srck_desc
,
T
*
wei_dev_ptr
=
static_cast
<
T
*>
(
wei_srck_device_buf
.
GetDeviceBuffer
());
static_cast
<
T
*>
(
wei_srck_device_buf
.
GetDeviceBuffer
()),
T
*
out_dev_ptr
=
static_cast
<
T
*>
(
out_knhw_device_buf
.
GetDeviceBuffer
());
out_knhw_desc
,
static_cast
<
T
*>
(
out_knhw_device_buf
.
GetDeviceBuffer
()));
void
*
args
[]
=
{
&
in_dev_ptr
,
&
wei_dev_ptr
,
&
out_dev_ptr
};
cudaEventCreate
(
&
stop
);
float
time
=
0
;
cudaEventRecord
(
stop
,
0
);
cudaEventSynchronize
(
stop
);
launch_kernel
(
f
,
grid_dim
,
block_dim
,
args
,
time
);
cudaEventElapsedTime
(
&
elapsedTime
,
start
,
stop
);
printf
(
"Elapsed time : %f ms
\n
"
,
time
);
printf
(
"Elapsed time : %f ms
\n
"
,
elapsedTime
);
usleep
(
std
::
min
(
time
*
1000
,
float
(
10000
)));
usleep
(
std
::
min
(
elapsedTime
*
1000
,
float
(
10000
)));
}
}
checkCudaErrors
(
cudaGetLastError
());
out_knhw_device_buf
.
FromDevice
(
out_knhw
.
mData
.
data
());
out_knhw_device_buf
.
FromDevice
(
out_knhw
.
mData
.
data
());
// convert out_knhw to out_nkhw
// convert out_knhw to out_nkhw
...
...
src/CMakeLists.txt
View file @
e80fbbdd
set
(
SOURCE
set
(
TENSOR_
SOURCE
tensor.cpp;
tensor.cpp;
)
)
add_library
(
convolution SHARED
${
SOURCE
}
)
add_library
(
tensor SHARED
${
TENSOR_SOURCE
}
)
set_target_properties
(
convolution PROPERTIES PREFIX
""
)
set_target_properties
(
tensor PROPERTIES PREFIX
""
)
target_compile_features
(
tensor PUBLIC
)
set_target_properties
(
tensor PROPERTIES POSITION_INDEPENDENT_CODE ON
)
install
(
TARGETS tensor LIBRARY DESTINATION lib
)
# boost.python
target_link_libraries
(
convolution boost_python3
)
# cuda
set
(
DEVICE_SOURCE
target_link_libraries
(
convolution nvToolsExt cudart
)
device.cu;
target_compile_features
(
convolution PUBLIC
)
)
set_target_properties
(
convolution PROPERTIES POSITION_INDEPENDENT_CODE ON
)
set_target_properties
(
convolution PROPERTIES CUDA_SEPARABLE_COMPILATION OFF
)
install
(
TARGETS convolution LIBRARY DESTINATION lib
)
add_library
(
device SHARED
${
DEVICE_SOURCE
}
)
set_target_properties
(
device PROPERTIES PREFIX
""
)
target_compile_features
(
device PUBLIC
)
set_target_properties
(
device PROPERTIES POSITION_INDEPENDENT_CODE ON
)
install
(
TARGETS device LIBRARY DESTINATION lib
)
target_link_libraries
(
device nvToolsExt cudart
)
src/device.cu
0 → 100644
View file @
e80fbbdd
#include "device.hpp"
#include "cuda_runtime.h"
#include "nvToolsExt.h"
#include "helper_cuda.h"
DeviceMem
::
DeviceMem
(
std
::
size_t
mem_size
)
:
mMemSize
(
mem_size
)
{
checkCudaErrors
(
cudaMalloc
(
static_cast
<
void
**>
(
&
mpDeviceBuf
),
mMemSize
));
}
void
*
DeviceMem
::
GetDeviceBuffer
()
{
return
mpDeviceBuf
;
}
void
DeviceMem
::
ToDevice
(
const
void
*
p
)
{
checkCudaErrors
(
cudaMemcpy
(
mpDeviceBuf
,
const_cast
<
void
*>
(
p
),
mMemSize
,
cudaMemcpyHostToDevice
));
}
void
DeviceMem
::
FromDevice
(
void
*
p
)
{
checkCudaErrors
(
cudaMemcpy
(
p
,
mpDeviceBuf
,
mMemSize
,
cudaMemcpyDeviceToHost
));
}
DeviceMem
::~
DeviceMem
()
{
checkCudaErrors
(
cudaFree
(
mpDeviceBuf
));
}
struct
KernelTimerImpl
{
KernelTimerImpl
()
{
cudaEventCreate
(
&
mStart
);
cudaEventCreate
(
&
mEnd
);
}
~
KernelTimerImpl
()
{
cudaEventDestroy
(
mStart
);
cudaEventDestroy
(
mEnd
);
}
void
Start
()
{
cudaEventRecord
(
mStart
,
0
);
}
void
End
()
{
cudaEventRecord
(
mEnd
,
0
);
cudaEventSynchronize
(
mEnd
);
}
float
GetElapsedTime
()
const
{
float
time
;
cudaEventElapsedTime
(
&
time
,
mStart
,
mEnd
);
return
time
;
}
cudaEvent_t
mStart
,
mEnd
;
};
KernelTimer
::
KernelTimer
()
:
impl
(
new
KernelTimerImpl
())
{}
KernelTimer
::~
KernelTimer
()
{}
void
KernelTimer
::
Start
()
{
impl
->
Start
();
}
void
KernelTimer
::
End
()
{
impl
->
End
();
}
float
KernelTimer
::
GetElapsedTime
()
const
{
return
impl
->
GetElapsedTime
();
}
void
launch_kernel
(
const
void
*
func
,
dim3
grid_dim
,
dim3
block_dim
,
void
**
args
,
float
&
time
)
{
KernelTimer
timer
;
timer
.
Start
();
cudaError_t
error
=
cudaLaunchKernel
(
func
,
grid_dim
,
block_dim
,
args
,
0
,
0
);
timer
.
End
();
time
=
timer
.
GetElapsedTime
();
checkCudaErrors
(
error
);
}
src/include/blockwise_2d_tensor_op.cuh
View file @
e80fbbdd
...
@@ -513,7 +513,6 @@ struct Blockwise2dTensorCopy3
...
@@ -513,7 +513,6 @@ struct Blockwise2dTensorCopy3
}
}
}
}
#if 1
__device__
constexpr
unsigned
GetRegisterClipboardSize
()
const
__device__
constexpr
unsigned
GetRegisterClipboardSize
()
const
{
{
static_assert
(
is_same
<
Float
,
float
>::
value
,
"wrong! only support float!
\n
"
);
static_assert
(
is_same
<
Float
,
float
>::
value
,
"wrong! only support float!
\n
"
);
...
@@ -703,5 +702,4 @@ struct Blockwise2dTensorCopy3
...
@@ -703,5 +702,4 @@ struct Blockwise2dTensorCopy3
}
}
}
}
}
}
#endif
};
};
src/include/blockwise_4d_tensor_op.cuh
View file @
e80fbbdd
...
@@ -88,7 +88,7 @@ template <unsigned BlockSize,
...
@@ -88,7 +88,7 @@ template <unsigned BlockSize,
class
F
>
class
F
>
__device__
void
blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src
(
__device__
void
blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src
(
SrcDesc
,
SrcDesc
,
Float
*
const
__restrict__
p_src
,
const
Float
*
__restrict__
p_src
,
DstDesc
,
DstDesc
,
Float
*
__restrict__
p_dst
,
Float
*
__restrict__
p_dst
,
SrcOpLengths
,
SrcOpLengths
,
...
@@ -187,7 +187,7 @@ template <unsigned BlockSize,
...
@@ -187,7 +187,7 @@ template <unsigned BlockSize,
class
DstFromSrcReorder
>
class
DstFromSrcReorder
>
__device__
void
__device__
void
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src
(
SrcDesc
,
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src
(
SrcDesc
,
Float
*
const
__restrict__
p_src
,
const
Float
*
__restrict__
p_src
,
DstDesc
,
DstDesc
,
Float
*
__restrict__
p_dst
,
Float
*
__restrict__
p_dst
,
SrcOpLengths
,
SrcOpLengths
,
...
@@ -202,7 +202,7 @@ blockwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
...
@@ -202,7 +202,7 @@ blockwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
template
<
unsigned
BlockSize
,
class
Float
,
class
SrcDesc
,
class
DstDesc
,
class
SrcOpLengths
>
template
<
unsigned
BlockSize
,
class
Float
,
class
SrcDesc
,
class
DstDesc
,
class
SrcOpLengths
>
struct
Blockwise4dTensorCopy1
struct
Blockwise4dTensorCopy1
{
{
__device__
void
Run
(
Float
*
const
__restrict__
p_src
,
Float
*
__restrict__
p_dst
)
const
__device__
void
Run
(
const
Float
*
__restrict__
p_src
,
Float
*
__restrict__
p_dst
)
const
{
{
constexpr
auto
dst_from_src_reorder
=
Sequence
<
0
,
1
,
2
,
3
>
{};
constexpr
auto
dst_from_src_reorder
=
Sequence
<
0
,
1
,
2
,
3
>
{};
...
@@ -219,7 +219,7 @@ template <unsigned BlockSize,
...
@@ -219,7 +219,7 @@ template <unsigned BlockSize,
class
GlobalLowerPads
>
class
GlobalLowerPads
>
struct
BlockwiseChwnTensorCopyPadded
struct
BlockwiseChwnTensorCopyPadded
{
{
__device__
void
Run
(
Float
*
const
__restrict__
p_src
,
__device__
void
Run
(
const
Float
*
__restrict__
p_src
,
unsigned
c_block_data_begin
,
unsigned
c_block_data_begin
,
unsigned
ho_block_data_begin
,
unsigned
ho_block_data_begin
,
unsigned
wo_block_data_begin
,
unsigned
wo_block_data_begin
,
...
@@ -244,7 +244,7 @@ struct BlockwiseChwnTensorCopyPadded
...
@@ -244,7 +244,7 @@ struct BlockwiseChwnTensorCopyPadded
constexpr
unsigned
NLoop
=
ref_desc
.
GetElementSize
()
/
BlockSize
;
constexpr
unsigned
NLoop
=
ref_desc
.
GetElementSize
()
/
BlockSize
;
Float
*
const
p_src_tmp
=
const
Float
*
p_src_tmp
=
p_src
+
src_desc
.
Get1dIndex
(
c_block_data_begin
,
p_src
+
src_desc
.
Get1dIndex
(
c_block_data_begin
,
(
ho_block_data_begin
+
h_block_pad_low
)
-
h_global_pad_low
,
(
ho_block_data_begin
+
h_block_pad_low
)
-
h_global_pad_low
,
(
wo_block_data_begin
+
w_block_pad_low
)
-
w_global_pad_low
,
(
wo_block_data_begin
+
w_block_pad_low
)
-
w_global_pad_low
,
...
...
src/include/device.hpp
0 → 100644
View file @
e80fbbdd
#pragma once
#include <memory>
struct
DeviceMem
{
DeviceMem
()
=
delete
;
DeviceMem
(
std
::
size_t
mem_size
);
void
*
GetDeviceBuffer
();
void
ToDevice
(
const
void
*
p
);
void
FromDevice
(
void
*
p
);
~
DeviceMem
();
void
*
mpDeviceBuf
;
std
::
size_t
mMemSize
;
};
struct
KernelTimerImpl
;
struct
KernelTimer
{
KernelTimer
();
~
KernelTimer
();
void
Start
();
void
End
();
float
GetElapsedTime
()
const
;
std
::
unique_ptr
<
KernelTimerImpl
>
impl
;
};
void
launch_kernel
(
const
void
*
func
,
dim3
grid_dim
,
dim3
block_dim
,
void
**
args
,
float
&
time
);
src/include/gridwise_direct_convolution_1.cuh
View file @
e80fbbdd
...
@@ -19,12 +19,9 @@ template <class Float,
...
@@ -19,12 +19,9 @@ template <class Float,
unsigned
CPerThread
,
unsigned
CPerThread
,
unsigned
BlockSize
,
unsigned
BlockSize
,
unsigned
GridSize
>
unsigned
GridSize
>
__global__
void
gridwise_direct_convolution_1
(
InGlobalDesc
,
__global__
void
gridwise_direct_convolution_1
(
const
Float
*
const
__restrict__
p_in_global
,
Float
*
const
__restrict__
p_in_global
,
const
Float
*
const
__restrict__
p_wei_global
,
WeiGlobalDesc
,
Float
*
const
__restrict__
p_out_global
)
Float
*
const
__restrict__
p_wei_global
,
OutGlobalDesc
,
Float
*
__restrict__
p_out_global
)
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
...
src/include/gridwise_direct_convolution_2.cuh
View file @
e80fbbdd
...
@@ -21,12 +21,9 @@ template <class Float,
...
@@ -21,12 +21,9 @@ template <class Float,
unsigned
CPerThread
,
unsigned
CPerThread
,
unsigned
BlockSize
,
unsigned
BlockSize
,
unsigned
GridSize
>
unsigned
GridSize
>
__global__
void
gridwise_direct_convolution_2
(
InGlobalDesc
,
__global__
void
gridwise_direct_convolution_2
(
const
Float
*
const
__restrict__
p_in_global
,
Float
*
const
__restrict__
p_in_global
,
const
Float
*
const
__restrict__
p_wei_global
,
WeiGlobalDesc
,
Float
*
const
__restrict__
p_out_global
)
Float
*
const
__restrict__
p_wei_global
,
OutGlobalDesc
,
Float
*
__restrict__
p_out_global
)
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
...
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
View file @
e80fbbdd
...
@@ -28,12 +28,9 @@ template <unsigned GridSize,
...
@@ -28,12 +28,9 @@ template <unsigned GridSize,
unsigned
InBlockCopyDataPerRead
,
unsigned
InBlockCopyDataPerRead
,
unsigned
WeiBlockCopyDataPerRead
>
unsigned
WeiBlockCopyDataPerRead
>
__global__
void
__global__
void
gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn
(
InGlobalDesc
,
gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn
(
const
Float
*
const
__restrict__
p_in_global
,
Float
*
const
__restrict__
p_in_global
,
const
Float
*
const
__restrict__
p_wei_global
,
WeiGlobalDesc
,
Float
*
const
__restrict__
p_out_global
)
Float
*
const
__restrict__
p_wei_global
,
OutGlobalDesc
,
Float
*
__restrict__
p_out_global
)
{
{
// NPerThread == NPerBlock, because the format of input in LDS [C,Hi,Wi,N]
// NPerThread == NPerBlock, because the format of input in LDS [C,Hi,Wi,N]
// for GEMM trans([C,K]) * [C,Wo*N], we need a thread to do all the "N"
// for GEMM trans([C,K]) * [C,Wo*N], we need a thread to do all the "N"
...
...
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh
View file @
e80fbbdd
...
@@ -27,10 +27,10 @@ template <unsigned GridSize,
...
@@ -27,10 +27,10 @@ template <unsigned GridSize,
unsigned
WoPerThread
,
unsigned
WoPerThread
,
unsigned
WeiBlockCopyThreadPerDim0
,
unsigned
WeiBlockCopyThreadPerDim0
,
unsigned
WeiBlockCopyThreadPerDim1
>
unsigned
WeiBlockCopyThreadPerDim1
>
__global__
void
__global__
void
gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded
(
gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded
(
Float
*
const
__restrict__
p_in_global
,
const
Float
*
const
__restrict__
p_in_global
,
Float
*
const
__restrict__
p_wei_global
,
const
Float
*
const
__restrict__
p_wei_global
,
Float
*
__restrict__
p_out_global
)
Float
*
const
__restrict__
p_out_global
)
{
{
// NPerThread == NPerBlock, because the format of input in LDS [C,Hi,Wi,N]
// NPerThread == NPerBlock, because the format of input in LDS [C,Hi,Wi,N]
// for GEMM trans([C,K]) * [C,Wo*N], we need a thread to do all the "N"
// for GEMM trans([C,K]) * [C,Wo*N], we need a thread to do all the "N"
...
@@ -143,7 +143,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri
...
@@ -143,7 +143,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri
decltype
(
in_chwn_block_desc
.
GetLengths
()),
decltype
(
in_chwn_block_desc
.
GetLengths
()),
LowerPads
>
{};
LowerPads
>
{};
#if
1
#if
0
// weight: format is [C,S,R,K]
// weight: format is [C,S,R,K]
constexpr auto blockwise_wei_copy =
constexpr auto blockwise_wei_copy =
Blockwise4dTensorCopy1<BlockSize,
Blockwise4dTensorCopy1<BlockSize,
...
@@ -151,7 +151,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri
...
@@ -151,7 +151,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri
decltype(wei_csrk_global_desc),
decltype(wei_csrk_global_desc),
decltype(wei_csrk_block_desc),
decltype(wei_csrk_block_desc),
decltype(wei_csrk_block_desc.GetLengths())>{};
decltype(wei_csrk_block_desc.GetLengths())>{};
#elif
1
#elif
0
// weight: format is [C*S*R,K]
// weight: format is [C*S*R,K]
constexpr
auto
blockwise_wei_copy
=
constexpr
auto
blockwise_wei_copy
=
Blockwise2dTensorCopy1
<
BlockSize
,
Blockwise2dTensorCopy1
<
BlockSize
,
...
@@ -216,7 +216,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri
...
@@ -216,7 +216,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri
// set threadwise output tensor to 0
// set threadwise output tensor to 0
threadwise_4d_tensor_set_zero
(
out_hkwn_thread_desc
,
p_out_thread
);
threadwise_4d_tensor_set_zero
(
out_hkwn_thread_desc
,
p_out_thread
);
Float
*
p_wei_global_block_begin
=
const
Float
*
p_wei_global_block_begin
=
p_wei_global
+
wei_ek_global_desc
.
Get1dIndex
(
0
,
k_block_data_begin
);
p_wei_global
+
wei_ek_global_desc
.
Get1dIndex
(
0
,
k_block_data_begin
);
for
(
unsigned
c_block_data_begin
=
0
;
c_block_data_begin
<
C
;
c_block_data_begin
+=
CPerBlock
,
for
(
unsigned
c_block_data_begin
=
0
;
c_block_data_begin
<
C
;
c_block_data_begin
+=
CPerBlock
,
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment