Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
bfba60cf
Commit
bfba60cf
authored
Dec 20, 2019
by
Chao Liu
Browse files
tweaking
parent
54ef2c62
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
75 additions
and
32 deletions
+75
-32
driver/CMakeLists.txt
driver/CMakeLists.txt
+6
-6
driver/include/device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp
...ution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp
+31
-1
driver/src/conv_bwd_data_driver.cpp
driver/src/conv_bwd_data_driver.cpp
+32
-17
driver/src/conv_driver.cpp
driver/src/conv_driver.cpp
+1
-1
script/cmake-cuda_docker.sh
script/cmake-cuda_docker.sh
+2
-4
script/extract_asm-cuda.sh
script/extract_asm-cuda.sh
+3
-3
No files found.
driver/CMakeLists.txt
View file @
bfba60cf
...
@@ -24,9 +24,9 @@ elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
...
@@ -24,9 +24,9 @@ elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
set
(
CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cu
)
set
(
CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cu
)
endif
()
endif
()
add_executable
(
conv
${
CONV_SOURCE
}
)
add_executable
(
conv
_driver
${
CONV_SOURCE
}
)
add_executable
(
col2im
${
COL2IM_SOURCE
}
)
add_executable
(
col2im
_driver
${
COL2IM_SOURCE
}
)
add_executable
(
conv_bwd_data
${
CONV_BWD_DATA_SOURCE
}
)
add_executable
(
conv_bwd_data
_driver
${
CONV_BWD_DATA_SOURCE
}
)
target_link_libraries
(
conv PRIVATE host
)
target_link_libraries
(
conv
_driver
PRIVATE host
)
target_link_libraries
(
col2im PRIVATE host
)
target_link_libraries
(
col2im
_driver
PRIVATE host
)
target_link_libraries
(
conv_bwd_data PRIVATE host
)
target_link_libraries
(
conv_bwd_data
_driver
PRIVATE host
)
driver/include/device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp
View file @
bfba60cf
...
@@ -51,7 +51,7 @@ void device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw(InDesc i
...
@@ -51,7 +51,7 @@ void device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw(InDesc i
wei_kcyx_device_buf
.
ToDevice
(
wei_kcyx
.
mData
.
data
());
wei_kcyx_device_buf
.
ToDevice
(
wei_kcyx
.
mData
.
data
());
out_nkhw_device_buf
.
ToDevice
(
out_nkhw
.
mData
.
data
());
out_nkhw_device_buf
.
ToDevice
(
out_nkhw
.
mData
.
data
());
#if
1
#if
0
// BlockSize = 256, each thread hold 64 data
// BlockSize = 256, each thread hold 64 data
constexpr index_t BlockSize = 256;
constexpr index_t BlockSize = 256;
...
@@ -80,6 +80,36 @@ void device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw(InDesc i
...
@@ -80,6 +80,36 @@ void device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw(InDesc i
constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN = 1;
constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN = 1;
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
#elif
1
// BlockSize = 256, each thread hold 64 data
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
GemmMPerBlock
=
128
;
constexpr
index_t
GemmNPerBlock
=
128
;
constexpr
index_t
GemmKPerBlock
=
8
;
constexpr
index_t
GemmMPerThreadSubC
=
4
;
constexpr
index_t
GemmNPerThreadSubC
=
4
;
constexpr
index_t
GemmMLevel0Cluster
=
4
;
constexpr
index_t
GemmNLevel0Cluster
=
4
;
constexpr
index_t
GemmMLevel1Cluster
=
4
;
constexpr
index_t
GemmNLevel1Cluster
=
4
;
constexpr
index_t
GemmKPerThreadLoop
=
1
;
constexpr
index_t
GemmThreadGemmDataPerReadM
=
4
;
constexpr
index_t
GemmThreadGemmDataPerReadN
=
4
;
using
GemmABlockCopyThreadSliceLengths_GemmK_GemmM
=
Sequence
<
1
,
4
>
;
using
GemmABlockCopyThreadClusterLengths_GemmK_GemmM
=
Sequence
<
8
,
32
>
;
constexpr
index_t
GemmABlockCopySrcDataPerRead_GemmM
=
1
;
constexpr
index_t
GemmABlockCopyDstDataPerWrite_GemmM
=
1
;
using
GemmBBlockCopyThreadSliceLengths_GemmK_GemmN
=
Sequence
<
1
,
4
>
;
using
GemmBBlockCopyThreadClusterLengths_GemmK_GemmN
=
Sequence
<
8
,
32
>
;
constexpr
index_t
GemmBBlockCopySrcDataPerRead_GemmN
=
1
;
constexpr
index_t
GemmBBlockCopyDstDataPerWrite_GemmN
=
1
;
constexpr
index_t
GemmCThreadCopyDstDataPerWrite_GemmN1
=
1
;
constexpr
index_t
GemmCThreadCopyDstDataPerWrite_GemmN1
=
1
;
#elif 1
#elif 1
// BlockSize = 256, each thread hold 64 data
// BlockSize = 256, each thread hold 64 data
...
...
driver/src/conv_bwd_data_driver.cpp
View file @
bfba60cf
...
@@ -34,6 +34,21 @@ int main(int argc, char* argv[])
...
@@ -34,6 +34,21 @@ int main(int argc, char* argv[])
using ConvStrides = Sequence<1, 1>;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif
1
// 3x3, 28x28
constexpr
index_t
N
=
64
;
constexpr
index_t
C
=
256
;
constexpr
index_t
HI
=
28
;
constexpr
index_t
WI
=
28
;
constexpr
index_t
K
=
256
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
#elif 0
...
@@ -109,21 +124,6 @@ int main(int argc, char* argv[])
...
@@ -109,21 +124,6 @@ int main(int argc, char* argv[])
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
// 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
1024
;
constexpr
index_t
HI
=
35
;
constexpr
index_t
WI
=
35
;
constexpr
index_t
K
=
1024
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
2
,
2
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
#elif 0
...
@@ -156,7 +156,7 @@ int main(int argc, char* argv[])
...
@@ -156,7 +156,7 @@ int main(int argc, char* argv[])
using
LeftPads
=
Sequence
<
3
,
0
>
;
using
LeftPads
=
Sequence
<
3
,
0
>
;
using
RightPads
=
Sequence
<
3
,
0
>
;
using
RightPads
=
Sequence
<
3
,
0
>
;
#elif
1
#elif
0
// 1x7 filter, 0x3 pad, 17x17 input
// 1x7 filter, 0x3 pad, 17x17 input
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
1024
;
constexpr
index_t
C
=
1024
;
...
@@ -171,6 +171,21 @@ int main(int argc, char* argv[])
...
@@ -171,6 +171,21 @@ int main(int argc, char* argv[])
using
LeftPads
=
Sequence
<
0
,
3
>
;
using
LeftPads
=
Sequence
<
0
,
3
>
;
using
RightPads
=
Sequence
<
0
,
3
>
;
using
RightPads
=
Sequence
<
0
,
3
>
;
#elif 1
// 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
1024
;
constexpr
index_t
HI
=
35
;
constexpr
index_t
WI
=
35
;
constexpr
index_t
K
=
1024
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
2
,
2
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#endif
#endif
constexpr
auto
in_nchw_desc
=
make_native_tensor_descriptor_packed
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
constexpr
auto
in_nchw_desc
=
make_native_tensor_descriptor_packed
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
...
@@ -216,7 +231,7 @@ int main(int argc, char* argv[])
...
@@ -216,7 +231,7 @@ int main(int argc, char* argv[])
#if 0
#if 0
device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw
device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw
#elif
1
#elif
0
device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw
device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw
#else
#else
device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw
device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw
...
...
driver/src/conv_driver.cpp
View file @
bfba60cf
...
@@ -29,7 +29,7 @@ int main(int argc, char* argv[])
...
@@ -29,7 +29,7 @@ int main(int argc, char* argv[])
{
{
using
namespace
ck
;
using
namespace
ck
;
#if
0
#if
1
// 1x1
// 1x1
constexpr
index_t
N
=
256
;
constexpr
index_t
N
=
256
;
constexpr
index_t
C
=
1024
;
constexpr
index_t
C
=
1024
;
...
...
script/cmake-cuda_docker.sh
View file @
bfba60cf
...
@@ -15,11 +15,9 @@ cmake
...
@@ -15,11 +15,9 @@ cmake
-D
CMAKE_VERBOSE_MAKEFILE:BOOL
=
ON
\
-D
CMAKE_VERBOSE_MAKEFILE:BOOL
=
ON
\
-D
DEVICE_BACKEND
=
NVIDIA
\
-D
DEVICE_BACKEND
=
NVIDIA
\
-D
CUDA_COMMON_INCLUDE_DIR
=
"/root/NVIDIA_CUDA-10.1_Samples/common/inc"
\
-D
CUDA_COMMON_INCLUDE_DIR
=
"/root/NVIDIA_CUDA-10.1_Samples/common/inc"
\
-D
CMAKE_CUDA_FLAGS
=
"-ccbin clang++
-6.0
-m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep
-Xptxas -v -gencode=arch=compute_6
0
,code=sm_6
0
-Xptxas -v -
gencode=arch=compute_70,code=sm_70
"
\
-D
CMAKE_CUDA_FLAGS
=
"-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_6
1
,code=sm_6
1
-Xptxas -v -
maxrregcount=128
"
\
${
MY_PROJECT_SOURCE
}
${
MY_PROJECT_SOURCE
}
#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61" \
#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61" \
#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -Xptxas -v -maxrregcount=128" \
#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -maxrregcount=128" \
#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -gencode=arch=compute_70,code=sm_70" \
#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -gencode=arch=compute_70,code=sm_70 -Xptxas -v -maxrregcount=128" \
script/extract_asm-cuda.sh
View file @
bfba60cf
cuobjdump
-xelf
sm_60 ./driver/driver
&&
nvdisasm
--print-code
-g
driver.sm_60.cubin
>
driver.sm_60.asm
DRIVER
=
$1
cuobjdump
-xelf
sm_61 ./driver/driver
&&
nvdisasm
--print-code
-g
driver.sm_61.cubin
>
driver.sm_61.asm
ARCH
=
$2
cuobjdump
-xelf
sm_70
./driver/
driver
&&
nvdisasm
--print-code
-g
driver.sm_70.cubin
>
driver.sm_70
.asm
cuobjdump
-xelf
$ARCH
./driver/
$DRIVER
&&
nvdisasm
--print-code
-g
$DRIVER
.
$ARCH
.cubin
>
$DRIVER
.
$ARCH
.asm
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment