Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
a5bcde36
Commit
a5bcde36
authored
Nov 16, 2018
by
Chao Liu
Browse files
refactor
parent
f6934e0b
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
32 additions
and
40 deletions
+32
-40
src/include/direct_convolution_2.cuh
src/include/direct_convolution_2.cuh
+32
-40
No files found.
src/include/direct_convolution_2.cuh
View file @
a5bcde36
...
@@ -59,16 +59,13 @@ __global__ void gridwise_convolution(InGlobalDesc,
...
@@ -59,16 +59,13 @@ __global__ void gridwise_convolution(InGlobalDesc,
constexpr
auto
out_block_src_desc
=
make_ConstantTensorDescriptor
(
constexpr
auto
out_block_src_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
NPerBlock
,
KPerBlock
,
HoPerBlock
,
WoPerBlock
>
{},
out_global_desc
.
GetStrides
());
Sequence
<
NPerBlock
,
KPerBlock
,
HoPerBlock
,
WoPerBlock
>
{},
out_global_desc
.
GetStrides
());
constexpr
auto
in_block_dst_desc
=
constexpr
auto
in_block_desc
=
make_ConstantTensorDescriptor
(
in_block_src_desc
.
GetLengths
());
make_ConstantTensorDescriptor
(
in_block_src_desc
.
GetLengths
());
constexpr
auto
wei_block_desc
=
make_ConstantTensorDescriptor
(
wei_block_src_desc
.
GetLengths
());
constexpr
auto
wei_block_dst_desc
=
constexpr
auto
out_block_desc
=
make_ConstantTensorDescriptor
(
out_block_src_desc
.
GetLengths
());
make_ConstantTensorDescriptor
(
wei_block_src_desc
.
GetLengths
());
constexpr
auto
out_block_dst_desc
=
make_ConstantTensorDescriptor
(
out_block_src_desc
.
GetLengths
());
constexpr
unsigned
in_block_size
=
in_block_
dst_
desc
.
GetElementSpace
();
constexpr
unsigned
in_block_size
=
in_block_desc
.
GetElementSpace
();
constexpr
unsigned
wei_block_size
=
wei_block_
dst_
desc
.
GetElementSpace
();
constexpr
unsigned
wei_block_size
=
wei_block_desc
.
GetElementSpace
();
constexpr
unsigned
out_block_size
=
out_block_
dst_
desc
.
GetElementSpace
();
constexpr
unsigned
out_block_size
=
out_block_desc
.
GetElementSpace
();
__shared__
TFloat
p_in_block
[
in_block_size
];
__shared__
TFloat
p_in_block
[
in_block_size
];
__shared__
TFloat
p_wei_block
[
wei_block_size
];
__shared__
TFloat
p_wei_block
[
wei_block_size
];
...
@@ -104,9 +101,9 @@ __global__ void gridwise_convolution(InGlobalDesc,
...
@@ -104,9 +101,9 @@ __global__ void gridwise_convolution(InGlobalDesc,
print_ConstantTensorDescriptor( in_block_src_desc, "gridwise_convolution: in_block_src_desc: ");
print_ConstantTensorDescriptor( in_block_src_desc, "gridwise_convolution: in_block_src_desc: ");
print_ConstantTensorDescriptor(wei_block_src_desc, "gridwise_convolution: wei_block_src_desc: ");
print_ConstantTensorDescriptor(wei_block_src_desc, "gridwise_convolution: wei_block_src_desc: ");
print_ConstantTensorDescriptor(out_block_src_desc, "gridwise_convolution: out_block_src_desc: ");
print_ConstantTensorDescriptor(out_block_src_desc, "gridwise_convolution: out_block_src_desc: ");
print_ConstantTensorDescriptor( in_block_
dst_
desc, "gridwise_convolution: in_block_
dst_
desc: ");
print_ConstantTensorDescriptor( in_block_desc, "gridwise_convolution: in_block_desc: ");
print_ConstantTensorDescriptor(wei_block_
dst_
desc, "gridwise_convolution: wei_block_
dst_
desc: ");
print_ConstantTensorDescriptor(wei_block_desc, "gridwise_convolution: wei_block_desc: ");
print_ConstantTensorDescriptor(out_block_
dst_
desc, "gridwise_convolution: out_block_
dst_
desc: ");
print_ConstantTensorDescriptor(out_block_desc, "gridwise_convolution: out_block_desc: ");
printf("NBlockWork %u, KBlockWork %u, YBlockWork %u, XBlockWork %u \t"
printf("NBlockWork %u, KBlockWork %u, YBlockWork %u, XBlockWork %u \t"
"block_id %u, n_block_work_id %u, k_block_work_id %u, y_block_work_id %u, "
"block_id %u, n_block_work_id %u, k_block_work_id %u, y_block_work_id %u, "
...
@@ -129,13 +126,13 @@ __global__ void gridwise_convolution(InGlobalDesc,
...
@@ -129,13 +126,13 @@ __global__ void gridwise_convolution(InGlobalDesc,
// set output tensor in LDS to 0
// set output tensor in LDS to 0
blockwise_4d_tensor_op_unary
<
TFloat
,
blockwise_4d_tensor_op_unary
<
TFloat
,
decltype
(
out_block_
dst_
desc
),
decltype
(
out_block_desc
),
NBlockOpLen0
,
NBlockOpLen0
,
NBlockOpLen1
,
NBlockOpLen1
,
NBlockOpLen2
,
NBlockOpLen2
,
NBlockOpLen3
,
NBlockOpLen3
,
decltype
(
f_set0
),
decltype
(
f_set0
),
BlockSize
>
(
out_block_
dst_
desc
,
p_out_block
,
f_set0
);
BlockSize
>
(
out_block_desc
,
p_out_block
,
f_set0
);
for
(
unsigned
c_block_work_begin
=
0
;
c_block_work_begin
<
in_global_desc
.
GetLength
(
I1
);
for
(
unsigned
c_block_work_begin
=
0
;
c_block_work_begin
<
in_global_desc
.
GetLength
(
I1
);
c_block_work_begin
+=
CPerBlock
)
c_block_work_begin
+=
CPerBlock
)
...
@@ -144,26 +141,26 @@ __global__ void gridwise_convolution(InGlobalDesc,
...
@@ -144,26 +141,26 @@ __global__ void gridwise_convolution(InGlobalDesc,
// copy input tensor to LDS
// copy input tensor to LDS
blockwise_4d_tensor_op_binary
<
TFloat
,
blockwise_4d_tensor_op_binary
<
TFloat
,
decltype
(
in_block_src_desc
),
decltype
(
in_block_src_desc
),
decltype
(
in_block_
dst_
desc
),
decltype
(
in_block_desc
),
NBlockOpLen0
,
NBlockOpLen0
,
NBlockOpLen1
,
NBlockOpLen1
,
NBlockOpLen2
,
NBlockOpLen2
,
NBlockOpLen3
,
NBlockOpLen3
,
decltype
(
f_copy
),
decltype
(
f_copy
),
BlockSize
>
(
BlockSize
>
(
in_block_src_desc
,
in_block_src_desc
,
p_in_global
+
p_in_global
+
in_block_src
_desc
.
Get1dIndex
(
n_block_work_begin
,
in_global
_desc
.
Get1dIndex
(
n_block_work_begin
,
c_block_work_begin
,
c_block_work_begin
,
hi_block_work_begin
,
hi_block_work_begin
,
wi_block_work_begin
),
wi_block_work_begin
),
in_block_
dst_
desc
,
in_block_desc
,
p_in_block
,
p_in_block
,
f_copy
);
f_copy
);
// copy weight tensor to LDS
// copy weight tensor to LDS
blockwise_4d_tensor_op_binary
<
TFloat
,
blockwise_4d_tensor_op_binary
<
TFloat
,
decltype
(
wei_block_src_desc
),
decltype
(
wei_block_src_desc
),
decltype
(
wei_block_
dst_
desc
),
decltype
(
wei_block_desc
),
NBlockOpLen0
,
NBlockOpLen0
,
NBlockOpLen1
,
NBlockOpLen1
,
NBlockOpLen2
,
NBlockOpLen2
,
...
@@ -171,9 +168,8 @@ __global__ void gridwise_convolution(InGlobalDesc,
...
@@ -171,9 +168,8 @@ __global__ void gridwise_convolution(InGlobalDesc,
decltype
(
f_copy
),
decltype
(
f_copy
),
BlockSize
>
(
BlockSize
>
(
wei_block_src_desc
,
wei_block_src_desc
,
p_wei_global
+
p_wei_global
+
wei_global_desc
.
Get1dIndex
(
k_block_work_begin
,
c_block_work_begin
,
0
,
0
),
wei_block_src_desc
.
Get1dIndex
(
k_block_work_begin
,
c_block_work_begin
,
0
,
0
),
wei_block_desc
,
wei_block_dst_desc
,
p_wei_block
,
p_wei_block
,
f_copy
);
f_copy
);
...
@@ -183,17 +179,13 @@ __global__ void gridwise_convolution(InGlobalDesc,
...
@@ -183,17 +179,13 @@ __global__ void gridwise_convolution(InGlobalDesc,
// blockwise convolution
// blockwise convolution
blockwise_convolution
<
TFloat
,
blockwise_convolution
<
TFloat
,
decltype
(
in_block_
dst_
desc
),
decltype
(
in_block_desc
),
decltype
(
wei_block_
dst_
desc
),
decltype
(
wei_block_desc
),
decltype
(
out_block_
dst_
desc
),
decltype
(
out_block_desc
),
OutTileSizeH
,
OutTileSizeH
,
OutTileSizeW
,
OutTileSizeW
,
BlockSize
>
(
in_block_dst_desc
,
BlockSize
>
(
p_in_block
,
in_block_desc
,
p_in_block
,
wei_block_desc
,
p_wei_block
,
out_block_desc
,
p_out_block
);
wei_block_dst_desc
,
p_wei_block
,
out_block_dst_desc
,
p_out_block
);
#if 1
#if 1
__syncthreads
();
__syncthreads
();
...
@@ -202,7 +194,7 @@ __global__ void gridwise_convolution(InGlobalDesc,
...
@@ -202,7 +194,7 @@ __global__ void gridwise_convolution(InGlobalDesc,
// copy output tensor from LDS to device mem
// copy output tensor from LDS to device mem
blockwise_4d_tensor_op_binary
<
TFloat
,
blockwise_4d_tensor_op_binary
<
TFloat
,
decltype
(
out_block_
dst_
desc
),
decltype
(
out_block_desc
),
decltype
(
out_block_src_desc
),
decltype
(
out_block_src_desc
),
NBlockOpLen0
,
NBlockOpLen0
,
NBlockOpLen1
,
NBlockOpLen1
,
...
@@ -210,11 +202,11 @@ __global__ void gridwise_convolution(InGlobalDesc,
...
@@ -210,11 +202,11 @@ __global__ void gridwise_convolution(InGlobalDesc,
NBlockOpLen3
,
NBlockOpLen3
,
decltype
(
f_copy
),
decltype
(
f_copy
),
BlockSize
>
(
BlockSize
>
(
out_block_
dst_
desc
,
out_block_desc
,
p_out_block
,
p_out_block
,
out_block_src_desc
,
out_block_src_desc
,
p_out_global
+
p_out_global
+
out_
b
lo
ck_src
_desc
.
Get1dIndex
(
out_
g
lo
bal
_desc
.
Get1dIndex
(
n_block_work_begin
,
k_block_work_begin
,
ho_block_work_begin
,
wo_block_work_begin
),
n_block_work_begin
,
k_block_work_begin
,
ho_block_work_begin
,
wo_block_work_begin
),
f_copy
);
f_copy
);
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment