Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
adf4b173
Commit
adf4b173
authored
Nov 15, 2018
by
Chao Liu
Browse files
refactor
parent
99d05ba7
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
176 additions
and
172 deletions
+176
-172
driver/conv.cu
driver/conv.cu
+10
-10
src/include/blockwise_tensor_op.cuh
src/include/blockwise_tensor_op.cuh
+110
-106
src/include/direct_convolution_2.cuh
src/include/direct_convolution_2.cuh
+56
-56
No files found.
driver/conv.cu
View file @
adf4b173
...
@@ -140,10 +140,10 @@ void device_convolution(
...
@@ -140,10 +140,10 @@ void device_convolution(
constexpr
unsigned
YPerBlock
=
8
;
constexpr
unsigned
YPerBlock
=
8
;
constexpr
unsigned
XPerBlock
=
16
;
constexpr
unsigned
XPerBlock
=
16
;
constexpr
unsigned
NBlock
Copy
Len0
=
1
;
constexpr
unsigned
NBlock
Op
Len0
=
1
;
constexpr
unsigned
NBlock
Copy
Len1
=
1
;
constexpr
unsigned
NBlock
Op
Len1
=
1
;
constexpr
unsigned
NBlock
Copy
Len2
=
4
;
constexpr
unsigned
NBlock
Op
Len2
=
4
;
constexpr
unsigned
NBlock
Copy
Len3
=
32
;
constexpr
unsigned
NBlock
Op
Len3
=
32
;
constexpr
unsigned
BlockSize
=
128
;
constexpr
unsigned
BlockSize
=
128
;
...
@@ -174,10 +174,10 @@ void device_convolution(
...
@@ -174,10 +174,10 @@ void device_convolution(
CPerBlock
,
CPerBlock
,
YPerBlock
,
YPerBlock
,
XPerBlock
,
XPerBlock
,
NBlock
Copy
Len0
,
NBlock
Op
Len0
,
NBlock
Copy
Len1
,
NBlock
Op
Len1
,
NBlock
Copy
Len2
,
NBlock
Op
Len2
,
NBlock
Copy
Len3
,
NBlock
Op
Len3
,
BlockSize
,
BlockSize
,
GridSize
>
GridSize
>
<<<
grid_dim
,
block_dim
>>>
(
InDesc
{},
<<<
grid_dim
,
block_dim
>>>
(
InDesc
{},
...
@@ -248,7 +248,7 @@ int main()
...
@@ -248,7 +248,7 @@ int main()
int
num_thread
=
std
::
thread
::
hardware_concurrency
();
int
num_thread
=
std
::
thread
::
hardware_concurrency
();
#if
0
#if
1
in
.
GenerateTensorValue
(
GeneratorTensor
<
float
>
{},
num_thread
);
in
.
GenerateTensorValue
(
GeneratorTensor
<
float
>
{},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor
<
float
>
{},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor
<
float
>
{},
num_thread
);
out_host
.
GenerateTensorValue
(
GeneratorConstant
<
float
>
{
0
},
num_thread
);
out_host
.
GenerateTensorValue
(
GeneratorConstant
<
float
>
{
0
},
num_thread
);
...
@@ -258,7 +258,7 @@ int main()
...
@@ -258,7 +258,7 @@ int main()
device_convolution
(
in_desc
,
in
,
wei_desc
,
wei
,
out_desc
,
out_device
);
device_convolution
(
in_desc
,
in
,
wei_desc
,
wei
,
out_desc
,
out_device
);
#if
0
#if
1
host_convolution
(
in
,
wei
,
out_host
);
host_convolution
(
in
,
wei
,
out_host
);
float
error
=
0
;
float
error
=
0
;
...
...
src/include/blockwise_tensor_op.cuh
View file @
adf4b173
...
@@ -7,10 +7,10 @@
...
@@ -7,10 +7,10 @@
template
<
class
TFloat
,
template
<
class
TFloat
,
class
SrcDesc
,
class
SrcDesc
,
class
DstDesc
,
class
DstDesc
,
unsigned
N
Work
Len0
,
unsigned
N
BlockOp
Len0
,
unsigned
N
Work
Len1
,
unsigned
N
BlockOp
Len1
,
unsigned
N
Work
Len2
,
unsigned
N
BlockOp
Len2
,
unsigned
N
Work
Len3
,
unsigned
N
BlockOp
Len3
,
class
F
,
class
F
,
unsigned
BlockSize
>
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_op_binary
(
__device__
void
blockwise_4d_tensor_op_binary
(
...
@@ -67,10 +67,10 @@ __device__ void blockwise_4d_tensor_op_binary(
...
@@ -67,10 +67,10 @@ __device__ void blockwise_4d_tensor_op_binary(
template
<
class
TFloat
,
template
<
class
TFloat
,
class
SrcDesc
,
class
SrcDesc
,
class
DstDesc
,
class
DstDesc
,
unsigned
N
Work
Len0
,
unsigned
N
BlockOp
Len0
,
unsigned
N
Work
Len1
,
unsigned
N
BlockOp
Len1
,
unsigned
N
Work
Len2
,
unsigned
N
BlockOp
Len2
,
unsigned
N
Work
Len3
,
unsigned
N
BlockOp
Len3
,
class
F
,
class
F
,
unsigned
BlockSize
>
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_op_binary
(
__device__
void
blockwise_4d_tensor_op_binary
(
...
@@ -156,10 +156,10 @@ __device__ void blockwise_4d_tensor_op_binary(
...
@@ -156,10 +156,10 @@ __device__ void blockwise_4d_tensor_op_binary(
template
<
class
TFloat
,
template
<
class
TFloat
,
class
DstDesc
,
class
DstDesc
,
unsigned
N
Work
Len0
,
unsigned
N
BlockOp
Len0
,
unsigned
N
Work
Len1
,
unsigned
N
BlockOp
Len1
,
unsigned
N
Work
Len2
,
unsigned
N
BlockOp
Len2
,
unsigned
N
Work
Len3
,
unsigned
N
BlockOp
Len3
,
class
F
,
class
F
,
unsigned
BlockSize
>
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_op_unary
(
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
__device__
void
blockwise_4d_tensor_op_unary
(
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
...
@@ -240,10 +240,10 @@ __device__ void blockwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_dst
...
@@ -240,10 +240,10 @@ __device__ void blockwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_dst
template
<
class
TFloat
,
template
<
class
TFloat
,
class
SrcDesc
,
class
SrcDesc
,
class
DstDesc
,
class
DstDesc
,
unsigned
N
Work
Len0
,
unsigned
N
BlockOp
Len0
,
unsigned
N
Work
Len1
,
unsigned
N
BlockOp
Len1
,
unsigned
N
Work
Len2
,
unsigned
N
BlockOp
Len2
,
unsigned
N
Work
Len3
,
unsigned
N
BlockOp
Len3
,
class
F
,
class
F
,
unsigned
BlockSize
>
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_op_binary
(
__device__
void
blockwise_4d_tensor_op_binary
(
...
@@ -259,34 +259,34 @@ __device__ void blockwise_4d_tensor_op_binary(
...
@@ -259,34 +259,34 @@ __device__ void blockwise_4d_tensor_op_binary(
static_assert
(
is_same
<
decltype
(
src_desc
.
GetLengths
()),
decltype
(
dst_desc
.
GetLengths
())
>::
value
);
static_assert
(
is_same
<
decltype
(
src_desc
.
GetLengths
()),
decltype
(
dst_desc
.
GetLengths
())
>::
value
);
constexpr
unsigned
N
Work
Stride3
=
1
;
constexpr
unsigned
N
BlockOp
Stride3
=
1
;
constexpr
unsigned
N
Work
Stride2
=
N
Work
Len3
*
N
Work
Stride3
;
constexpr
unsigned
N
BlockOp
Stride2
=
N
BlockOp
Len3
*
N
BlockOp
Stride3
;
constexpr
unsigned
N
Work
Stride1
=
N
Work
Len2
*
N
Work
Stride2
;
constexpr
unsigned
N
BlockOp
Stride1
=
N
BlockOp
Len2
*
N
BlockOp
Stride2
;
constexpr
unsigned
N
Work
Stride0
=
N
Work
Len1
*
N
Work
Stride1
;
constexpr
unsigned
N
BlockOp
Stride0
=
N
BlockOp
Len1
*
N
BlockOp
Stride1
;
unsigned
itmp
=
threadIdx
.
x
;
unsigned
itmp
=
threadIdx
.
x
;
const
unsigned
did0_begin
=
itmp
/
N
Work
Stride0
;
const
unsigned
did0_begin
=
itmp
/
N
BlockOp
Stride0
;
itmp
-=
did0_begin
*
N
Work
Stride0
;
itmp
-=
did0_begin
*
N
BlockOp
Stride0
;
const
unsigned
did1_begin
=
itmp
/
N
Work
Stride1
;
const
unsigned
did1_begin
=
itmp
/
N
BlockOp
Stride1
;
itmp
-=
did1_begin
*
N
Work
Stride1
;
itmp
-=
did1_begin
*
N
BlockOp
Stride1
;
const
unsigned
did2_begin
=
itmp
/
N
Work
Stride2
;
const
unsigned
did2_begin
=
itmp
/
N
BlockOp
Stride2
;
itmp
-=
did2_begin
*
N
Work
Stride2
;
itmp
-=
did2_begin
*
N
BlockOp
Stride2
;
const
unsigned
did3_begin
=
itmp
/
N
Work
Stride3
;
const
unsigned
did3_begin
=
itmp
/
N
BlockOp
Stride3
;
for
(
unsigned
did0
=
did0_begin
;
did0
<
src_desc
.
GetLength
(
I0
);
did0
+=
N
Work
Len0
)
for
(
unsigned
did0
=
did0_begin
;
did0
<
src_desc
.
GetLength
(
I0
);
did0
+=
N
BlockOp
Len0
)
{
{
for
(
unsigned
did1
=
did1_begin
;
did1
<
src_desc
.
GetLength
(
I1
);
did1
+=
N
Work
Len1
)
for
(
unsigned
did1
=
did1_begin
;
did1
<
src_desc
.
GetLength
(
I1
);
did1
+=
N
BlockOp
Len1
)
{
{
for
(
unsigned
did2
=
did2_begin
;
did2
<
src_desc
.
GetLength
(
I2
);
did2
+=
N
Work
Len2
)
for
(
unsigned
did2
=
did2_begin
;
did2
<
src_desc
.
GetLength
(
I2
);
did2
+=
N
BlockOp
Len2
)
{
{
for
(
unsigned
did3
=
did3_begin
;
did3
<
src_desc
.
GetLength
(
I3
);
did3
+=
N
Work
Len3
)
for
(
unsigned
did3
=
did3_begin
;
did3
<
src_desc
.
GetLength
(
I3
);
did3
+=
N
BlockOp
Len3
)
{
{
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
...
@@ -304,10 +304,10 @@ __device__ void blockwise_4d_tensor_op_binary(
...
@@ -304,10 +304,10 @@ __device__ void blockwise_4d_tensor_op_binary(
template
<
class
TFloat
,
template
<
class
TFloat
,
class
SrcDesc
,
class
SrcDesc
,
class
DstDesc
,
class
DstDesc
,
unsigned
N
Work
Len0
,
unsigned
N
BlockOp
Len0
,
unsigned
N
Work
Len1
,
unsigned
N
BlockOp
Len1
,
unsigned
N
Work
Len2
,
unsigned
N
BlockOp
Len2
,
unsigned
N
Work
Len3
,
unsigned
N
BlockOp
Len3
,
class
F
,
class
F
,
unsigned
BlockSize
>
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_op_binary
(
__device__
void
blockwise_4d_tensor_op_binary
(
...
@@ -323,63 +323,63 @@ __device__ void blockwise_4d_tensor_op_binary(
...
@@ -323,63 +323,63 @@ __device__ void blockwise_4d_tensor_op_binary(
static_assert
(
is_same
<
decltype
(
src_desc
.
GetLengths
()),
decltype
(
dst_desc
.
GetLengths
())
>::
value
);
static_assert
(
is_same
<
decltype
(
src_desc
.
GetLengths
()),
decltype
(
dst_desc
.
GetLengths
())
>::
value
);
constexpr
unsigned
N
Work
Stride3
=
1
;
constexpr
unsigned
N
BlockOp
Stride3
=
1
;
constexpr
unsigned
N
Work
Stride2
=
N
Work
Len3
*
N
Work
Stride3
;
constexpr
unsigned
N
BlockOp
Stride2
=
N
BlockOp
Len3
*
N
BlockOp
Stride3
;
constexpr
unsigned
N
Work
Stride1
=
N
Work
Len2
*
N
Work
Stride2
;
constexpr
unsigned
N
BlockOp
Stride1
=
N
BlockOp
Len2
*
N
BlockOp
Stride2
;
constexpr
unsigned
N
Work
Stride0
=
N
Work
Len1
*
N
Work
Stride1
;
constexpr
unsigned
N
BlockOp
Stride0
=
N
BlockOp
Len1
*
N
BlockOp
Stride1
;
unsigned
itmp
=
threadIdx
.
x
;
unsigned
itmp
=
threadIdx
.
x
;
const
unsigned
did0_begin
=
itmp
/
N
Work
Stride0
;
const
unsigned
did0_begin
=
itmp
/
N
BlockOp
Stride0
;
itmp
-=
did0_begin
*
N
Work
Stride0
;
itmp
-=
did0_begin
*
N
BlockOp
Stride0
;
const
unsigned
did1_begin
=
itmp
/
N
Work
Stride1
;
const
unsigned
did1_begin
=
itmp
/
N
BlockOp
Stride1
;
itmp
-=
did1_begin
*
N
Work
Stride1
;
itmp
-=
did1_begin
*
N
BlockOp
Stride1
;
const
unsigned
did2_begin
=
itmp
/
N
Work
Stride2
;
const
unsigned
did2_begin
=
itmp
/
N
BlockOp
Stride2
;
itmp
-=
did2_begin
*
N
Work
Stride2
;
itmp
-=
did2_begin
*
N
BlockOp
Stride2
;
const
unsigned
did3_begin
=
itmp
/
N
Work
Stride3
;
const
unsigned
did3_begin
=
itmp
/
N
BlockOp
Stride3
;
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0_begin
,
did1_begin
,
did2_begin
,
did3_begin
);
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0_begin
,
did1_begin
,
did2_begin
,
did3_begin
);
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0_begin
,
did1_begin
,
did2_begin
,
did3_begin
);
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0_begin
,
did1_begin
,
did2_begin
,
did3_begin
);
for
(
unsigned
did0
=
did0_begin
;
did0
<
src_desc
.
GetLength
(
I0
);
did0
+=
N
Work
Len0
)
for
(
unsigned
did0
=
did0_begin
;
did0
<
src_desc
.
GetLength
(
I0
);
did0
+=
N
BlockOp
Len0
)
{
{
const
unsigned
sindex_save0
=
sindex
;
const
unsigned
sindex_save0
=
sindex
;
const
unsigned
dindex_save0
=
dindex
;
const
unsigned
dindex_save0
=
dindex
;
for
(
unsigned
did1
=
did1_begin
;
did1
<
src_desc
.
GetLength
(
I1
);
did1
+=
N
Work
Len1
)
for
(
unsigned
did1
=
did1_begin
;
did1
<
src_desc
.
GetLength
(
I1
);
did1
+=
N
BlockOp
Len1
)
{
{
const
unsigned
sindex_save1
=
sindex
;
const
unsigned
sindex_save1
=
sindex
;
const
unsigned
dindex_save1
=
dindex
;
const
unsigned
dindex_save1
=
dindex
;
for
(
unsigned
did2
=
did2_begin
;
did2
<
src_desc
.
GetLength
(
I2
);
did2
+=
N
Work
Len2
)
for
(
unsigned
did2
=
did2_begin
;
did2
<
src_desc
.
GetLength
(
I2
);
did2
+=
N
BlockOp
Len2
)
{
{
const
unsigned
sindex_save2
=
sindex
;
const
unsigned
sindex_save2
=
sindex
;
const
unsigned
dindex_save2
=
dindex
;
const
unsigned
dindex_save2
=
dindex
;
for
(
unsigned
did3
=
did3_begin
;
did3
<
src_desc
.
GetLength
(
I3
);
did3
+=
N
Work
Len3
)
for
(
unsigned
did3
=
did3_begin
;
did3
<
src_desc
.
GetLength
(
I3
);
did3
+=
N
BlockOp
Len3
)
{
{
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
sindex
+=
N
Work
Len3
*
src_desc
.
GetStride
(
I3
);
sindex
+=
N
BlockOp
Len3
*
src_desc
.
GetStride
(
I3
);
dindex
+=
N
Work
Len3
*
dst_desc
.
GetStride
(
I3
);
dindex
+=
N
BlockOp
Len3
*
dst_desc
.
GetStride
(
I3
);
}
}
sindex
=
sindex_save2
+
N
Work
Len2
*
src_desc
.
GetStride
(
I2
);
sindex
=
sindex_save2
+
N
BlockOp
Len2
*
src_desc
.
GetStride
(
I2
);
dindex
=
dindex_save2
+
N
Work
Len2
*
dst_desc
.
GetStride
(
I2
);
dindex
=
dindex_save2
+
N
BlockOp
Len2
*
dst_desc
.
GetStride
(
I2
);
}
}
sindex
=
sindex_save1
+
N
Work
Len1
*
src_desc
.
GetStride
(
I1
);
sindex
=
sindex_save1
+
N
BlockOp
Len1
*
src_desc
.
GetStride
(
I1
);
dindex
=
dindex_save1
+
N
Work
Len1
*
dst_desc
.
GetStride
(
I1
);
dindex
=
dindex_save1
+
N
BlockOp
Len1
*
dst_desc
.
GetStride
(
I1
);
}
}
sindex
=
sindex_save0
+
N
Work
Len0
*
src_desc
.
GetStride
(
I0
);
sindex
=
sindex_save0
+
N
BlockOp
Len0
*
src_desc
.
GetStride
(
I0
);
dindex
=
dindex_save0
+
N
Work
Len0
*
dst_desc
.
GetStride
(
I0
);
dindex
=
dindex_save0
+
N
BlockOp
Len0
*
dst_desc
.
GetStride
(
I0
);
}
}
}
}
#endif
#endif
...
@@ -388,10 +388,10 @@ __device__ void blockwise_4d_tensor_op_binary(
...
@@ -388,10 +388,10 @@ __device__ void blockwise_4d_tensor_op_binary(
template
<
class
TFloat
,
template
<
class
TFloat
,
class
SrcDesc
,
class
SrcDesc
,
class
DstDesc
,
class
DstDesc
,
unsigned
N
Work
Len0
,
unsigned
N
BlockOp
Len0
,
unsigned
N
Work
Len1
,
unsigned
N
BlockOp
Len1
,
unsigned
N
Work
Len2
,
unsigned
N
BlockOp
Len2
,
unsigned
N
Work
Len3
,
unsigned
N
BlockOp
Len3
,
class
F
,
class
F
,
unsigned
BlockSize
>
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_op_binary
(
__device__
void
blockwise_4d_tensor_op_binary
(
...
@@ -407,65 +407,69 @@ __device__ void blockwise_4d_tensor_op_binary(
...
@@ -407,65 +407,69 @@ __device__ void blockwise_4d_tensor_op_binary(
static_assert
(
is_same
<
decltype
(
src_desc
.
GetLengths
()),
decltype
(
dst_desc
.
GetLengths
())
>::
value
);
static_assert
(
is_same
<
decltype
(
src_desc
.
GetLengths
()),
decltype
(
dst_desc
.
GetLengths
())
>::
value
);
constexpr
unsigned
N
Work
Stride3
=
1
;
constexpr
unsigned
N
BlockOp
Stride3
=
1
;
constexpr
unsigned
N
Work
Stride2
=
N
Work
Len3
*
N
Work
Stride3
;
constexpr
unsigned
N
BlockOp
Stride2
=
N
BlockOp
Len3
*
N
BlockOp
Stride3
;
constexpr
unsigned
N
Work
Stride1
=
N
Work
Len2
*
N
Work
Stride2
;
constexpr
unsigned
N
BlockOp
Stride1
=
N
BlockOp
Len2
*
N
BlockOp
Stride2
;
constexpr
unsigned
N
Work
Stride0
=
N
Work
Len1
*
N
Work
Stride1
;
constexpr
unsigned
N
BlockOp
Stride0
=
N
BlockOp
Len1
*
N
BlockOp
Stride1
;
unsigned
itmp
=
threadIdx
.
x
;
unsigned
itmp
=
threadIdx
.
x
;
const
unsigned
did0_begin
=
itmp
/
N
Work
Stride0
;
const
unsigned
did0_begin
=
itmp
/
N
BlockOp
Stride0
;
itmp
-=
did0_begin
*
N
Work
Stride0
;
itmp
-=
did0_begin
*
N
BlockOp
Stride0
;
const
unsigned
did1_begin
=
itmp
/
N
Work
Stride1
;
const
unsigned
did1_begin
=
itmp
/
N
BlockOp
Stride1
;
itmp
-=
did1_begin
*
N
Work
Stride1
;
itmp
-=
did1_begin
*
N
BlockOp
Stride1
;
const
unsigned
did2_begin
=
itmp
/
N
Work
Stride2
;
const
unsigned
did2_begin
=
itmp
/
N
BlockOp
Stride2
;
itmp
-=
did2_begin
*
N
Work
Stride2
;
itmp
-=
did2_begin
*
N
BlockOp
Stride2
;
const
unsigned
did3_begin
=
itmp
/
N
Work
Stride3
;
const
unsigned
did3_begin
=
itmp
/
N
BlockOp
Stride3
;
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0_begin
,
did1_begin
,
did2_begin
,
did3_begin
);
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0_begin
,
did1_begin
,
did2_begin
,
did3_begin
);
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0_begin
,
did1_begin
,
did2_begin
,
did3_begin
);
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0_begin
,
did1_begin
,
did2_begin
,
did3_begin
);
for
(
unsigned
did0
=
did0_begin
;
did0
<
src_desc
.
GetLength
(
I0
);
did0
+=
N
Work
Len0
)
for
(
unsigned
did0
=
did0_begin
;
did0
<
src_desc
.
GetLength
(
I0
);
did0
+=
N
BlockOp
Len0
)
{
{
unsigned
i1
=
0
;
unsigned
i1
=
0
;
for
(
unsigned
did1
=
did1_begin
;
did1
<
src_desc
.
GetLength
(
I1
);
did1
+=
N
Work
Len1
)
for
(
unsigned
did1
=
did1_begin
;
did1
<
src_desc
.
GetLength
(
I1
);
did1
+=
N
BlockOp
Len1
)
{
{
unsigned
i2
=
0
;
unsigned
i2
=
0
;
for
(
unsigned
did2
=
did2_begin
;
did2
<
src_desc
.
GetLength
(
I2
);
did2
+=
N
Work
Len2
)
for
(
unsigned
did2
=
did2_begin
;
did2
<
src_desc
.
GetLength
(
I2
);
did2
+=
N
BlockOp
Len2
)
{
{
unsigned
i3
=
0
;
unsigned
i3
=
0
;
for
(
unsigned
did3
=
did3_begin
;
did3
<
src_desc
.
GetLength
(
I3
);
did3
+=
N
Work
Len3
)
for
(
unsigned
did3
=
did3_begin
;
did3
<
src_desc
.
GetLength
(
I3
);
did3
+=
N
BlockOp
Len3
)
{
{
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
sindex
+=
N
Work
Len3
*
src_desc
.
GetStride
(
I3
);
sindex
+=
N
BlockOp
Len3
*
src_desc
.
GetStride
(
I3
);
dindex
+=
N
Work
Len3
*
dst_desc
.
GetStride
(
I3
);
dindex
+=
N
BlockOp
Len3
*
dst_desc
.
GetStride
(
I3
);
++
i3
;
++
i3
;
}
}
sindex
+=
sindex
+=
NBlockOpLen2
*
src_desc
.
GetStride
(
I2
)
-
NWorkLen2
*
src_desc
.
GetStride
(
I2
)
-
i3
*
NWork
Len3
*
src_desc
.
GetStride
(
I3
);
i3
*
NBlockOp
Len3
*
src_desc
.
GetStride
(
I3
);
dindex
+=
dindex
+=
NBlockOpLen2
*
dst_desc
.
GetStride
(
I2
)
-
NWorkLen2
*
dst_desc
.
GetStride
(
I2
)
-
i3
*
NWork
Len3
*
dst_desc
.
GetStride
(
I3
);
i3
*
NBlockOp
Len3
*
dst_desc
.
GetStride
(
I3
);
++
i2
;
++
i2
;
}
}
sindex
+=
NWorkLen1
*
src_desc
.
GetStride
(
I1
)
-
i2
*
NWorkLen2
*
src_desc
.
GetStride
(
I2
);
sindex
+=
dindex
+=
NWorkLen1
*
dst_desc
.
GetStride
(
I1
)
-
i2
*
NWorkLen2
*
dst_desc
.
GetStride
(
I2
);
NBlockOpLen1
*
src_desc
.
GetStride
(
I1
)
-
i2
*
NBlockOpLen2
*
src_desc
.
GetStride
(
I2
);
dindex
+=
NBlockOpLen1
*
dst_desc
.
GetStride
(
I1
)
-
i2
*
NBlockOpLen2
*
dst_desc
.
GetStride
(
I2
);
++
i1
;
++
i1
;
}
}
sindex
+=
NWorkLen0
*
src_desc
.
GetStride
(
I0
)
-
i1
*
NWorkLen1
*
src_desc
.
GetStride
(
I1
);
sindex
+=
dindex
+=
NWorkLen0
*
dst_desc
.
GetStride
(
I0
)
-
i1
*
NWorkLen1
*
dst_desc
.
GetStride
(
I1
);
NBlockOpLen0
*
src_desc
.
GetStride
(
I0
)
-
i1
*
NBlockOpLen1
*
src_desc
.
GetStride
(
I1
);
dindex
+=
NBlockOpLen0
*
dst_desc
.
GetStride
(
I0
)
-
i1
*
NBlockOpLen1
*
dst_desc
.
GetStride
(
I1
);
}
}
}
}
#endif
#endif
...
@@ -474,10 +478,10 @@ __device__ void blockwise_4d_tensor_op_binary(
...
@@ -474,10 +478,10 @@ __device__ void blockwise_4d_tensor_op_binary(
template
<
class
TFloat
,
template
<
class
TFloat
,
class
SrcDesc
,
class
SrcDesc
,
class
DstDesc
,
class
DstDesc
,
unsigned
N
Work
Len0
,
unsigned
N
BlockOp
Len0
,
unsigned
N
Work
Len1
,
unsigned
N
BlockOp
Len1
,
unsigned
N
Work
Len2
,
unsigned
N
BlockOp
Len2
,
unsigned
N
Work
Len3
,
unsigned
N
BlockOp
Len3
,
class
F
,
class
F
,
unsigned
BlockSize
>
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_op_binary
(
__device__
void
blockwise_4d_tensor_op_binary
(
...
@@ -493,34 +497,34 @@ __device__ void blockwise_4d_tensor_op_binary(
...
@@ -493,34 +497,34 @@ __device__ void blockwise_4d_tensor_op_binary(
static_assert
(
is_same
<
decltype
(
src_desc
.
GetLengths
()),
decltype
(
dst_desc
.
GetLengths
())
>::
value
);
static_assert
(
is_same
<
decltype
(
src_desc
.
GetLengths
()),
decltype
(
dst_desc
.
GetLengths
())
>::
value
);
constexpr
unsigned
N
Work
Stride3
=
1
;
constexpr
unsigned
N
BlockOp
Stride3
=
1
;
constexpr
unsigned
N
Work
Stride2
=
N
Work
Len3
*
N
Work
Stride3
;
constexpr
unsigned
N
BlockOp
Stride2
=
N
BlockOp
Len3
*
N
BlockOp
Stride3
;
constexpr
unsigned
N
Work
Stride1
=
N
Work
Len2
*
N
Work
Stride2
;
constexpr
unsigned
N
BlockOp
Stride1
=
N
BlockOp
Len2
*
N
BlockOp
Stride2
;
constexpr
unsigned
N
Work
Stride0
=
N
Work
Len1
*
N
Work
Stride1
;
constexpr
unsigned
N
BlockOp
Stride0
=
N
BlockOp
Len1
*
N
BlockOp
Stride1
;
unsigned
itmp
=
threadIdx
.
x
;
unsigned
itmp
=
threadIdx
.
x
;
const
unsigned
did0_begin
=
itmp
/
N
Work
Stride0
;
const
unsigned
did0_begin
=
itmp
/
N
BlockOp
Stride0
;
itmp
-=
did0_begin
*
N
Work
Stride0
;
itmp
-=
did0_begin
*
N
BlockOp
Stride0
;
const
unsigned
did1_begin
=
itmp
/
N
Work
Stride1
;
const
unsigned
did1_begin
=
itmp
/
N
BlockOp
Stride1
;
itmp
-=
did1_begin
*
N
Work
Stride1
;
itmp
-=
did1_begin
*
N
BlockOp
Stride1
;
const
unsigned
did2_begin
=
itmp
/
N
Work
Stride2
;
const
unsigned
did2_begin
=
itmp
/
N
BlockOp
Stride2
;
itmp
-=
did2_begin
*
N
Work
Stride2
;
itmp
-=
did2_begin
*
N
BlockOp
Stride2
;
const
unsigned
did3_begin
=
itmp
/
N
Work
Stride3
;
const
unsigned
did3_begin
=
itmp
/
N
BlockOp
Stride3
;
for
(
unsigned
did0
=
did0_begin
;
did0
<
src_desc
.
GetLength
(
I0
);
did0
+=
N
Work
Len0
)
for
(
unsigned
did0
=
did0_begin
;
did0
<
src_desc
.
GetLength
(
I0
);
did0
+=
N
BlockOp
Len0
)
{
{
for
(
unsigned
did1
=
did1_begin
;
did1
<
src_desc
.
GetLength
(
I1
);
did1
+=
N
Work
Len1
)
for
(
unsigned
did1
=
did1_begin
;
did1
<
src_desc
.
GetLength
(
I1
);
did1
+=
N
BlockOp
Len1
)
{
{
for
(
unsigned
did2
=
did2_begin
;
did2
<
src_desc
.
GetLength
(
I2
);
did2
+=
N
Work
Len2
)
for
(
unsigned
did2
=
did2_begin
;
did2
<
src_desc
.
GetLength
(
I2
);
did2
+=
N
BlockOp
Len2
)
{
{
for
(
unsigned
did3
=
did3_begin
;
did3
<
src_desc
.
GetLength
(
I3
);
did3
+=
N
Work
Len3
)
for
(
unsigned
did3
=
did3_begin
;
did3
<
src_desc
.
GetLength
(
I3
);
did3
+=
N
BlockOp
Len3
)
{
{
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
...
...
src/include/direct_convolution_2.cuh
View file @
adf4b173
...
@@ -12,11 +12,11 @@ template <class TFloat,
...
@@ -12,11 +12,11 @@ template <class TFloat,
unsigned
OutTileSizeW
,
unsigned
OutTileSizeW
,
unsigned
BlockSize
>
unsigned
BlockSize
>
__device__
void
blockwise_convolution
(
InDesc
,
__device__
void
blockwise_convolution
(
InDesc
,
TFloat
*
const
__restrict__
p_in
,
TFloat
*
const
__restrict__
p_in
_lds
,
WeiDesc
,
WeiDesc
,
TFloat
*
const
__restrict__
p_wei
,
TFloat
*
const
__restrict__
p_wei
_lds
,
OutDesc
,
OutDesc
,
TFloat
*
__restrict__
p_out
)
TFloat
*
__restrict__
p_out
_lds
)
{
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
...
@@ -97,8 +97,8 @@ __device__ void blockwise_convolution(InDesc,
...
@@ -97,8 +97,8 @@ __device__ void blockwise_convolution(InDesc,
decltype
(
in_thread_dst_desc
),
decltype
(
in_thread_dst_desc
),
decltype
(
f_copy
)
>
(
decltype
(
f_copy
)
>
(
in_thread_src_desc
,
in_thread_src_desc
,
p_in
+
in_desc
.
Get1dIndex
(
p_in
_lds
+
in_desc
.
Get1dIndex
(
n_thread_work_begin
,
0
,
hi_thread_work_begin
,
wi_thread_work_begin
),
n_thread_work_begin
,
0
,
hi_thread_work_begin
,
wi_thread_work_begin
),
in_thread_dst_desc
,
in_thread_dst_desc
,
p_in_thread
,
p_in_thread
,
f_copy
);
f_copy
);
...
@@ -112,7 +112,7 @@ __device__ void blockwise_convolution(InDesc,
...
@@ -112,7 +112,7 @@ __device__ void blockwise_convolution(InDesc,
decltype
(
wei_thread_dst_desc
),
decltype
(
wei_thread_dst_desc
),
decltype
(
f_copy
)
>
(
decltype
(
f_copy
)
>
(
wei_thread_src_desc
,
wei_thread_src_desc
,
p_wei
+
wei_desc
.
Get1dIndex
(
k_thread_work_begin
,
0
,
0
,
0
),
p_wei
_lds
+
wei_desc
.
Get1dIndex
(
k_thread_work_begin
,
0
,
0
,
0
),
wei_thread_dst_desc
,
wei_thread_dst_desc
,
p_wei_thread
,
p_wei_thread
,
f_copy
);
f_copy
);
...
@@ -123,10 +123,10 @@ __device__ void blockwise_convolution(InDesc,
...
@@ -123,10 +123,10 @@ __device__ void blockwise_convolution(InDesc,
decltype
(
out_thread_dst_desc
),
decltype
(
out_thread_dst_desc
),
decltype
(
f_copy
)
>
(
decltype
(
f_copy
)
>
(
out_thread_src_desc
,
out_thread_src_desc
,
p_out
+
out_desc
.
Get1dIndex
(
n_thread_work_begin
,
p_out
_lds
+
out_desc
.
Get1dIndex
(
n_thread_work_begin
,
k_thread_work_begin
,
k_thread_work_begin
,
ho_thread_work_begin
,
ho_thread_work_begin
,
wo_thread_work_begin
),
wo_thread_work_begin
),
out_thread_dst_desc
,
out_thread_dst_desc
,
p_out_thread
,
p_out_thread
,
f_copy
);
f_copy
);
...
@@ -150,10 +150,10 @@ __device__ void blockwise_convolution(InDesc,
...
@@ -150,10 +150,10 @@ __device__ void blockwise_convolution(InDesc,
out_thread_dst_desc
,
out_thread_dst_desc
,
p_out_thread
,
p_out_thread
,
out_thread_src_desc
,
out_thread_src_desc
,
p_out
+
out_desc
.
Get1dIndex
(
n_thread_work_begin
,
p_out
_lds
+
out_desc
.
Get1dIndex
(
n_thread_work_begin
,
k_thread_work_begin
,
k_thread_work_begin
,
ho_thread_work_begin
,
ho_thread_work_begin
,
wo_thread_work_begin
),
wo_thread_work_begin
),
f_copy
);
f_copy
);
}
}
}
}
...
@@ -170,18 +170,18 @@ template <class TFloat,
...
@@ -170,18 +170,18 @@ template <class TFloat,
unsigned
CPerBlock
,
unsigned
CPerBlock
,
unsigned
YPerBlock
,
unsigned
YPerBlock
,
unsigned
XPerBlock
,
unsigned
XPerBlock
,
unsigned
NBlock
Copy
Len0
,
unsigned
NBlock
Op
Len0
,
unsigned
NBlock
Copy
Len1
,
unsigned
NBlock
Op
Len1
,
unsigned
NBlock
Copy
Len2
,
unsigned
NBlock
Op
Len2
,
unsigned
NBlock
Copy
Len3
,
unsigned
NBlock
Op
Len3
,
unsigned
BlockSize
,
unsigned
BlockSize
,
unsigned
GridSize
>
unsigned
GridSize
>
__global__
void
gridwise_convolution
(
InDesc
,
__global__
void
gridwise_convolution
(
InDesc
,
TFloat
*
const
__restrict__
p_in
,
TFloat
*
const
__restrict__
p_in
_glb
,
WeiDesc
,
WeiDesc
,
TFloat
*
const
__restrict__
p_wei
,
TFloat
*
const
__restrict__
p_wei
_glb
,
OutDesc
,
OutDesc
,
TFloat
*
__restrict__
p_out
)
TFloat
*
__restrict__
p_out
_glb
)
{
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
...
@@ -222,13 +222,13 @@ __global__ void gridwise_convolution(InDesc,
...
@@ -222,13 +222,13 @@ __global__ void gridwise_convolution(InDesc,
constexpr
auto
out_block_lds_desc
=
constexpr
auto
out_block_lds_desc
=
make_ConstantTensorDescriptor
(
out_block_glb_desc
.
GetLengths
());
make_ConstantTensorDescriptor
(
out_block_glb_desc
.
GetLengths
());
constexpr
unsigned
in_block_size
=
in_block_lds_desc
.
GetElementS
iz
e
();
constexpr
unsigned
in_block_size
=
in_block_lds_desc
.
GetElementS
pac
e
();
constexpr
unsigned
wei_block_size
=
wei_block_lds_desc
.
GetElementS
iz
e
();
constexpr
unsigned
wei_block_size
=
wei_block_lds_desc
.
GetElementS
pac
e
();
constexpr
unsigned
out_block_size
=
out_block_lds_desc
.
GetElementS
iz
e
();
constexpr
unsigned
out_block_size
=
out_block_lds_desc
.
GetElementS
pac
e
();
__shared__
TFloat
p_in_block
[
in_block_size
];
__shared__
TFloat
p_in_block
_lds
[
in_block_size
];
__shared__
TFloat
p_wei_block
[
wei_block_size
];
__shared__
TFloat
p_wei_block
_lds
[
wei_block_size
];
__shared__
TFloat
p_out_block
[
out_block_size
];
__shared__
TFloat
p_out_block
_lds
[
out_block_size
];
const
unsigned
block_id
=
blockIdx
.
x
;
const
unsigned
block_id
=
blockIdx
.
x
;
...
@@ -286,12 +286,12 @@ __global__ void gridwise_convolution(InDesc,
...
@@ -286,12 +286,12 @@ __global__ void gridwise_convolution(InDesc,
// set output tensor in LDS to 0
// set output tensor in LDS to 0
blockwise_4d_tensor_op_unary
<
TFloat
,
blockwise_4d_tensor_op_unary
<
TFloat
,
decltype
(
out_block_lds_desc
),
decltype
(
out_block_lds_desc
),
NBlock
Copy
Len0
,
NBlock
Op
Len0
,
NBlock
Copy
Len1
,
NBlock
Op
Len1
,
NBlock
Copy
Len2
,
NBlock
Op
Len2
,
NBlock
Copy
Len3
,
NBlock
Op
Len3
,
decltype
(
f_set0
),
decltype
(
f_set0
),
BlockSize
>
(
out_block_lds_desc
,
p_out_block
,
f_set0
);
BlockSize
>
(
out_block_lds_desc
,
p_out_block
_lds
,
f_set0
);
for
(
unsigned
c_block_work_begin
=
0
;
c_block_work_begin
<
in_desc
.
GetLength
(
I1
);
for
(
unsigned
c_block_work_begin
=
0
;
c_block_work_begin
<
in_desc
.
GetLength
(
I1
);
c_block_work_begin
+=
CPerBlock
)
c_block_work_begin
+=
CPerBlock
)
...
@@ -301,35 +301,35 @@ __global__ void gridwise_convolution(InDesc,
...
@@ -301,35 +301,35 @@ __global__ void gridwise_convolution(InDesc,
blockwise_4d_tensor_op_binary
<
TFloat
,
blockwise_4d_tensor_op_binary
<
TFloat
,
decltype
(
in_block_glb_desc
),
decltype
(
in_block_glb_desc
),
decltype
(
in_block_lds_desc
),
decltype
(
in_block_lds_desc
),
NBlock
Copy
Len0
,
NBlock
Op
Len0
,
NBlock
Copy
Len1
,
NBlock
Op
Len1
,
NBlock
Copy
Len2
,
NBlock
Op
Len2
,
NBlock
Copy
Len3
,
NBlock
Op
Len3
,
decltype
(
f_copy
),
decltype
(
f_copy
),
BlockSize
>
(
BlockSize
>
(
in_block_glb_desc
,
in_block_glb_desc
,
p_in
+
in_block_glb_desc
.
Get1dIndex
(
n_block_work_begin
,
p_in
_glb
+
in_block_glb_desc
.
Get1dIndex
(
n_block_work_begin
,
c_block_work_begin
,
c_block_work_begin
,
hi_block_work_begin
,
hi_block_work_begin
,
wi_block_work_begin
),
wi_block_work_begin
),
in_block_lds_desc
,
in_block_lds_desc
,
p_in_block
,
p_in_block
_lds
,
f_copy
);
f_copy
);
// copy weight tensor to LDS
// copy weight tensor to LDS
blockwise_4d_tensor_op_binary
<
TFloat
,
blockwise_4d_tensor_op_binary
<
TFloat
,
decltype
(
wei_block_glb_desc
),
decltype
(
wei_block_glb_desc
),
decltype
(
wei_block_lds_desc
),
decltype
(
wei_block_lds_desc
),
NBlock
Copy
Len0
,
NBlock
Op
Len0
,
NBlock
Copy
Len1
,
NBlock
Op
Len1
,
NBlock
Copy
Len2
,
NBlock
Op
Len2
,
NBlock
Copy
Len3
,
NBlock
Op
Len3
,
decltype
(
f_copy
),
decltype
(
f_copy
),
BlockSize
>
(
BlockSize
>
(
wei_block_glb_desc
,
wei_block_glb_desc
,
p_wei
+
wei_block_glb_desc
.
Get1dIndex
(
k_block_work_begin
,
c_block_work_begin
,
0
,
0
),
p_wei
_glb
+
wei_block_glb_desc
.
Get1dIndex
(
k_block_work_begin
,
c_block_work_begin
,
0
,
0
),
wei_block_lds_desc
,
wei_block_lds_desc
,
p_wei_block
,
p_wei_block
_lds
,
f_copy
);
f_copy
);
#if 1
#if 1
...
@@ -344,11 +344,11 @@ __global__ void gridwise_convolution(InDesc,
...
@@ -344,11 +344,11 @@ __global__ void gridwise_convolution(InDesc,
OutTileSizeH
,
OutTileSizeH
,
OutTileSizeW
,
OutTileSizeW
,
BlockSize
>
(
in_block_lds_desc
,
BlockSize
>
(
in_block_lds_desc
,
p_in_block
,
p_in_block
_lds
,
wei_block_lds_desc
,
wei_block_lds_desc
,
p_wei_block
,
p_wei_block
_lds
,
out_block_lds_desc
,
out_block_lds_desc
,
p_out_block
);
p_out_block
_lds
);
#if 1
#if 1
__syncthreads
();
__syncthreads
();
...
@@ -359,16 +359,16 @@ __global__ void gridwise_convolution(InDesc,
...
@@ -359,16 +359,16 @@ __global__ void gridwise_convolution(InDesc,
blockwise_4d_tensor_op_binary
<
TFloat
,
blockwise_4d_tensor_op_binary
<
TFloat
,
decltype
(
out_block_lds_desc
),
decltype
(
out_block_lds_desc
),
decltype
(
out_block_glb_desc
),
decltype
(
out_block_glb_desc
),
NBlock
Copy
Len0
,
NBlock
Op
Len0
,
NBlock
Copy
Len1
,
NBlock
Op
Len1
,
NBlock
Copy
Len2
,
NBlock
Op
Len2
,
NBlock
Copy
Len3
,
NBlock
Op
Len3
,
decltype
(
f_copy
),
decltype
(
f_copy
),
BlockSize
>
(
BlockSize
>
(
out_block_lds_desc
,
out_block_lds_desc
,
p_out_block
,
p_out_block
_lds
,
out_block_glb_desc
,
out_block_glb_desc
,
p_out
+
p_out
_glb
+
out_block_glb_desc
.
Get1dIndex
(
out_block_glb_desc
.
Get1dIndex
(
n_block_work_begin
,
k_block_work_begin
,
ho_block_work_begin
,
wo_block_work_begin
),
n_block_work_begin
,
k_block_work_begin
,
ho_block_work_begin
,
wo_block_work_begin
),
f_copy
);
f_copy
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment