Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
9657baec
Commit
9657baec
authored
Nov 02, 2018
by
Chao Liu
Browse files
initial direct conv correct run
parent
dfa02139
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
327 additions
and
94 deletions
+327
-94
driver/conv.cu
driver/conv.cu
+43
-21
src/include/device_tensor.cuh
src/include/device_tensor.cuh
+3
-4
src/include/direct_convolution.cuh
src/include/direct_convolution.cuh
+281
-69
No files found.
driver/conv.cu
View file @
9657baec
...
@@ -7,21 +7,35 @@
...
@@ -7,21 +7,35 @@
#include "direct_convolution.cuh"
#include "direct_convolution.cuh"
template
<
class
T
>
template
<
class
T
>
struct
Generator
struct
Generator
Constant
{
{
T
value
=
0
;
T
value
=
0
;
template
<
class
...
Is
>
template
<
class
...
Is
>
T
operator
()(
Is
...
is
)
T
operator
()(
Is
...
is
)
{
{
#if 0
return
value
;
return
value
;
#else
}
};
template
<
class
T
>
struct
GeneratorTensor
{
template
<
class
...
Is
>
T
operator
()(
Is
...
is
)
{
#if 0
std::initializer_list<std::size_t> ls = {static_cast<std::size_t>(is)...};
std::initializer_list<std::size_t> ls = {static_cast<std::size_t>(is)...};
return std::accumulate(ls.begin(), ls.end(), std::size_t(0));
return std::accumulate(ls.begin(), ls.end(), std::size_t(0));
#else
assert
(
sizeof
...(
Is
)
>
0
);
std
::
initializer_list
<
std
::
size_t
>
ids
=
{
static_cast
<
std
::
size_t
>
(
is
)...};
std
::
vector
<
std
::
size_t
>
lens
(
sizeof
...(
Is
),
100
);
std
::
vector
<
std
::
size_t
>
strides
(
sizeof
...(
Is
),
1
);
std
::
partial_sum
(
lens
.
rbegin
(),
lens
.
rbegin
()
+
(
sizeof
...(
Is
)
-
1
),
strides
.
rbegin
()
+
1
);
return
std
::
inner_product
(
ids
.
begin
(),
ids
.
end
(),
strides
.
begin
(),
std
::
size_t
(
0
))
+
1
;
#endif
#endif
}
}
};
};
template
<
typename
T
>
template
<
typename
T
>
...
@@ -57,18 +71,22 @@ void host_convolution(const Tensor<T>& in,
...
@@ -57,18 +71,22 @@ void host_convolution(const Tensor<T>& in,
}
}
template
<
class
T
>
template
<
class
T
>
void
device_convolution
(
Tensor
<
T
>&
in
,
Tensor
<
T
>&
wei
,
Tensor
<
T
>&
out
)
void
device_convolution
(
const
Tensor
<
T
>&
in
,
const
Tensor
<
T
>&
wei
,
Tensor
<
T
>&
out
)
{
{
DeviceTensorDescriptor
<
4
>
in_desc_device
(
in
.
mDesc
);
DeviceTensorDescriptor
<
4
>
in_desc_device
(
in
.
mDesc
);
DeviceTensorDescriptor
<
4
>
wei_desc_device
(
wei
.
mDesc
);
DeviceTensorDescriptor
<
4
>
wei_desc_device
(
wei
.
mDesc
);
DeviceTensorDescriptor
<
4
>
out_desc_device
(
out
.
mDesc
);
DeviceTensorDescriptor
<
4
>
out_desc_device
(
out
.
mDesc
);
printf
(
"__func__: in_desc_device: %u %u %u %u
\n
"
,
printf
(
"__func__: in_desc_device:
{
%u %u %u %u
}, {%u %u %u %u}
\n
"
,
in_desc_device
.
GetLength
(
0
),
in_desc_device
.
GetLength
(
0
),
in_desc_device
.
GetLength
(
1
),
in_desc_device
.
GetLength
(
1
),
in_desc_device
.
GetLength
(
2
),
in_desc_device
.
GetLength
(
2
),
in_desc_device
.
GetLength
(
3
));
in_desc_device
.
GetLength
(
3
),
in_desc_device
.
GetStride
(
0
),
in_desc_device
.
GetStride
(
1
),
in_desc_device
.
GetStride
(
2
),
in_desc_device
.
GetStride
(
3
));
std
::
size_t
data_sz
=
sizeof
(
T
);
std
::
size_t
data_sz
=
sizeof
(
T
);
DeviceMem
in_device_buf
(
data_sz
*
in
.
mDesc
.
GetElementSpace
());
DeviceMem
in_device_buf
(
data_sz
*
in
.
mDesc
.
GetElementSpace
());
...
@@ -77,11 +95,7 @@ void device_convolution(Tensor<T>& in, Tensor<T>& wei, Tensor<T>& out)
...
@@ -77,11 +95,7 @@ void device_convolution(Tensor<T>& in, Tensor<T>& wei, Tensor<T>& out)
int
num_thread
=
std
::
thread
::
hardware_concurrency
();
int
num_thread
=
std
::
thread
::
hardware_concurrency
();
#if 1
out
.
GenerateTensorValue
(
GeneratorConstant
<
float
>
{
0
},
num_thread
);
in
.
GenerateTensorValue
(
Generator
<
float
>
{
1
},
num_thread
);
wei
.
GenerateTensorValue
(
Generator
<
float
>
{
1
},
num_thread
);
#endif
out
.
GenerateTensorValue
(
Generator
<
float
>
{
0
},
num_thread
);
in_device_buf
.
ToDevice
(
in
.
mData
.
data
());
in_device_buf
.
ToDevice
(
in
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
...
@@ -89,7 +103,7 @@ void device_convolution(Tensor<T>& in, Tensor<T>& wei, Tensor<T>& out)
...
@@ -89,7 +103,7 @@ void device_convolution(Tensor<T>& in, Tensor<T>& wei, Tensor<T>& out)
dim3
block_dim
(
64
,
1
,
1
);
dim3
block_dim
(
64
,
1
,
1
);
dim3
grid_dim
(
1
,
1
,
1
);
dim3
grid_dim
(
1
,
1
,
1
);
gridwise_convolution
<
T
,
3
,
3
,
4
,
4
,
2
,
2
,
1
,
1
,
32
,
32
,
1
>
gridwise_convolution
<
T
,
3
,
3
,
4
,
4
,
2
,
2
,
1
,
1
,
8
,
8
,
1
>
<<<
grid_dim
,
block_dim
>>>
(
in_desc_device
,
<<<
grid_dim
,
block_dim
>>>
(
in_desc_device
,
static_cast
<
T
*>
(
in_device_buf
.
GetDeviceBuffer
()),
static_cast
<
T
*>
(
in_device_buf
.
GetDeviceBuffer
()),
wei_desc_device
,
wei_desc_device
,
...
@@ -97,6 +111,7 @@ void device_convolution(Tensor<T>& in, Tensor<T>& wei, Tensor<T>& out)
...
@@ -97,6 +111,7 @@ void device_convolution(Tensor<T>& in, Tensor<T>& wei, Tensor<T>& out)
out_desc_device
,
out_desc_device
,
static_cast
<
T
*>
(
out_device_buf
.
GetDeviceBuffer
()));
static_cast
<
T
*>
(
out_device_buf
.
GetDeviceBuffer
()));
checkCudaErrors
(
cudaGetLastError
());
out_device_buf
.
FromDevice
(
out
.
mData
.
data
());
out_device_buf
.
FromDevice
(
out
.
mData
.
data
());
}
}
...
@@ -111,9 +126,9 @@ int main()
...
@@ -111,9 +126,9 @@ int main()
Tensor
<
float
>
wei
({
1
,
1
,
3
,
3
});
Tensor
<
float
>
wei
({
1
,
1
,
3
,
3
});
Tensor
<
float
>
out_host
({
1
,
1
,
128
,
128
});
Tensor
<
float
>
out_host
({
1
,
1
,
128
,
128
});
#elif 1
#elif 1
Tensor
<
float
>
in
({
1
,
1
,
18
,
18
});
Tensor
<
float
>
in
({
1
,
1
,
18
,
18
});
Tensor
<
float
>
wei
({
1
,
1
,
3
,
3
});
Tensor
<
float
>
wei
({
1
,
1
,
3
,
3
});
Tensor
<
float
>
out_host
({
1
,
1
,
16
,
16
});
Tensor
<
float
>
out_host
({
1
,
1
,
16
,
16
});
#else
#else
Tensor
<
float
>
in
({
1
,
1
,
4
,
4
});
Tensor
<
float
>
in
({
1
,
1
,
4
,
4
});
Tensor
<
float
>
wei
({
1
,
1
,
3
,
3
});
Tensor
<
float
>
wei
({
1
,
1
,
3
,
3
});
...
@@ -125,16 +140,23 @@ int main()
...
@@ -125,16 +140,23 @@ int main()
std
::
cout
<<
__func__
<<
": num_thread "
<<
num_thread
<<
std
::
endl
;
std
::
cout
<<
__func__
<<
": num_thread "
<<
num_thread
<<
std
::
endl
;
in
.
GenerateTensorValue
(
Generator
<
float
>
{
1
},
num_thread
);
in
.
GenerateTensorValue
(
Generator
Tensor
<
float
>
{},
num_thread
);
wei
.
GenerateTensorValue
(
Generator
<
float
>
{
1
},
num_thread
);
wei
.
GenerateTensorValue
(
Generator
Tensor
<
float
>
{},
num_thread
);
//
host_convolution(in, wei, out_host, num_thread);
host_convolution
(
in
,
wei
,
out_host
,
num_thread
);
device_convolution
(
in
,
wei
,
out_device
);
device_convolution
(
in
,
wei
,
out_device
);
std
::
cout
<<
__func__
<<
": done"
<<
std
::
endl
;
std
::
cout
<<
__func__
<<
": done"
<<
std
::
endl
;
LogRange
(
std
::
cout
,
in
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
<<
__func__
<<
"in : "
,
in
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
,
wei
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
<<
__func__
<<
"wei: "
,
wei
.
mData
,
","
)
<<
std
::
endl
;
//
LogRange(std::cout, out_host.mData, ",") << std::endl;
LogRange
(
std
::
cout
,
out_host
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
,
out_device
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
,
out_device
.
mData
,
","
)
<<
std
::
endl
;
float
error
=
0
;
for
(
int
i
=
0
;
i
<
out_host
.
mData
.
size
();
++
i
)
{
error
+=
std
::
abs
(
out_host
.
mData
[
i
]
-
out_device
.
mData
[
i
]);
}
std
::
cout
<<
"error: "
<<
error
<<
std
::
endl
;
}
}
src/include/device_tensor.cuh
View file @
9657baec
...
@@ -17,15 +17,14 @@ struct DeviceTensorDescriptor
...
@@ -17,15 +17,14 @@ struct DeviceTensorDescriptor
__host__
__device__
unsigned
GetLength
(
unsigned
i
)
const
{
return
mpLengths
[
i
];
}
__host__
__device__
unsigned
GetLength
(
unsigned
i
)
const
{
return
mpLengths
[
i
];
}
__host__
__device__
unsigned
long
GetStride
(
unsigned
i
)
const
{
return
mpStrides
[
i
];
}
__host__
__device__
unsigned
GetStride
(
unsigned
i
)
const
{
return
mpStrides
[
i
];
}
// this is ugly
// this is ugly
__host__
__device__
unsigned
long
__host__
__device__
unsigned
Get1dIndex
(
unsigned
n
,
unsigned
c
,
unsigned
h
,
unsigned
w
)
const
Get1dIndex
(
unsigned
n
,
unsigned
c
,
unsigned
h
,
unsigned
w
)
const
{
{
return
n
*
mpStrides
[
0
]
+
c
*
mpStrides
[
1
]
+
h
*
mpStrides
[
2
]
+
w
*
mpStrides
[
3
];
return
n
*
mpStrides
[
0
]
+
c
*
mpStrides
[
1
]
+
h
*
mpStrides
[
2
]
+
w
*
mpStrides
[
3
];
}
}
unsigned
mpLengths
[
NDim
];
unsigned
mpLengths
[
NDim
];
unsigned
long
mpStrides
[
NDim
];
unsigned
mpStrides
[
NDim
];
};
};
src/include/direct_convolution.cuh
View file @
9657baec
...
@@ -14,11 +14,29 @@ __device__ void blockwise_4d_tensor_op(const DeviceTensorDescriptor<4>& src_desc
...
@@ -14,11 +14,29 @@ __device__ void blockwise_4d_tensor_op(const DeviceTensorDescriptor<4>& src_desc
F
f
)
F
f
)
{
{
#if 1
#if 1
if
(
threadIdx
.
x
<
10
0
)
if
(
threadIdx
.
x
==
0
)
{
{
printf
(
"====== blockwise_4d_tensor_op:
\t
"
printf
(
"blockwise_4d_tensor_op: 0:
\t
"
"threadIdx.x %u, p_src[threadIdx.x] %f, p_dst[threadIdx.x] %f
\n
"
,
"threadIdx.x %u
\t
"
threadIdx
.
x
,
p_src
[
threadIdx
.
x
],
p_dst
[
threadIdx
.
x
]);
"src_desc {%u %u %u %u}, {%u %u %u %u}
\t
"
"dst_desc {%u %u %u %u}, {%u %u %u %u}
\n
"
,
threadIdx
.
x
,
src_desc
.
GetLength
(
0
),
src_desc
.
GetLength
(
1
),
src_desc
.
GetLength
(
2
),
src_desc
.
GetLength
(
3
),
src_desc
.
GetStride
(
0
),
src_desc
.
GetStride
(
1
),
src_desc
.
GetStride
(
2
),
src_desc
.
GetStride
(
3
),
dst_desc
.
GetLength
(
0
),
dst_desc
.
GetLength
(
1
),
dst_desc
.
GetLength
(
2
),
dst_desc
.
GetLength
(
3
),
dst_desc
.
GetStride
(
0
),
dst_desc
.
GetStride
(
1
),
dst_desc
.
GetStride
(
2
),
dst_desc
.
GetStride
(
3
));
}
}
#endif
#endif
...
@@ -60,13 +78,21 @@ __device__ void blockwise_4d_tensor_op(const DeviceTensorDescriptor<4>& src_desc
...
@@ -60,13 +78,21 @@ __device__ void blockwise_4d_tensor_op(const DeviceTensorDescriptor<4>& src_desc
dst_desc
.
GetStride
(
0
)
*
did0
+
dst_desc
.
GetStride
(
1
)
*
did1
+
dst_desc
.
GetStride
(
0
)
*
did0
+
dst_desc
.
GetStride
(
1
)
*
did1
+
dst_desc
.
GetStride
(
2
)
*
did2
+
dst_desc
.
GetStride
(
3
)
*
did3
;
dst_desc
.
GetStride
(
2
)
*
did2
+
dst_desc
.
GetStride
(
3
)
*
did3
;
f
(
p_
dst
[
dindex
],
p_
src
[
sindex
]);
f
(
p_
src
[
dindex
],
p_
dst
[
sindex
]);
#if 1
#if 1
printf
(
"thread id %u, dindex %u, p_dst[dindex] %f, sindex %u, p_src[sindex] %f
\n
"
,
// if(threadIdx.x == 0)
threadIdx
.
x
,
dindex
,
p_dst
[
dindex
],
sindex
,
p_src
[
sindex
]);
{
printf
(
"blockwise_4d_tensor_op: 1: thread id %u,
\t
"
"sindex %u, p_src[sindex] %f,
\t
"
"dindex %u, p_dst[dindex] %f
\n
"
,
threadIdx
.
x
,
sindex
,
p_src
[
sindex
],
dindex
,
p_dst
[
dindex
]);
}
#endif
#endif
}
}
}
}
}
}
...
@@ -80,6 +106,33 @@ __device__ void threadwise_4d_tensor_op(const DeviceTensorDescriptor<4>& src_des
...
@@ -80,6 +106,33 @@ __device__ void threadwise_4d_tensor_op(const DeviceTensorDescriptor<4>& src_des
TFloat
*
__restrict__
p_dst
,
TFloat
*
__restrict__
p_dst
,
F
f
)
F
f
)
{
{
#if 1
if
(
threadIdx
.
x
==
0
)
{
printf
(
"threadwise_4d_tensor_op: 0:
\t
"
"threadIdx.x %u
\t
"
"src_desc {%u %u %u %u}, {%u %u %u %u}
\t
"
"dst_desc {%u %u %u %u}, {%u %u %u %u}
\n
"
,
threadIdx
.
x
,
src_desc
.
GetLength
(
0
),
src_desc
.
GetLength
(
1
),
src_desc
.
GetLength
(
2
),
src_desc
.
GetLength
(
3
),
src_desc
.
GetStride
(
0
),
src_desc
.
GetStride
(
1
),
src_desc
.
GetStride
(
2
),
src_desc
.
GetStride
(
3
),
dst_desc
.
GetLength
(
0
),
dst_desc
.
GetLength
(
1
),
dst_desc
.
GetLength
(
2
),
dst_desc
.
GetLength
(
3
),
dst_desc
.
GetStride
(
0
),
dst_desc
.
GetStride
(
1
),
dst_desc
.
GetStride
(
2
),
dst_desc
.
GetStride
(
3
));
}
#endif
for
(
unsigned
did0
=
0
;
did0
<
src_desc
.
GetLength
(
0
);
++
did0
)
for
(
unsigned
did0
=
0
;
did0
<
src_desc
.
GetLength
(
0
);
++
did0
)
{
{
for
(
unsigned
did1
=
0
;
did1
<
src_desc
.
GetLength
(
1
);
++
did1
)
for
(
unsigned
did1
=
0
;
did1
<
src_desc
.
GetLength
(
1
);
++
did1
)
...
@@ -96,7 +149,21 @@ __device__ void threadwise_4d_tensor_op(const DeviceTensorDescriptor<4>& src_des
...
@@ -96,7 +149,21 @@ __device__ void threadwise_4d_tensor_op(const DeviceTensorDescriptor<4>& src_des
dst_desc
.
GetStride
(
0
)
*
did0
+
dst_desc
.
GetStride
(
1
)
*
did1
+
dst_desc
.
GetStride
(
0
)
*
did0
+
dst_desc
.
GetStride
(
1
)
*
did1
+
dst_desc
.
GetStride
(
2
)
*
did2
+
dst_desc
.
GetStride
(
3
)
*
did3
;
dst_desc
.
GetStride
(
2
)
*
did2
+
dst_desc
.
GetStride
(
3
)
*
did3
;
f
(
p_dst
[
dindex
],
p_src
[
sindex
]);
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
#if 1
if
(
threadIdx
.
x
==
0
)
{
printf
(
"threadwise_4d_tensor_op: 1: thread id %u,
\t
"
"sindex %u, p_src[sindex] %f,
\t
"
"dindex %u, p_dst[dindex] %f
\n
"
,
threadIdx
.
x
,
sindex
,
p_src
[
sindex
],
dindex
,
p_dst
[
dindex
]);
}
#endif
}
}
}
}
}
}
...
@@ -111,6 +178,72 @@ __device__ void threadwise_direct_convolution(const DeviceTensorDescriptor<4>& i
...
@@ -111,6 +178,72 @@ __device__ void threadwise_direct_convolution(const DeviceTensorDescriptor<4>& i
const
DeviceTensorDescriptor
<
4
>&
out_desc
,
const
DeviceTensorDescriptor
<
4
>&
out_desc
,
TFloat
*
__restrict__
p_out
)
TFloat
*
__restrict__
p_out
)
{
{
#if 1
if
(
threadIdx
.
x
==
0
)
{
printf
(
"threadwise_direct_convolution: 0:
\t
"
"threadIdx.x %u
\t
"
"in_desc {%u %u %u %u}, {%u %u %u %u}
\t
"
"wei_desc {%u %u %u %u}, {%u %u %u %u}
\t
"
"out_desc {%u %u %u %u}, {%u %u %u %u}
\n
"
,
threadIdx
.
x
,
in_desc
.
GetLength
(
0
),
in_desc
.
GetLength
(
1
),
in_desc
.
GetLength
(
2
),
in_desc
.
GetLength
(
3
),
in_desc
.
GetStride
(
0
),
in_desc
.
GetStride
(
1
),
in_desc
.
GetStride
(
2
),
in_desc
.
GetStride
(
3
),
wei_desc
.
GetLength
(
0
),
wei_desc
.
GetLength
(
1
),
wei_desc
.
GetLength
(
2
),
wei_desc
.
GetLength
(
3
),
wei_desc
.
GetStride
(
0
),
wei_desc
.
GetStride
(
1
),
wei_desc
.
GetStride
(
2
),
wei_desc
.
GetStride
(
3
),
out_desc
.
GetLength
(
0
),
out_desc
.
GetLength
(
1
),
out_desc
.
GetLength
(
2
),
out_desc
.
GetLength
(
3
),
out_desc
.
GetStride
(
0
),
out_desc
.
GetStride
(
1
),
out_desc
.
GetStride
(
2
),
out_desc
.
GetStride
(
3
));
}
#elif 1
{
printf
(
"threadwise_direct_convolution: 0:
\t
"
"threadIdx.x %u
\t
"
"p_in %f %f %f %f %f %f %f %f,
\t
"
"p_wei %f %f %f %f %f %f %f %f %f,
\t
"
"p_out %f %f %f %f,
\n
"
,
threadIdx
.
x
,
p_in
[
0
],
p_in
[
1
],
p_in
[
2
],
p_in
[
3
],
p_in
[
4
],
p_in
[
5
],
p_in
[
6
],
p_in
[
7
],
p_wei
[
0
],
p_wei
[
1
],
p_wei
[
2
],
p_wei
[
3
],
p_wei
[
4
],
p_wei
[
5
],
p_wei
[
6
],
p_wei
[
7
],
p_wei
[
8
],
p_out
[
0
],
p_out
[
1
],
p_out
[
2
],
p_out
[
3
]);
}
#endif
for
(
unsigned
n
=
0
;
n
<
out_desc
.
GetLength
(
0
);
++
n
)
for
(
unsigned
n
=
0
;
n
<
out_desc
.
GetLength
(
0
);
++
n
)
{
{
for
(
unsigned
k
=
0
;
k
<
out_desc
.
GetLength
(
1
);
++
k
)
for
(
unsigned
k
=
0
;
k
<
out_desc
.
GetLength
(
1
);
++
k
)
...
@@ -143,15 +276,20 @@ __device__ void threadwise_direct_convolution(const DeviceTensorDescriptor<4>& i
...
@@ -143,15 +276,20 @@ __device__ void threadwise_direct_convolution(const DeviceTensorDescriptor<4>& i
p_out
[
out_index
]
+=
p_wei
[
wei_index
]
*
p_in
[
in_index
];
p_out
[
out_index
]
+=
p_wei
[
wei_index
]
*
p_in
[
in_index
];
#if 1
#if 1
if
(
threadIdx
.
x
==
0
)
if
(
threadIdx
.
x
==
0
)
{
{
printf
(
"====== 5:
\t
"
printf
(
"threadwise_direct_convolution: 1:
\t
"
"threadIdx.x %u
\t
"
"out_index %u, p_out[out_index] %f,
\t
"
"out_index %u, p_out[out_index] %f,
\t
"
"wei_index %u, p_wei[wei_index] %f,
\t
"
"wei_index %u, p_wei[wei_index] %f,
\t
"
"in_index %u, p_in[in_index] %f
\n
"
,
"in_index %u, p_in[in_index] %f
\n
"
,
out_index
,
p_out
[
out_index
],
threadIdx
.
x
,
wei_index
,
p_wei
[
wei_index
],
out_index
,
in_index
,
p_in
[
in_index
]);
p_out
[
out_index
],
wei_index
,
p_wei
[
wei_index
],
in_index
,
p_in
[
in_index
]);
}
}
#endif
#endif
}
}
...
@@ -161,8 +299,6 @@ __device__ void threadwise_direct_convolution(const DeviceTensorDescriptor<4>& i
...
@@ -161,8 +299,6 @@ __device__ void threadwise_direct_convolution(const DeviceTensorDescriptor<4>& i
}
}
}
}
}
}
}
}
template
<
class
TFloat
,
template
<
class
TFloat
,
...
@@ -184,36 +320,87 @@ __device__ void blockwise_convolution(const DeviceTensorDescriptor<4>& in_desc,
...
@@ -184,36 +320,87 @@ __device__ void blockwise_convolution(const DeviceTensorDescriptor<4>& in_desc,
const
DeviceTensorDescriptor
<
4
>&
out_desc
,
const
DeviceTensorDescriptor
<
4
>&
out_desc
,
TFloat
*
__restrict__
p_out
)
TFloat
*
__restrict__
p_out
)
{
{
#if 1
if
(
threadIdx
.
x
==
0
)
{
printf
(
"blockwise_convolution: 0:
\t
"
"threadIdx.x %u
\t
"
"in_desc {%u %u %u %u}, {%u %u %u %u}
\t
"
"wei_desc {%u %u %u %u}, {%u %u %u %u}
\t
"
"out_desc {%u %u %u %u}, {%u %u %u %u}
\n
"
,
threadIdx
.
x
,
in_desc
.
GetLength
(
0
),
in_desc
.
GetLength
(
1
),
in_desc
.
GetLength
(
2
),
in_desc
.
GetLength
(
3
),
in_desc
.
GetStride
(
0
),
in_desc
.
GetStride
(
1
),
in_desc
.
GetStride
(
2
),
in_desc
.
GetStride
(
3
),
wei_desc
.
GetLength
(
0
),
wei_desc
.
GetLength
(
1
),
wei_desc
.
GetLength
(
2
),
wei_desc
.
GetLength
(
3
),
wei_desc
.
GetStride
(
0
),
wei_desc
.
GetStride
(
1
),
wei_desc
.
GetStride
(
2
),
wei_desc
.
GetStride
(
3
),
out_desc
.
GetLength
(
0
),
out_desc
.
GetLength
(
1
),
out_desc
.
GetLength
(
2
),
out_desc
.
GetLength
(
3
),
out_desc
.
GetStride
(
0
),
out_desc
.
GetStride
(
1
),
out_desc
.
GetStride
(
2
),
out_desc
.
GetStride
(
3
));
}
#endif
// for now, one thread do 1 N and 1 K
// for now, one thread do 1 N and 1 K
DeviceTensorDescriptor
<
4
>
wei_thread_desc
;
DeviceTensorDescriptor
<
4
>
in_thread_src_desc
=
in_desc
;
wei_thread_desc
.
mpLengths
[
0
]
=
1
;
in_thread_src_desc
.
mpLengths
[
0
]
=
1
;
wei_thread_desc
.
mpLengths
[
1
]
=
CPerBlockLoop
;
in_thread_src_desc
.
mpLengths
[
1
]
=
CPerBlockLoop
;
wei_thread_desc
.
mpLengths
[
2
]
=
S
;
in_thread_src_desc
.
mpLengths
[
2
]
=
OutTileSizeH
+
S
-
1
;
wei_thread_desc
.
mpLengths
[
3
]
=
R
;
in_thread_src_desc
.
mpLengths
[
3
]
=
OutTileSizeW
+
R
-
1
;
wei_thread_desc
.
mpStrides
[
3
]
=
1
;
wei_thread_desc
.
mpStrides
[
2
]
=
wei_thread_desc
.
GetLength
(
3
)
*
wei_thread_desc
.
GetStride
(
3
);
DeviceTensorDescriptor
<
4
>
wei_thread_src_desc
=
wei_desc
;
wei_thread_desc
.
mpStrides
[
1
]
=
wei_thread_desc
.
GetLength
(
2
)
*
wei_thread_desc
.
GetStride
(
2
);
wei_thread_src_desc
.
mpLengths
[
0
]
=
1
;
wei_thread_desc
.
mpStrides
[
0
]
=
wei_thread_desc
.
GetLength
(
1
)
*
wei_thread_desc
.
GetStride
(
1
);
wei_thread_src_desc
.
mpLengths
[
1
]
=
CPerBlockLoop
;
wei_thread_src_desc
.
mpLengths
[
2
]
=
S
;
DeviceTensorDescriptor
<
4
>
out_thread_desc
;
wei_thread_src_desc
.
mpLengths
[
3
]
=
R
;
out_thread_desc
.
mpLengths
[
0
]
=
1
;
out_thread_desc
.
mpLengths
[
1
]
=
1
;
DeviceTensorDescriptor
<
4
>
out_thread_src_desc
=
out_desc
;
out_thread_desc
.
mpLengths
[
2
]
=
OutTileSizeH
;
out_thread_src_desc
.
mpLengths
[
0
]
=
1
;
out_thread_desc
.
mpLengths
[
3
]
=
OutTileSizeW
;
out_thread_src_desc
.
mpLengths
[
1
]
=
1
;
out_thread_desc
.
mpStrides
[
3
]
=
1
;
out_thread_src_desc
.
mpLengths
[
2
]
=
OutTileSizeH
;
out_thread_desc
.
mpStrides
[
2
]
=
out_thread_desc
.
GetLength
(
3
)
*
out_thread_desc
.
GetStride
(
3
);
out_thread_src_desc
.
mpLengths
[
3
]
=
OutTileSizeW
;
out_thread_desc
.
mpStrides
[
1
]
=
out_thread_desc
.
GetLength
(
2
)
*
out_thread_desc
.
GetStride
(
2
);
out_thread_desc
.
mpStrides
[
0
]
=
out_thread_desc
.
GetLength
(
1
)
*
out_thread_desc
.
GetStride
(
1
);
DeviceTensorDescriptor
<
4
>
in_thread_dst_desc
=
in_thread_src_desc
;
in_thread_dst_desc
.
mpStrides
[
3
]
=
1
;
DeviceTensorDescriptor
<
4
>
in_thread_desc
;
in_thread_dst_desc
.
mpStrides
[
2
]
=
in_thread_desc
.
mpLengths
[
0
]
=
1
;
in_thread_dst_desc
.
GetLength
(
3
)
*
in_thread_dst_desc
.
GetStride
(
3
);
in_thread_desc
.
mpLengths
[
1
]
=
CPerBlockLoop
;
in_thread_dst_desc
.
mpStrides
[
1
]
=
in_thread_desc
.
mpLengths
[
2
]
=
OutTileSizeH
+
S
-
1
;
in_thread_dst_desc
.
GetLength
(
2
)
*
in_thread_dst_desc
.
GetStride
(
2
);
in_thread_desc
.
mpLengths
[
3
]
=
OutTileSizeW
+
R
-
1
;
in_thread_dst_desc
.
mpStrides
[
0
]
=
in_thread_desc
.
mpStrides
[
3
]
=
1
;
in_thread_dst_desc
.
GetLength
(
1
)
*
in_thread_dst_desc
.
GetStride
(
1
);
in_thread_desc
.
mpStrides
[
2
]
=
in_thread_desc
.
GetLength
(
3
)
*
in_thread_desc
.
GetStride
(
3
);
in_thread_desc
.
mpStrides
[
1
]
=
in_thread_desc
.
GetLength
(
2
)
*
in_thread_desc
.
GetStride
(
2
);
DeviceTensorDescriptor
<
4
>
wei_thread_dst_desc
=
wei_thread_src_desc
;
in_thread_desc
.
mpStrides
[
0
]
=
in_thread_desc
.
GetLength
(
1
)
*
in_thread_desc
.
GetStride
(
1
);
wei_thread_dst_desc
.
mpStrides
[
3
]
=
1
;
wei_thread_dst_desc
.
mpStrides
[
2
]
=
wei_thread_dst_desc
.
GetLength
(
3
)
*
wei_thread_dst_desc
.
GetStride
(
3
);
wei_thread_dst_desc
.
mpStrides
[
1
]
=
wei_thread_dst_desc
.
GetLength
(
2
)
*
wei_thread_dst_desc
.
GetStride
(
2
);
wei_thread_dst_desc
.
mpStrides
[
0
]
=
wei_thread_dst_desc
.
GetLength
(
1
)
*
wei_thread_dst_desc
.
GetStride
(
1
);
DeviceTensorDescriptor
<
4
>
out_thread_dst_desc
=
out_thread_src_desc
;
out_thread_dst_desc
.
mpStrides
[
3
]
=
1
;
out_thread_dst_desc
.
mpStrides
[
2
]
=
out_thread_dst_desc
.
GetLength
(
3
)
*
out_thread_dst_desc
.
GetStride
(
3
);
out_thread_dst_desc
.
mpStrides
[
1
]
=
out_thread_dst_desc
.
GetLength
(
2
)
*
out_thread_dst_desc
.
GetStride
(
2
);
out_thread_dst_desc
.
mpStrides
[
0
]
=
out_thread_dst_desc
.
GetLength
(
1
)
*
out_thread_dst_desc
.
GetStride
(
1
);
const
unsigned
thread_sz
=
blockDim
.
x
*
blockDim
.
y
*
blockDim
.
z
;
const
unsigned
thread_sz
=
blockDim
.
x
*
blockDim
.
y
*
blockDim
.
z
;
...
@@ -248,45 +435,45 @@ __device__ void blockwise_convolution(const DeviceTensorDescriptor<4>& in_desc,
...
@@ -248,45 +435,45 @@ __device__ void blockwise_convolution(const DeviceTensorDescriptor<4>& in_desc,
// copy input tensor into register
// copy input tensor into register
threadwise_4d_tensor_op
<
TFloat
,
decltype
(
f_copy
)
>
(
threadwise_4d_tensor_op
<
TFloat
,
decltype
(
f_copy
)
>
(
in_desc
,
in_
thread_src_
desc
,
p_in
+
in_desc
.
Get1dIndex
(
p_in
+
in_desc
.
Get1dIndex
(
n_thread_work_begin
,
0
,
hi_thread_work_begin
,
wi_thread_work_begin
),
n_thread_work_begin
,
0
,
hi_thread_work_begin
,
wi_thread_work_begin
),
in_thread_desc
,
in_thread_
dst_
desc
,
p_in_thread
,
p_in_thread
,
f_copy
);
f_copy
);
// copy weight tensor into register
// copy weight tensor into register
threadwise_4d_tensor_op
<
TFloat
,
decltype
(
f_copy
)
>
(
threadwise_4d_tensor_op
<
TFloat
,
decltype
(
f_copy
)
>
(
wei_desc
,
wei_
thread_src_
desc
,
p_wei
+
wei_
thread_
desc
.
Get1dIndex
(
k_thread_work_begin
,
0
,
0
,
0
),
p_wei
+
wei_desc
.
Get1dIndex
(
k_thread_work_begin
,
0
,
0
,
0
),
wei_thread_desc
,
wei_thread_
dst_
desc
,
p_wei_thread
,
p_wei_thread
,
f_copy
);
f_copy
);
// copy output tensor into register
// copy output tensor into register
threadwise_4d_tensor_op
<
TFloat
,
decltype
(
f_copy
)
>
(
threadwise_4d_tensor_op
<
TFloat
,
decltype
(
f_copy
)
>
(
out_desc
,
out_
thread_src_
desc
,
p_out
+
out_desc
.
Get1dIndex
(
n_thread_work_begin
,
p_out
+
out_desc
.
Get1dIndex
(
n_thread_work_begin
,
k_thread_work_begin
,
k_thread_work_begin
,
ho_thread_work_begin
,
ho_thread_work_begin
,
wo_thread_work_begin
),
wo_thread_work_begin
),
out_thread_desc
,
out_thread_
dst_
desc
,
p_out_thread
,
p_out_thread
,
f_copy
);
f_copy
);
// threadwise convolution
// threadwise convolution
threadwise_direct_convolution
(
in_thread_desc
,
threadwise_direct_convolution
(
in_thread_
dst_
desc
,
p_in_thread
,
p_in_thread
,
wei_thread_desc
,
wei_thread_
dst_
desc
,
p_wei_thread
,
p_wei_thread
,
out_thread_desc
,
out_thread_
dst_
desc
,
p_out_thread
);
p_out_thread
);
// accumulate output tensor into device mem
// accumulate output tensor into device mem
threadwise_4d_tensor_op
<
TFloat
,
decltype
(
f_copy
)
>
(
threadwise_4d_tensor_op
<
TFloat
,
decltype
(
f_copy
)
>
(
out_thread_desc
,
out_thread_
dst_
desc
,
p_out_thread
,
p_out_thread
,
out_desc
,
out_
thread_src_
desc
,
p_out
+
out_desc
.
Get1dIndex
(
n_thread_work_begin
,
p_out
+
out_desc
.
Get1dIndex
(
n_thread_work_begin
,
k_thread_work_begin
,
k_thread_work_begin
,
ho_thread_work_begin
,
ho_thread_work_begin
,
...
@@ -315,11 +502,38 @@ __global__ void gridwise_convolution(const DeviceTensorDescriptor<4> in_desc,
...
@@ -315,11 +502,38 @@ __global__ void gridwise_convolution(const DeviceTensorDescriptor<4> in_desc,
TFloat
*
__restrict__
p_out
)
TFloat
*
__restrict__
p_out
)
{
{
#if 1
#if 1
if
(
threadIdx
.
x
<
10
0
)
if
(
threadIdx
.
x
==
0
)
{
{
printf
(
"====== 0:
\t
"
printf
(
"gridwise_convolution: 0:
\t
"
"threadIdx.x %u, p_in[threadIdx.x] %f, p_wei[threadIdx.x] %f, p_out[threadIdx.x] %f
\n
"
,
"threadIdx.x %u
\t
"
threadIdx
.
x
,
p_in
[
threadIdx
.
x
],
p_wei
[
threadIdx
.
x
],
p_out
[
threadIdx
.
x
]);
"in_desc {%u %u %u %u}, {%u %u %u %u}
\t
"
"wei_desc {%u %u %u %u}, {%u %u %u %u}
\t
"
"out_desc {%u %u %u %u}, {%u %u %u %u}
\n
"
,
threadIdx
.
x
,
in_desc
.
GetLength
(
0
),
in_desc
.
GetLength
(
1
),
in_desc
.
GetLength
(
2
),
in_desc
.
GetLength
(
3
),
in_desc
.
GetStride
(
0
),
in_desc
.
GetStride
(
1
),
in_desc
.
GetStride
(
2
),
in_desc
.
GetStride
(
3
),
wei_desc
.
GetLength
(
0
),
wei_desc
.
GetLength
(
1
),
wei_desc
.
GetLength
(
2
),
wei_desc
.
GetLength
(
3
),
wei_desc
.
GetStride
(
0
),
wei_desc
.
GetStride
(
1
),
wei_desc
.
GetStride
(
2
),
wei_desc
.
GetStride
(
3
),
out_desc
.
GetLength
(
0
),
out_desc
.
GetLength
(
1
),
out_desc
.
GetLength
(
2
),
out_desc
.
GetLength
(
3
),
out_desc
.
GetStride
(
0
),
out_desc
.
GetStride
(
1
),
out_desc
.
GetStride
(
2
),
out_desc
.
GetStride
(
3
));
}
}
#endif
#endif
...
@@ -363,9 +577,9 @@ __global__ void gridwise_convolution(const DeviceTensorDescriptor<4> in_desc,
...
@@ -363,9 +577,9 @@ __global__ void gridwise_convolution(const DeviceTensorDescriptor<4> in_desc,
in_block_desc
.
mpStrides
[
1
]
=
in_block_desc
.
GetLength
(
2
)
*
in_block_desc
.
GetStride
(
2
);
in_block_desc
.
mpStrides
[
1
]
=
in_block_desc
.
GetLength
(
2
)
*
in_block_desc
.
GetStride
(
2
);
in_block_desc
.
mpStrides
[
0
]
=
in_block_desc
.
GetLength
(
1
)
*
in_block_desc
.
GetStride
(
1
);
in_block_desc
.
mpStrides
[
0
]
=
in_block_desc
.
GetLength
(
1
)
*
in_block_desc
.
GetStride
(
1
);
__shared__
TFloat
p_in_block
[
NPerBlock
*
CPerBlockLoop
*
S
*
R
];
__shared__
TFloat
p_in_block
[
NPerBlock
*
CPerBlockLoop
*
(
YPerBlock
*
OutTileSizeH
+
S
-
1
)
*
__shared__
TFloat
p_wei_block
[
KPerBlock
*
CPerBlockLoop
*
(
Y
PerBlock
*
OutTileSize
H
+
S
-
1
)
*
(
X
PerBlock
*
OutTileSize
W
+
R
-
1
)
];
(
XPerBlock
*
OutTileSizeW
+
R
-
1
)
];
__shared__
TFloat
p_wei_block
[
KPerBlock
*
CPerBlockLoop
*
S
*
R
];
__shared__
TFloat
p_out_block
[
NPerBlock
*
KPerBlock
*
(
YPerBlock
*
OutTileSizeH
)
*
__shared__
TFloat
p_out_block
[
NPerBlock
*
KPerBlock
*
(
YPerBlock
*
OutTileSizeH
)
*
(
XPerBlock
*
OutTileSizeW
)];
(
XPerBlock
*
OutTileSizeW
)];
...
@@ -388,9 +602,6 @@ __global__ void gridwise_convolution(const DeviceTensorDescriptor<4> in_desc,
...
@@ -388,9 +602,6 @@ __global__ void gridwise_convolution(const DeviceTensorDescriptor<4> in_desc,
unsigned
hi_block_work_begin
=
ho_block_work_begin
;
// minus padding
unsigned
hi_block_work_begin
=
ho_block_work_begin
;
// minus padding
unsigned
wi_block_work_begin
=
wo_block_work_begin
;
// minus padding
unsigned
wi_block_work_begin
=
wo_block_work_begin
;
// minus padding
if
(
threadIdx
.
x
==
0
)
printf
(
"====== 1:
\n
"
);
for
(
unsigned
c_block_work_begin
=
0
;
c_block_work_begin
<
in_desc
.
GetLength
(
1
);
for
(
unsigned
c_block_work_begin
=
0
;
c_block_work_begin
<
in_desc
.
GetLength
(
1
);
c_block_work_begin
+=
CPerBlockLoop
)
c_block_work_begin
+=
CPerBlockLoop
)
{
{
...
@@ -426,6 +637,8 @@ __global__ void gridwise_convolution(const DeviceTensorDescriptor<4> in_desc,
...
@@ -426,6 +637,8 @@ __global__ void gridwise_convolution(const DeviceTensorDescriptor<4> in_desc,
p_out_block
,
p_out_block
,
f_copy
);
f_copy
);
__syncthreads
();
// blockwise convolution
// blockwise convolution
blockwise_convolution
<
TFloat
,
blockwise_convolution
<
TFloat
,
S
,
S
,
...
@@ -441,8 +654,7 @@ __global__ void gridwise_convolution(const DeviceTensorDescriptor<4> in_desc,
...
@@ -441,8 +654,7 @@ __global__ void gridwise_convolution(const DeviceTensorDescriptor<4> in_desc,
CPerBlockLoop
>
(
CPerBlockLoop
>
(
in_block_desc
,
p_in_block
,
wei_block_desc
,
p_wei_block
,
out_block_desc
,
p_out_block
);
in_block_desc
,
p_in_block
,
wei_block_desc
,
p_wei_block
,
out_block_desc
,
p_out_block
);
if
(
threadIdx
.
x
==
0
)
__syncthreads
();
printf
(
"====== 3:
\n
"
);
// accum output tensor from LDS to device mem
// accum output tensor from LDS to device mem
blockwise_4d_tensor_op
<
TFloat
,
1
,
1
,
1
,
64
,
decltype
(
f_copy
)
>
(
blockwise_4d_tensor_op
<
TFloat
,
1
,
1
,
1
,
64
,
decltype
(
f_copy
)
>
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment