Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
yangql
composable_kernel-1
Commits
73480fee
"references/git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "d88d8961ae51507d0cb680329d985b1488b1b76b"
Commit
73480fee
authored
Nov 15, 2018
by
Chao Liu
Browse files
refactor
parent
adf4b173
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
115 additions
and
209 deletions
+115
-209
driver/conv.cu
driver/conv.cu
+1
-90
driver/device_direct_convolution_2.cuh
driver/device_direct_convolution_2.cuh
+91
-0
src/include/direct_convolution_2.cuh
src/include/direct_convolution_2.cuh
+16
-16
src/include/threadwise_tensor_op.cuh
src/include/threadwise_tensor_op.cuh
+7
-103
No files found.
driver/conv.cu
View file @
73480fee
...
...
@@ -5,7 +5,7 @@
#include "nvToolsExt.h"
#include "tensor.hpp"
#include "constant_tensor_descriptor.cuh"
#include "direct_convolution_2.cuh"
#include "
device_
direct_convolution_2.cuh"
template
<
class
T
>
struct
GeneratorConstant
...
...
@@ -109,95 +109,6 @@ void host_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& out)
f_par
(
std
::
thread
::
hardware_concurrency
());
}
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_convolution
(
InDesc
,
const
Tensor
<
T
>&
in
,
WeiDesc
,
const
Tensor
<
T
>&
wei
,
OutDesc
,
Tensor
<
T
>&
out
)
{
std
::
size_t
data_sz
=
sizeof
(
T
);
DeviceMem
in_device_buf
(
data_sz
*
in
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_device_buf
(
data_sz
*
wei
.
mDesc
.
GetElementSpace
());
DeviceMem
out_device_buf
(
data_sz
*
out
.
mDesc
.
GetElementSpace
());
int
num_thread
=
std
::
thread
::
hardware_concurrency
();
in_device_buf
.
ToDevice
(
in
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
out_device_buf
.
ToDevice
(
out
.
mData
.
data
());
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
out_desc
=
OutDesc
{};
constexpr
unsigned
OutTileSizeH
=
2
;
constexpr
unsigned
OutTileSizeW
=
2
;
constexpr
unsigned
NPerBlock
=
1
;
constexpr
unsigned
KPerBlock
=
4
;
constexpr
unsigned
CPerBlock
=
2
;
constexpr
unsigned
YPerBlock
=
8
;
constexpr
unsigned
XPerBlock
=
16
;
constexpr
unsigned
NBlockOpLen0
=
1
;
constexpr
unsigned
NBlockOpLen1
=
1
;
constexpr
unsigned
NBlockOpLen2
=
4
;
constexpr
unsigned
NBlockOpLen3
=
32
;
constexpr
unsigned
BlockSize
=
128
;
constexpr
unsigned
GridSize
=
(
out_desc
.
GetLength
(
I0
)
/
NPerBlock
)
*
(
out_desc
.
GetLength
(
I1
)
/
KPerBlock
)
*
(
out_desc
.
GetLength
(
I2
)
/
(
OutTileSizeH
*
YPerBlock
))
*
(
out_desc
.
GetLength
(
I3
)
/
(
OutTileSizeW
*
XPerBlock
));
dim3
block_dim
(
BlockSize
);
dim3
grid_dim
(
GridSize
);
printf
(
"%s: BlockSize %u, GridSize %u
\n
"
,
__func__
,
BlockSize
,
GridSize
);
cudaEvent_t
start
,
stop
;
float
elapsedTime
;
cudaEventCreate
(
&
start
);
cudaEventRecord
(
start
,
0
);
gridwise_convolution
<
T
,
InDesc
,
WeiDesc
,
OutDesc
,
OutTileSizeH
,
OutTileSizeW
,
NPerBlock
,
KPerBlock
,
CPerBlock
,
YPerBlock
,
XPerBlock
,
NBlockOpLen0
,
NBlockOpLen1
,
NBlockOpLen2
,
NBlockOpLen3
,
BlockSize
,
GridSize
>
<<<
grid_dim
,
block_dim
>>>
(
InDesc
{},
static_cast
<
T
*>
(
in_device_buf
.
GetDeviceBuffer
()),
WeiDesc
{},
static_cast
<
T
*>
(
wei_device_buf
.
GetDeviceBuffer
()),
OutDesc
{},
static_cast
<
T
*>
(
out_device_buf
.
GetDeviceBuffer
()));
cudaEventCreate
(
&
stop
);
cudaEventRecord
(
stop
,
0
);
cudaEventSynchronize
(
stop
);
cudaEventElapsedTime
(
&
elapsedTime
,
start
,
stop
);
printf
(
"Elapsed time : %f ms
\n
"
,
elapsedTime
);
checkCudaErrors
(
cudaGetLastError
());
out_device_buf
.
FromDevice
(
out
.
mData
.
data
());
}
int
main
()
{
#if 0
...
...
driver/device_direct_convolution_2.cuh
0 → 100644
View file @
73480fee
#pragma once
#include "direct_convolution_2.cuh"
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_convolution
(
InDesc
,
const
Tensor
<
T
>&
in
,
WeiDesc
,
const
Tensor
<
T
>&
wei
,
OutDesc
,
Tensor
<
T
>&
out
)
{
std
::
size_t
data_sz
=
sizeof
(
T
);
DeviceMem
in_device_buf
(
data_sz
*
in
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_device_buf
(
data_sz
*
wei
.
mDesc
.
GetElementSpace
());
DeviceMem
out_device_buf
(
data_sz
*
out
.
mDesc
.
GetElementSpace
());
int
num_thread
=
std
::
thread
::
hardware_concurrency
();
in_device_buf
.
ToDevice
(
in
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
out_device_buf
.
ToDevice
(
out
.
mData
.
data
());
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
out_desc
=
OutDesc
{};
constexpr
unsigned
OutTileSizeH
=
2
;
constexpr
unsigned
OutTileSizeW
=
2
;
constexpr
unsigned
NPerBlock
=
1
;
constexpr
unsigned
KPerBlock
=
4
;
constexpr
unsigned
CPerBlock
=
2
;
constexpr
unsigned
YPerBlock
=
8
;
constexpr
unsigned
XPerBlock
=
16
;
constexpr
unsigned
NBlockOpLen0
=
1
;
constexpr
unsigned
NBlockOpLen1
=
1
;
constexpr
unsigned
NBlockOpLen2
=
4
;
constexpr
unsigned
NBlockOpLen3
=
32
;
constexpr
unsigned
BlockSize
=
128
;
constexpr
unsigned
GridSize
=
(
out_desc
.
GetLength
(
I0
)
/
NPerBlock
)
*
(
out_desc
.
GetLength
(
I1
)
/
KPerBlock
)
*
(
out_desc
.
GetLength
(
I2
)
/
(
OutTileSizeH
*
YPerBlock
))
*
(
out_desc
.
GetLength
(
I3
)
/
(
OutTileSizeW
*
XPerBlock
));
dim3
block_dim
(
BlockSize
);
dim3
grid_dim
(
GridSize
);
printf
(
"%s: BlockSize %u, GridSize %u
\n
"
,
__func__
,
BlockSize
,
GridSize
);
cudaEvent_t
start
,
stop
;
float
elapsedTime
;
cudaEventCreate
(
&
start
);
cudaEventRecord
(
start
,
0
);
gridwise_convolution
<
T
,
InDesc
,
WeiDesc
,
OutDesc
,
OutTileSizeH
,
OutTileSizeW
,
NPerBlock
,
KPerBlock
,
CPerBlock
,
YPerBlock
,
XPerBlock
,
NBlockOpLen0
,
NBlockOpLen1
,
NBlockOpLen2
,
NBlockOpLen3
,
BlockSize
,
GridSize
>
<<<
grid_dim
,
block_dim
>>>
(
InDesc
{},
static_cast
<
T
*>
(
in_device_buf
.
GetDeviceBuffer
()),
WeiDesc
{},
static_cast
<
T
*>
(
wei_device_buf
.
GetDeviceBuffer
()),
OutDesc
{},
static_cast
<
T
*>
(
out_device_buf
.
GetDeviceBuffer
()));
cudaEventCreate
(
&
stop
);
cudaEventRecord
(
stop
,
0
);
cudaEventSynchronize
(
stop
);
cudaEventElapsedTime
(
&
elapsedTime
,
start
,
stop
);
printf
(
"Elapsed time : %f ms
\n
"
,
elapsedTime
);
checkCudaErrors
(
cudaGetLastError
());
out_device_buf
.
FromDevice
(
out
.
mData
.
data
());
}
src/include/direct_convolution_2.cuh
View file @
73480fee
...
...
@@ -92,10 +92,10 @@ __device__ void blockwise_convolution(InDesc,
auto
f_copy
=
[](
const
TFloat
&
src
,
TFloat
&
dst
)
{
dst
=
src
;
};
// copy input tensor into register
threadwise_4d_tensor_op_in
<
TFloat
,
decltype
(
in_thread_src_desc
),
decltype
(
in_thread_dst_desc
),
decltype
(
f_copy
)
>
(
threadwise_4d_tensor_op_
b
in
ary
<
TFloat
,
decltype
(
in_thread_src_desc
),
decltype
(
in_thread_dst_desc
),
decltype
(
f_copy
)
>
(
in_thread_src_desc
,
p_in_lds
+
in_desc
.
Get1dIndex
(
n_thread_work_begin
,
0
,
hi_thread_work_begin
,
wi_thread_work_begin
),
...
...
@@ -107,10 +107,10 @@ __device__ void blockwise_convolution(InDesc,
++
k_thread_work_begin
)
{
// copy weight tensor into register
threadwise_4d_tensor_op_
wei
<
TFloat
,
decltype
(
wei_thread_src_desc
),
decltype
(
wei_thread_dst_desc
),
decltype
(
f_copy
)
>
(
threadwise_4d_tensor_op_
binary
<
TFloat
,
decltype
(
wei_thread_src_desc
),
decltype
(
wei_thread_dst_desc
),
decltype
(
f_copy
)
>
(
wei_thread_src_desc
,
p_wei_lds
+
wei_desc
.
Get1dIndex
(
k_thread_work_begin
,
0
,
0
,
0
),
wei_thread_dst_desc
,
...
...
@@ -118,10 +118,10 @@ __device__ void blockwise_convolution(InDesc,
f_copy
);
// copy output tensor into register
threadwise_4d_tensor_op_
out
<
TFloat
,
decltype
(
out_thread_src_desc
),
decltype
(
out_thread_dst_desc
),
decltype
(
f_copy
)
>
(
threadwise_4d_tensor_op_
binary
<
TFloat
,
decltype
(
out_thread_src_desc
),
decltype
(
out_thread_dst_desc
),
decltype
(
f_copy
)
>
(
out_thread_src_desc
,
p_out_lds
+
out_desc
.
Get1dIndex
(
n_thread_work_begin
,
k_thread_work_begin
,
...
...
@@ -143,10 +143,10 @@ __device__ void blockwise_convolution(InDesc,
p_out_thread
);
// accumulate output tensor into LDS
threadwise_4d_tensor_op_
out
<
TFloat
,
decltype
(
out_thread_dst_desc
),
decltype
(
out_thread_src_desc
),
decltype
(
f_copy
)
>
(
threadwise_4d_tensor_op_
binary
<
TFloat
,
decltype
(
out_thread_dst_desc
),
decltype
(
out_thread_src_desc
),
decltype
(
f_copy
)
>
(
out_thread_dst_desc
,
p_out_thread
,
out_thread_src_desc
,
...
...
src/include/threadwise_tensor_op.cuh
View file @
73480fee
...
...
@@ -5,117 +5,35 @@
#if THREADWISE_TENSOR_OP_METHOD == 0
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
class
F
>
__device__
void
threadwise_4d_tensor_op_in
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
static_assert
(
is_same
<
decltype
(
src_desc
.
GetLengths
()),
decltype
(
dst_desc
.
GetLengths
())
>::
value
);
#if 0
if(threadIdx.x == 0)
{
print_ConstantTensorDescriptor(src_desc);
print_ConstantTensorDescriptor(dst_desc);
}
#endif
for
(
unsigned
did0
=
0
;
did0
<
src_desc
.
GetLength
(
I0
);
++
did0
)
{
for
(
unsigned
did1
=
0
;
did1
<
src_desc
.
GetLength
(
I1
);
++
did1
)
{
for
(
unsigned
did2
=
0
;
did2
<
src_desc
.
GetLength
(
I2
);
++
did2
)
{
for
(
unsigned
did3
=
0
;
did3
<
src_desc
.
GetLength
(
I3
);
++
did3
)
{
const
unsigned
sindex
=
src_desc
.
GetStride
(
I0
)
*
did0
+
src_desc
.
GetStride
(
I1
)
*
did1
+
src_desc
.
GetStride
(
I2
)
*
did2
+
src_desc
.
GetStride
(
I3
)
*
did3
;
const
unsigned
dindex
=
dst_desc
.
GetStride
(
I0
)
*
did0
+
dst_desc
.
GetStride
(
I1
)
*
did1
+
dst_desc
.
GetStride
(
I2
)
*
did2
+
dst_desc
.
GetStride
(
I3
)
*
did3
;
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
#if 0
if(threadIdx.x == 0)
{
printf("threadwise_4d_tensor_op: 1: thread id %u, \t"
"sindex %u, p_src[sindex] %f, \t"
"dindex %u, p_dst[dindex] %f\n",
threadIdx.x,
sindex,
p_src[sindex],
dindex,
p_dst[dindex]);
}
#endif
}
}
}
}
}
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
class
F
>
__device__
void
threadwise_4d_tensor_op_wei
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
__device__
void
threadwise_4d_tensor_op_unary
(
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
static_assert
(
is_same
<
decltype
(
src_desc
.
GetLengths
()),
decltype
(
dst_desc
.
GetLengths
())
>::
value
);
#if 0
if(threadIdx.x == 0)
{
print_ConstantTensorDescriptor(src_desc);
print_ConstantTensorDescriptor(dst_desc);
}
#endif
for
(
unsigned
did0
=
0
;
did0
<
src
_desc
.
GetLength
(
I0
);
++
did0
)
for
(
unsigned
did0
=
0
;
did0
<
dst
_desc
.
GetLength
(
I0
);
++
did0
)
{
for
(
unsigned
did1
=
0
;
did1
<
src
_desc
.
GetLength
(
I1
);
++
did1
)
for
(
unsigned
did1
=
0
;
did1
<
dst
_desc
.
GetLength
(
I1
);
++
did1
)
{
for
(
unsigned
did2
=
0
;
did2
<
src
_desc
.
GetLength
(
I2
);
++
did2
)
for
(
unsigned
did2
=
0
;
did2
<
dst
_desc
.
GetLength
(
I2
);
++
did2
)
{
for
(
unsigned
did3
=
0
;
did3
<
src
_desc
.
GetLength
(
I3
);
++
did3
)
for
(
unsigned
did3
=
0
;
did3
<
dst
_desc
.
GetLength
(
I3
);
++
did3
)
{
const
unsigned
sindex
=
src_desc
.
GetStride
(
I0
)
*
did0
+
src_desc
.
GetStride
(
I1
)
*
did1
+
src_desc
.
GetStride
(
I2
)
*
did2
+
src_desc
.
GetStride
(
I3
)
*
did3
;
const
unsigned
dindex
=
dst_desc
.
GetStride
(
I0
)
*
did0
+
dst_desc
.
GetStride
(
I1
)
*
did1
+
dst_desc
.
GetStride
(
I2
)
*
did2
+
dst_desc
.
GetStride
(
I3
)
*
did3
;
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
#if 0
if(threadIdx.x == 0)
{
printf("threadwise_4d_tensor_op: 1: thread id %u, \t"
"sindex %u, p_src[sindex] %f, \t"
"dindex %u, p_dst[dindex] %f\n",
threadIdx.x,
sindex,
p_src[sindex],
dindex,
p_dst[dindex]);
}
#endif
f
(
p_dst
[
dindex
]);
}
}
}
...
...
@@ -123,7 +41,7 @@ __device__ void threadwise_4d_tensor_op_wei(
}
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
class
F
>
__device__
void
threadwise_4d_tensor_op_
out
(
__device__
void
threadwise_4d_tensor_op_
binary
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
{
constexpr
auto
I0
=
Index
<
0
>
{};
...
...
@@ -161,20 +79,6 @@ __device__ void threadwise_4d_tensor_op_out(
dst_desc
.
GetStride
(
I2
)
*
did2
+
dst_desc
.
GetStride
(
I3
)
*
did3
;
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
#if 0
if(threadIdx.x == 0)
{
printf("threadwise_4d_tensor_op: 1: thread id %u, \t"
"sindex %u, p_src[sindex] %f, \t"
"dindex %u, p_dst[dindex] %f\n",
threadIdx.x,
sindex,
p_src[sindex],
dindex,
p_dst[dindex]);
}
#endif
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment