Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
yangql
composable_kernel-1
Commits
1b648f2f
Commit
1b648f2f
authored
Nov 04, 2018
by
Chao Liu
Browse files
use constant tensor descriptor
parent
9657baec
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
913 additions
and
61 deletions
+913
-61
driver/conv.cu
driver/conv.cu
+99
-29
src/include/constant_direct_convolution.cuh
src/include/constant_direct_convolution.cuh
+561
-0
src/include/constant_tensor_descriptor.cuh
src/include/constant_tensor_descriptor.cuh
+169
-0
src/include/device_tensor_descriptor.cuh
src/include/device_tensor_descriptor.cuh
+60
-0
src/include/direct_convolution.cuh
src/include/direct_convolution.cuh
+10
-10
src/include/tensor.hpp
src/include/tensor.hpp
+11
-14
src/tensor.cpp
src/tensor.cpp
+3
-8
No files found.
driver/conv.cu
View file @
1b648f2f
...
@@ -3,8 +3,14 @@
...
@@ -3,8 +3,14 @@
#include <initializer_list>
#include <initializer_list>
#include "nvToolsExt.h"
#include "nvToolsExt.h"
#include "tensor.hpp"
#include "tensor.hpp"
#include "device_tensor.cuh"
#include "constant_tensor_descriptor.cuh"
#include "device_tensor_descriptor.cuh"
#if 0
#include "direct_convolution.cuh"
#include "direct_convolution.cuh"
#else
#include "constant_direct_convolution.cuh"
#endif
template
<
class
T
>
template
<
class
T
>
struct
GeneratorConstant
struct
GeneratorConstant
...
@@ -38,11 +44,46 @@ struct GeneratorTensor
...
@@ -38,11 +44,46 @@ struct GeneratorTensor
}
}
};
};
template
<
typename
T
>
// this is ugly, only for 4d
void
host_convolution
(
const
Tensor
<
T
>&
in
,
template
<
class
TConstTensorDesc
>
const
Tensor
<
T
>&
wei
,
void
ostream_ConstantTensorDescriptor
(
TConstTensorDesc
,
std
::
ostream
&
os
=
std
::
cout
)
Tensor
<
T
>&
out
,
{
std
::
size_t
num_thread
)
static_assert
(
TConstTensorDesc
::
nDim
==
4
,
"nDim is not 4"
);
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
desc
=
TConstTensorDesc
{};
os
<<
"Lengths: {"
<<
desc
.
GetLength
(
I0
)
<<
", "
<<
desc
.
GetLength
(
I1
)
<<
", "
<<
desc
.
GetLength
(
I2
)
<<
", "
<<
desc
.
GetLength
(
I3
)
<<
"}, "
<<
"Strides: {"
<<
desc
.
GetStride
(
I0
)
<<
", "
<<
desc
.
GetStride
(
I1
)
<<
", "
<<
desc
.
GetStride
(
I2
)
<<
", "
<<
desc
.
GetStride
(
I3
)
<<
"}"
<<
std
::
endl
;
}
// this is ugly, only for 4d
template
<
class
TConstTensorDesc
>
auto
make_TensorDescriptor
(
TConstTensorDesc
)
{
static_assert
(
TConstTensorDesc
::
nDim
==
4
,
"nDim is not 4"
);
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
desc
=
TConstTensorDesc
{};
std
::
initializer_list
<
unsigned
>
lengths
=
{
desc
.
GetLength
(
I0
),
desc
.
GetLength
(
I1
),
desc
.
GetLength
(
I2
),
desc
.
GetLength
(
I3
)};
std
::
initializer_list
<
unsigned
>
strides
=
{
desc
.
GetStride
(
I0
),
desc
.
GetStride
(
I1
),
desc
.
GetStride
(
I2
),
desc
.
GetStride
(
I3
)};
return
TensorDescriptor
(
lengths
,
strides
);
}
template
<
class
T
>
void
host_convolution
(
const
Tensor
<
T
>&
in
,
const
Tensor
<
T
>&
wei
,
Tensor
<
T
>&
out
)
{
{
auto
f
=
[
&
](
auto
n
,
auto
k
,
auto
ho
,
auto
wo
)
{
auto
f
=
[
&
](
auto
n
,
auto
k
,
auto
ho
,
auto
wo
)
{
double
v
=
0
;
double
v
=
0
;
...
@@ -67,12 +108,12 @@ void host_convolution(const Tensor<T>& in,
...
@@ -67,12 +108,12 @@ void host_convolution(const Tensor<T>& in,
out
.
mDesc
.
GetLengths
()[
2
],
out
.
mDesc
.
GetLengths
()[
2
],
out
.
mDesc
.
GetLengths
()[
3
]);
out
.
mDesc
.
GetLengths
()[
3
]);
f_par
(
num_
thread
);
f_par
(
std
::
thread
::
hardware_concurrency
()
);
}
}
template
<
class
T
>
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_convolution
(
const
Tensor
<
T
>&
in
,
const
Tensor
<
T
>&
wei
,
Tensor
<
T
>&
out
)
void
device_convolution
(
InDesc
,
const
Tensor
<
T
>&
in
,
WeiDesc
,
const
Tensor
<
T
>&
wei
,
OutDesc
,
Tensor
<
T
>&
out
)
{
{
DeviceTensorDescriptor
<
4
>
in_desc_device
(
in
.
mDesc
);
DeviceTensorDescriptor
<
4
>
in_desc_device
(
in
.
mDesc
);
DeviceTensorDescriptor
<
4
>
wei_desc_device
(
wei
.
mDesc
);
DeviceTensorDescriptor
<
4
>
wei_desc_device
(
wei
.
mDesc
);
...
@@ -103,6 +144,7 @@ void device_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& ou
...
@@ -103,6 +144,7 @@ void device_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& ou
dim3
block_dim
(
64
,
1
,
1
);
dim3
block_dim
(
64
,
1
,
1
);
dim3
grid_dim
(
1
,
1
,
1
);
dim3
grid_dim
(
1
,
1
,
1
);
#if 0
gridwise_convolution<T, 3, 3, 4, 4, 2, 2, 1, 1, 8, 8, 1>
gridwise_convolution<T, 3, 3, 4, 4, 2, 2, 1, 1, 8, 8, 1>
<<<grid_dim, block_dim>>>(in_desc_device,
<<<grid_dim, block_dim>>>(in_desc_device,
static_cast<T*>(in_device_buf.GetDeviceBuffer()),
static_cast<T*>(in_device_buf.GetDeviceBuffer()),
...
@@ -110,6 +152,15 @@ void device_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& ou
...
@@ -110,6 +152,15 @@ void device_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& ou
static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
out_desc_device,
out_desc_device,
static_cast<T*>(out_device_buf.GetDeviceBuffer()));
static_cast<T*>(out_device_buf.GetDeviceBuffer()));
#else
gridwise_convolution
<
T
,
InDesc
,
WeiDesc
,
OutDesc
,
4
,
4
,
2
,
2
,
1
,
1
,
8
,
8
,
1
>
<<<
grid_dim
,
block_dim
>>>
(
InDesc
{},
static_cast
<
T
*>
(
in_device_buf
.
GetDeviceBuffer
()),
WeiDesc
{},
static_cast
<
T
*>
(
wei_device_buf
.
GetDeviceBuffer
()),
OutDesc
{},
static_cast
<
T
*>
(
out_device_buf
.
GetDeviceBuffer
()));
#endif
checkCudaErrors
(
cudaGetLastError
());
checkCudaErrors
(
cudaGetLastError
());
out_device_buf
.
FromDevice
(
out
.
mData
.
data
());
out_device_buf
.
FromDevice
(
out
.
mData
.
data
());
...
@@ -117,34 +168,53 @@ void device_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& ou
...
@@ -117,34 +168,53 @@ void device_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& ou
int
main
()
int
main
()
{
{
#if 0
#if 1
Tensor<float> in({3, 16, 130, 130});
constexpr
unsigned
N
=
1
;
Tensor<float> wei({4, 16, 3, 3});
constexpr
unsigned
C
=
1
;
Tensor<float> out_host({3, 4, 128, 128});
constexpr
unsigned
HI
=
18
;
constexpr
unsigned
WI
=
18
;
constexpr
unsigned
K
=
1
;
constexpr
unsigned
S
=
3
;
constexpr
unsigned
R
=
3
;
#elif 0
#elif 0
Tensor
<
float
>
in
({
1
,
1
,
130
,
130
});
constexpr
unsigned
N
=
1
;
Tensor
<
float
>
wei
({
1
,
1
,
3
,
3
});
constexpr
unsigned
C
=
1
;
Tensor
<
float
>
out_host
({
1
,
1
,
128
,
128
});
constexpr
unsigned
HI
=
130
;
#elif 1
constexpr
unsigned
WI
=
130
;
Tensor
<
float
>
in
({
1
,
1
,
18
,
18
});
constexpr
unsigned
K
=
1
;
Tensor
<
float
>
wei
({
1
,
1
,
3
,
3
});
constexpr
unsigned
S
=
3
;
Tensor
<
float
>
out_host
({
1
,
1
,
16
,
16
});
constexpr
unsigned
R
=
3
;
#else
#elif 0
Tensor
<
float
>
in
({
1
,
1
,
4
,
4
});
constexpr
unsigned
N
=
3
;
Tensor
<
float
>
wei
({
1
,
1
,
3
,
3
});
constexpr
unsigned
C
=
16
;
Tensor
<
float
>
out_host
({
1
,
1
,
2
,
2
});
constexpr
unsigned
HI
=
130
;
constexpr
unsigned
WI
=
130
;
constexpr
unsigned
K
=
4
;
constexpr
unsigned
S
=
3
;
constexpr
unsigned
R
=
3
;
#endif
#endif
auto
in_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
auto
wei_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
K
,
C
,
S
,
R
>
{});
auto
out_desc
=
get_output_4d_tensor_descriptor
(
in_desc
,
wei_desc
);
ostream_ConstantTensorDescriptor
(
in_desc
,
std
::
cout
<<
"in_desc: "
);
ostream_ConstantTensorDescriptor
(
wei_desc
,
std
::
cout
<<
"wei_desc: "
);
ostream_ConstantTensorDescriptor
(
out_desc
,
std
::
cout
<<
"out_desc: "
);
Tensor
<
float
>
in
(
make_TensorDescriptor
(
in_desc
));
Tensor
<
float
>
wei
(
make_TensorDescriptor
(
wei_desc
));
Tensor
<
float
>
out_host
(
make_TensorDescriptor
(
out_desc
));
Tensor
<
float
>
out_device
=
out_host
;
Tensor
<
float
>
out_device
=
out_host
;
int
num_thread
=
std
::
thread
::
hardware_concurrency
();
int
num_thread
=
std
::
thread
::
hardware_concurrency
();
std
::
cout
<<
__func__
<<
": num_thread "
<<
num_thread
<<
std
::
endl
;
in
.
GenerateTensorValue
(
GeneratorTensor
<
float
>
{},
num_thread
);
in
.
GenerateTensorValue
(
GeneratorTensor
<
float
>
{},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor
<
float
>
{},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor
<
float
>
{},
num_thread
);
host_convolution
(
in
,
wei
,
out_host
,
num_thread
);
host_convolution
(
in
,
wei
,
out_host
);
device_convolution
(
in
,
wei
,
out_device
);
device_convolution
(
in
_desc
,
in
,
wei_desc
,
wei
,
out_desc
,
out_device
);
std
::
cout
<<
__func__
<<
": done"
<<
std
::
endl
;
std
::
cout
<<
__func__
<<
": done"
<<
std
::
endl
;
...
...
src/include/constant_direct_convolution.cuh
0 → 100644
View file @
1b648f2f
#pragma once
#include "constant_tensor_descriptor.cuh"
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
unsigned
NWorkLen0
,
unsigned
NWorkLen1
,
unsigned
NWorkLen2
,
unsigned
NWorkLen3
,
class
F
>
__device__
void
blockwise_4d_tensor_op
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
#if 1
if
(
threadIdx
.
x
==
0
)
{
print_ConstantTensorDescriptor
(
src_desc
);
print_ConstantTensorDescriptor
(
dst_desc
);
}
#endif
constexpr
unsigned
NWorkStride3
=
1
;
constexpr
unsigned
NWorkStride2
=
NWorkLen3
*
NWorkStride3
;
constexpr
unsigned
NWorkStride1
=
NWorkLen2
*
NWorkStride2
;
constexpr
unsigned
NWorkStride0
=
NWorkLen1
*
NWorkStride1
;
unsigned
itmp
=
threadIdx
.
x
+
threadIdx
.
y
*
blockDim
.
x
+
threadIdx
.
z
*
(
blockDim
.
y
*
blockDim
.
x
);
const
unsigned
did0_begin
=
itmp
/
NWorkStride0
;
itmp
-=
did0_begin
*
NWorkStride0
;
const
unsigned
did1_begin
=
itmp
/
NWorkStride1
;
itmp
-=
did1_begin
*
NWorkStride1
;
const
unsigned
did2_begin
=
itmp
/
NWorkStride2
;
itmp
-=
did2_begin
*
NWorkStride2
;
const
unsigned
did3_begin
=
itmp
/
NWorkStride3
;
for
(
unsigned
did0
=
did0_begin
;
did0
<
src_desc
.
GetLength
(
I0
);
did0
+=
NWorkLen0
)
{
for
(
unsigned
did1
=
did1_begin
;
did1
<
src_desc
.
GetLength
(
I1
);
did1
+=
NWorkLen1
)
{
for
(
unsigned
did2
=
did2_begin
;
did2
<
src_desc
.
GetLength
(
I2
);
did2
+=
NWorkLen2
)
{
for
(
unsigned
did3
=
did3_begin
;
did3
<
src_desc
.
GetLength
(
I3
);
did3
+=
NWorkLen3
)
{
const
unsigned
sindex
=
src_desc
.
GetStride
(
I0
)
*
did0
+
src_desc
.
GetStride
(
I1
)
*
did1
+
src_desc
.
GetStride
(
I2
)
*
did2
+
src_desc
.
GetStride
(
I3
)
*
did3
;
const
unsigned
dindex
=
dst_desc
.
GetStride
(
I0
)
*
did0
+
dst_desc
.
GetStride
(
I1
)
*
did1
+
dst_desc
.
GetStride
(
I2
)
*
did2
+
dst_desc
.
GetStride
(
I3
)
*
did3
;
f
(
p_src
[
dindex
],
p_dst
[
sindex
]);
#if 0
// if(threadIdx.x == 0)
{
printf("blockwise_4d_tensor_op: 1: thread id %u, \t"
"sindex %u, p_src[sindex] %f, \t"
"dindex %u, p_dst[dindex] %f\n",
threadIdx.x,
sindex,
p_src[sindex],
dindex,
p_dst[dindex]);
}
#endif
}
}
}
}
}
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
class
F
>
__device__
void
threadwise_4d_tensor_op
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
#if 1
if
(
threadIdx
.
x
==
0
)
{
print_ConstantTensorDescriptor
(
src_desc
);
print_ConstantTensorDescriptor
(
dst_desc
);
}
#endif
for
(
unsigned
did0
=
0
;
did0
<
src_desc
.
GetLength
(
I0
);
++
did0
)
{
for
(
unsigned
did1
=
0
;
did1
<
src_desc
.
GetLength
(
I1
);
++
did1
)
{
for
(
unsigned
did2
=
0
;
did2
<
src_desc
.
GetLength
(
I2
);
++
did2
)
{
for
(
unsigned
did3
=
0
;
did3
<
src_desc
.
GetLength
(
I3
);
++
did3
)
{
const
unsigned
sindex
=
src_desc
.
GetStride
(
I0
)
*
did0
+
src_desc
.
GetStride
(
I1
)
*
did1
+
src_desc
.
GetStride
(
I2
)
*
did2
+
src_desc
.
GetStride
(
I3
)
*
did3
;
const
unsigned
dindex
=
dst_desc
.
GetStride
(
I0
)
*
did0
+
dst_desc
.
GetStride
(
I1
)
*
did1
+
dst_desc
.
GetStride
(
I2
)
*
did2
+
dst_desc
.
GetStride
(
I3
)
*
did3
;
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
#if 0
if(threadIdx.x == 0)
{
printf("threadwise_4d_tensor_op: 1: thread id %u, \t"
"sindex %u, p_src[sindex] %f, \t"
"dindex %u, p_dst[dindex] %f\n",
threadIdx.x,
sindex,
p_src[sindex],
dindex,
p_dst[dindex]);
}
#endif
}
}
}
}
}
template
<
class
TFloat
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
__device__
void
threadwise_direct_convolution
(
InDesc
,
TFloat
*
const
__restrict__
p_in
,
WeiDesc
,
TFloat
*
const
__restrict__
p_wei
,
OutDesc
,
TFloat
*
__restrict__
p_out
)
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
out_desc
=
OutDesc
{};
#if 1
if
(
threadIdx
.
x
==
0
)
{
print_ConstantTensorDescriptor
(
in_desc
);
print_ConstantTensorDescriptor
(
wei_desc
);
print_ConstantTensorDescriptor
(
out_desc
);
}
#endif
for
(
unsigned
n
=
0
;
n
<
out_desc
.
GetLength
(
I0
);
++
n
)
{
for
(
unsigned
k
=
0
;
k
<
out_desc
.
GetLength
(
I1
);
++
k
)
{
for
(
unsigned
ho
=
0
;
ho
<
out_desc
.
GetLength
(
I2
);
++
ho
)
{
for
(
unsigned
wo
=
0
;
wo
<
out_desc
.
GetLength
(
I3
);
++
wo
)
{
for
(
unsigned
c
=
0
;
c
<
wei_desc
.
GetLength
(
I1
);
++
c
)
{
for
(
unsigned
s
=
0
;
s
<
wei_desc
.
GetLength
(
I2
);
++
s
)
{
for
(
unsigned
r
=
0
;
r
<
wei_desc
.
GetLength
(
I3
);
++
r
)
{
const
unsigned
hi
=
ho
+
s
;
const
unsigned
wi
=
wo
+
r
;
const
unsigned
in_index
=
in_desc
.
GetStride
(
I0
)
*
n
+
in_desc
.
GetStride
(
I1
)
*
c
+
in_desc
.
GetStride
(
I2
)
*
hi
+
in_desc
.
GetStride
(
I3
)
*
wi
;
const
unsigned
wei_index
=
wei_desc
.
GetStride
(
I0
)
*
k
+
wei_desc
.
GetStride
(
I1
)
*
c
+
wei_desc
.
GetStride
(
I2
)
*
s
+
in_desc
.
GetStride
(
I3
)
*
r
;
const
unsigned
out_index
=
out_desc
.
GetStride
(
I0
)
*
n
+
out_desc
.
GetStride
(
I1
)
*
k
+
out_desc
.
GetStride
(
I2
)
*
ho
+
out_desc
.
GetStride
(
I3
)
*
wo
;
p_out
[
out_index
]
+=
p_wei
[
wei_index
]
*
p_in
[
in_index
];
#if 0
if(threadIdx.x == 0)
{
printf("threadwise_direct_convolution: 1: \t"
"threadIdx.x %u\t"
"out_index %u, p_out[out_index] %f, \t"
"wei_index %u, p_wei[wei_index] %f, \t"
"in_index %u, p_in[in_index] %f\n",
threadIdx.x,
out_index,
p_out[out_index],
wei_index,
p_wei[wei_index],
in_index,
p_in[in_index]);
}
#endif
}
}
}
}
}
}
}
}
template
<
class
TFloat
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
,
unsigned
S
,
unsigned
R
,
unsigned
InTileSizeH
,
unsigned
InTileSizeW
,
unsigned
OutTileSizeH
,
unsigned
OutTileSizeW
,
unsigned
NPerBlock
,
unsigned
KPerBlock
,
unsigned
YPerBlock
,
unsigned
XPerBlock
,
unsigned
CPerBlockLoop
>
__device__
void
blockwise_convolution
(
InDesc
,
TFloat
*
const
__restrict__
p_in
,
WeiDesc
,
TFloat
*
const
__restrict__
p_wei
,
OutDesc
,
TFloat
*
__restrict__
p_out
)
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
out_desc
=
OutDesc
{};
#if 1
if
(
threadIdx
.
x
==
0
)
{
print_ConstantTensorDescriptor
(
in_desc
);
print_ConstantTensorDescriptor
(
wei_desc
);
print_ConstantTensorDescriptor
(
out_desc
);
}
#endif
constexpr
auto
in_thread_src_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
1
,
CPerBlockLoop
,
OutTileSizeH
+
S
-
1
,
OutTileSizeW
+
R
-
1
>
{},
in_desc
.
GetStrides
());
constexpr
auto
wei_thread_src_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
1
,
CPerBlockLoop
,
S
,
R
>
{},
wei_desc
.
GetStrides
());
constexpr
auto
out_thread_src_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
1
,
1
,
OutTileSizeH
,
OutTileSizeW
>
{},
out_desc
.
GetStrides
());
constexpr
auto
in_thread_dst_desc
=
make_ConstantTensorDescriptor
(
in_thread_src_desc
.
GetLengths
());
constexpr
auto
wei_thread_dst_desc
=
make_ConstantTensorDescriptor
(
wei_thread_src_desc
.
GetLengths
());
constexpr
auto
out_thread_dst_desc
=
make_ConstantTensorDescriptor
(
out_thread_src_desc
.
GetLengths
());
const
unsigned
thread_sz
=
blockDim
.
x
*
blockDim
.
y
*
blockDim
.
z
;
const
unsigned
thread_id
=
threadIdx
.
x
+
threadIdx
.
y
*
blockDim
.
x
+
threadIdx
.
z
*
(
blockDim
.
y
*
blockDim
.
x
);
for
(
unsigned
thread_work_id
=
thread_id
;
thread_work_id
<
NPerBlock
*
KPerBlock
*
YPerBlock
*
XPerBlock
;
thread_work_id
+=
thread_sz
)
{
unsigned
itmp
=
thread_work_id
;
unsigned
n_thread_work_id
=
itmp
/
(
KPerBlock
*
YPerBlock
*
XPerBlock
);
itmp
-=
n_thread_work_id
*
(
KPerBlock
*
YPerBlock
*
XPerBlock
);
unsigned
k_thread_work_id
=
itmp
/
(
YPerBlock
*
XPerBlock
);
itmp
-=
k_thread_work_id
*
(
YPerBlock
*
XPerBlock
);
unsigned
y_thread_work_id
=
itmp
/
XPerBlock
;
unsigned
x_thread_work_id
=
itmp
-
y_thread_work_id
*
XPerBlock
;
unsigned
n_thread_work_begin
=
n_thread_work_id
*
1
;
unsigned
k_thread_work_begin
=
k_thread_work_id
*
1
;
unsigned
ho_thread_work_begin
=
y_thread_work_id
*
OutTileSizeH
;
unsigned
wo_thread_work_begin
=
x_thread_work_id
*
OutTileSizeW
;
unsigned
hi_thread_work_begin
=
ho_thread_work_begin
;
// minus padding
unsigned
wi_thread_work_begin
=
wo_thread_work_begin
;
// minus padding
TFloat
p_in_thread
[
1
*
CPerBlockLoop
*
InTileSizeH
*
InTileSizeW
];
TFloat
p_wei_thread
[
1
*
CPerBlockLoop
*
S
*
R
];
TFloat
p_out_thread
[
1
*
1
*
OutTileSizeH
*
OutTileSizeW
];
auto
f_copy
=
[](
const
TFloat
&
src
,
TFloat
&
dst
)
{
dst
=
src
;
};
// copy input tensor into register
threadwise_4d_tensor_op
<
TFloat
,
decltype
(
in_thread_src_desc
),
decltype
(
in_thread_dst_desc
),
decltype
(
f_copy
)
>
(
in_thread_src_desc
,
p_in
+
in_desc
.
Get1dIndex
(
n_thread_work_begin
,
0
,
hi_thread_work_begin
,
wi_thread_work_begin
),
in_thread_dst_desc
,
p_in_thread
,
f_copy
);
// copy weight tensor into register
threadwise_4d_tensor_op
<
TFloat
,
decltype
(
wei_thread_src_desc
),
decltype
(
wei_thread_dst_desc
),
decltype
(
f_copy
)
>
(
wei_thread_src_desc
,
p_wei
+
wei_desc
.
Get1dIndex
(
k_thread_work_begin
,
0
,
0
,
0
),
wei_thread_dst_desc
,
p_wei_thread
,
f_copy
);
// copy output tensor into register
threadwise_4d_tensor_op
<
TFloat
,
decltype
(
out_thread_src_desc
),
decltype
(
out_thread_dst_desc
),
decltype
(
f_copy
)
>
(
out_thread_src_desc
,
p_out
+
out_desc
.
Get1dIndex
(
n_thread_work_begin
,
k_thread_work_begin
,
ho_thread_work_begin
,
wo_thread_work_begin
),
out_thread_dst_desc
,
p_out_thread
,
f_copy
);
// threadwise convolution
threadwise_direct_convolution
<
TFloat
,
decltype
(
in_thread_dst_desc
),
decltype
(
wei_thread_dst_desc
),
decltype
(
out_thread_dst_desc
)
>
(
in_thread_dst_desc
,
p_in_thread
,
wei_thread_dst_desc
,
p_wei_thread
,
out_thread_dst_desc
,
p_out_thread
);
// accumulate output tensor into device mem
threadwise_4d_tensor_op
<
TFloat
,
decltype
(
out_thread_dst_desc
),
decltype
(
out_thread_src_desc
),
decltype
(
f_copy
)
>
(
out_thread_dst_desc
,
p_out_thread
,
out_thread_src_desc
,
p_out
+
out_desc
.
Get1dIndex
(
n_thread_work_begin
,
k_thread_work_begin
,
ho_thread_work_begin
,
wo_thread_work_begin
),
f_copy
);
}
}
template
<
class
TFloat
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
,
unsigned
InTileSizeH
,
unsigned
InTileSizeW
,
unsigned
OutTileSizeH
,
unsigned
OutTileSizeW
,
unsigned
NPerBlock
,
unsigned
KPerBlock
,
unsigned
YPerBlock
,
unsigned
XPerBlock
,
unsigned
CPerBlockLoop
>
__global__
void
gridwise_convolution
(
InDesc
,
TFloat
*
const
__restrict__
p_in
,
WeiDesc
,
TFloat
*
const
__restrict__
p_wei
,
OutDesc
,
TFloat
*
__restrict__
p_out
)
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
out_desc
=
OutDesc
{};
constexpr
unsigned
S
=
wei_desc
.
GetLength
(
I2
);
constexpr
unsigned
R
=
wei_desc
.
GetLength
(
I3
);
#if 1
if
(
threadIdx
.
x
==
0
)
{
print_ConstantTensorDescriptor
(
in_desc
);
print_ConstantTensorDescriptor
(
wei_desc
);
print_ConstantTensorDescriptor
(
out_desc
);
}
#endif
constexpr
unsigned
NBlockWork
=
(
in_desc
.
GetLength
(
I0
)
+
NPerBlock
-
1
)
/
NPerBlock
;
constexpr
unsigned
YBlockWork
=
(
in_desc
.
GetLength
(
I2
)
+
YPerBlock
-
1
)
/
YPerBlock
;
constexpr
unsigned
XBlockWork
=
(
in_desc
.
GetLength
(
I3
)
+
XPerBlock
-
1
)
/
XPerBlock
;
constexpr
unsigned
KBlockWork
=
(
wei_desc
.
GetLength
(
I1
)
+
KPerBlock
-
1
)
/
KPerBlock
;
const
unsigned
block_id
=
blockIdx
.
x
+
blockIdx
.
y
*
gridDim
.
x
+
blockIdx
.
z
*
(
gridDim
.
y
*
gridDim
.
x
);
constexpr
auto
in_block_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
NPerBlock
,
CPerBlockLoop
,
YPerBlock
*
OutTileSizeH
+
S
-
1
,
XPerBlock
*
OutTileSizeW
+
R
-
1
>
{});
constexpr
auto
wei_block_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
KPerBlock
,
CPerBlockLoop
,
S
,
R
>
{});
constexpr
auto
out_block_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
NPerBlock
,
KPerBlock
,
YPerBlock
*
OutTileSizeH
,
XPerBlock
*
OutTileSizeW
>
{});
__shared__
TFloat
p_in_block
[
NPerBlock
*
CPerBlockLoop
*
(
YPerBlock
*
OutTileSizeH
+
S
-
1
)
*
(
XPerBlock
*
OutTileSizeW
+
R
-
1
)];
__shared__
TFloat
p_wei_block
[
KPerBlock
*
CPerBlockLoop
*
S
*
R
];
__shared__
TFloat
p_out_block
[
NPerBlock
*
KPerBlock
*
(
YPerBlock
*
OutTileSizeH
)
*
(
XPerBlock
*
OutTileSizeW
)];
unsigned
itmp
=
block_id
;
unsigned
n_block_work_id
=
itmp
/
(
KBlockWork
*
YBlockWork
*
XBlockWork
);
itmp
-=
n_block_work_id
*
(
KBlockWork
*
YBlockWork
*
XBlockWork
);
unsigned
k_block_work_id
=
itmp
/
(
YBlockWork
*
XBlockWork
);
itmp
-=
k_block_work_id
*
(
YBlockWork
*
XBlockWork
);
unsigned
y_block_work_id
=
itmp
/
XBlockWork
;
unsigned
x_block_work_id
=
itmp
-
y_block_work_id
*
XBlockWork
;
unsigned
n_block_work_begin
=
n_block_work_id
*
NPerBlock
;
unsigned
k_block_work_begin
=
k_block_work_id
*
KPerBlock
;
unsigned
y_block_work_begin
=
y_block_work_id
*
YPerBlock
;
unsigned
x_block_work_begin
=
x_block_work_id
*
XPerBlock
;
unsigned
ho_block_work_begin
=
y_block_work_begin
*
OutTileSizeH
;
unsigned
wo_block_work_begin
=
x_block_work_begin
*
OutTileSizeW
;
unsigned
hi_block_work_begin
=
ho_block_work_begin
;
// minus padding
unsigned
wi_block_work_begin
=
wo_block_work_begin
;
// minus padding
for
(
unsigned
c_block_work_begin
=
0
;
c_block_work_begin
<
in_desc
.
GetLength
(
I1
);
c_block_work_begin
+=
CPerBlockLoop
)
{
auto
f_copy
=
[](
const
TFloat
&
src
,
TFloat
&
dst
)
{
dst
=
src
;
};
// copy input tensor to LDS
blockwise_4d_tensor_op
<
TFloat
,
decltype
(
in_desc
),
decltype
(
in_block_desc
),
1
,
1
,
1
,
64
,
decltype
(
f_copy
)
>
(
in_desc
,
p_in
+
in_desc
.
Get1dIndex
(
n_block_work_begin
,
c_block_work_begin
,
hi_block_work_begin
,
wi_block_work_begin
),
in_block_desc
,
p_in_block
,
f_copy
);
// copy weight tensor to LDS
blockwise_4d_tensor_op
<
TFloat
,
decltype
(
wei_desc
),
decltype
(
wei_block_desc
),
1
,
1
,
1
,
64
,
decltype
(
f_copy
)
>
(
wei_desc
,
p_wei
+
wei_desc
.
Get1dIndex
(
k_block_work_begin
,
c_block_work_begin
,
0
,
0
),
wei_block_desc
,
p_wei_block
,
f_copy
);
// copy output tensor to LDS
blockwise_4d_tensor_op
<
TFloat
,
decltype
(
out_desc
),
decltype
(
out_block_desc
),
1
,
1
,
1
,
64
,
decltype
(
f_copy
)
>
(
out_desc
,
p_out
+
out_desc
.
Get1dIndex
(
n_block_work_begin
,
k_block_work_begin
,
ho_block_work_begin
,
wo_block_work_begin
),
out_block_desc
,
p_out_block
,
f_copy
);
__syncthreads
();
// blockwise convolution
blockwise_convolution
<
TFloat
,
decltype
(
in_block_desc
),
decltype
(
wei_block_desc
),
decltype
(
out_block_desc
),
S
,
R
,
InTileSizeH
,
InTileSizeW
,
OutTileSizeH
,
OutTileSizeW
,
NPerBlock
,
KPerBlock
,
YPerBlock
,
XPerBlock
,
CPerBlockLoop
>
(
in_block_desc
,
p_in_block
,
wei_block_desc
,
p_wei_block
,
out_block_desc
,
p_out_block
);
__syncthreads
();
// accum output tensor from LDS to device mem
blockwise_4d_tensor_op
<
TFloat
,
decltype
(
out_block_desc
),
decltype
(
out_desc
),
1
,
1
,
1
,
64
,
decltype
(
f_copy
)
>
(
out_block_desc
,
p_out_block
,
out_desc
,
p_out
+
out_desc
.
Get1dIndex
(
n_block_work_begin
,
k_block_work_begin
,
ho_block_work_begin
,
wo_block_work_begin
),
f_copy
);
}
}
src/include/constant_tensor_descriptor.cuh
0 → 100644
View file @
1b648f2f
#pragma once
#include "helper_cuda.h"
template
<
class
T
,
T
N
>
struct
Constant
{
const
T
mValue
=
N
;
};
template
<
unsigned
I
>
using
Index
=
Constant
<
unsigned
,
I
>
;
template
<
unsigned
...
Is
>
struct
Sequence
{
static
constexpr
unsigned
nDim
=
sizeof
...(
Is
);
const
unsigned
mData
[
nDim
]
=
{
Is
...};
template
<
unsigned
I
>
__host__
__device__
constexpr
unsigned
Get
(
Index
<
I
>
)
const
{
return
mData
[
I
];
}
};
#if 0
template<class F, class T, T... Is>
void for_each(F f, std::integer_sequence<T, Is...>)
{
f(Is)...;
}
template<class F, class T, T N>
void for_n_time(F f, Constant<T, N>)
{
for_each(f, std::make_integer_sequence<T, N>{});
}
#endif
template
<
class
Lengths
,
class
Strides
>
struct
ConstantTensorDescriptor
{
static
constexpr
unsigned
nDim
=
Lengths
::
nDim
;
using
NDimConstant
=
Index
<
nDim
>
;
__host__
__device__
constexpr
ConstantTensorDescriptor
()
{
static_assert
(
Lengths
::
nDim
==
Strides
::
nDim
,
"nDim not consistent"
);
}
__host__
__device__
constexpr
unsigned
GetDimension
()
const
{
return
nDim
;
}
__host__
__device__
constexpr
Lengths
GetLengths
()
const
{
return
Lengths
{};
}
__host__
__device__
constexpr
Strides
GetStrides
()
const
{
return
Strides
{};
}
template
<
unsigned
I
>
__host__
__device__
constexpr
unsigned
GetLength
(
Index
<
I
>
)
const
{
return
Lengths
{}.
Get
(
Index
<
I
>
{});
}
template
<
unsigned
I
>
__host__
__device__
constexpr
unsigned
GetStride
(
Index
<
I
>
)
const
{
return
Strides
{}.
Get
(
Index
<
I
>
{});
}
#if 0
template <class... Is>
__host__ __device__ unsigned Get1dIndex(Is... is) const
{
static_assert(nDim == sizeof...(Is), "nDim not consistent");
const unsigned iss[nDim] = {static_cast<unsigned>(is)...};
unsigned idx = 0;
for_n_time([&](auto iDim) { idx += iss[iDim] * GetStride<iDim>(); }, NDimConstant{});
return idx;
}
#elif
1
// this is ugly, only for 4d
__host__
__device__
unsigned
Get1dIndex
(
unsigned
n
,
unsigned
c
,
unsigned
h
,
unsigned
w
)
const
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
static_assert
(
nDim
==
4
,
"nDim not consistent"
);
return
n
*
GetStride
(
I0
)
+
c
*
GetStride
(
I1
)
+
h
*
GetStride
(
I2
)
+
w
*
GetStride
(
I3
);
}
#endif
};
// this is ugly, only for 4d
template
<
unsigned
N
,
unsigned
C
,
unsigned
H
,
unsigned
W
>
__host__
__device__
constexpr
auto
calculate_default_strides
(
Sequence
<
N
,
C
,
H
,
W
>
)
{
return
Sequence
<
C
*
H
*
W
,
H
*
W
,
W
,
1
>
{};
}
template
<
class
Lengths
>
__host__
__device__
constexpr
auto
make_ConstantTensorDescriptor
(
Lengths
)
{
using
Strides
=
decltype
(
calculate_default_strides
(
Lengths
{}));
return
ConstantTensorDescriptor
<
Lengths
,
Strides
>
{};
}
template
<
class
Lengths
,
class
Strides
>
__host__
__device__
constexpr
auto
make_ConstantTensorDescriptor
(
Lengths
,
Strides
)
{
return
ConstantTensorDescriptor
<
Lengths
,
Strides
>
{};
}
// this is ugly, only for 4d
template
<
class
InDesc
,
class
WeiDesc
>
__host__
__device__
constexpr
auto
get_output_4d_tensor_descriptor
(
InDesc
,
WeiDesc
)
{
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
static_assert
(
in_desc
.
GetDimension
()
==
4
,
"input nDim is not 4"
);
static_assert
(
wei_desc
.
GetDimension
()
==
4
,
"weight nDim is not 4"
);
static_assert
(
in_desc
.
GetLength
(
I1
)
==
wei_desc
.
GetLength
(
I1
),
"input & weight dimension not consistent"
);
constexpr
auto
N
=
in_desc
.
GetLength
(
I0
);
constexpr
auto
HI
=
in_desc
.
GetLength
(
I2
);
constexpr
auto
WI
=
in_desc
.
GetLength
(
I3
);
constexpr
auto
K
=
wei_desc
.
GetLength
(
I0
);
constexpr
auto
S
=
wei_desc
.
GetLength
(
I2
);
constexpr
auto
R
=
wei_desc
.
GetLength
(
I3
);
constexpr
auto
HO
=
HI
-
S
+
1
;
constexpr
auto
WO
=
WI
-
R
+
1
;
return
make_ConstantTensorDescriptor
(
Sequence
<
N
,
K
,
HO
,
WO
>
{});
}
// this is ugly, only for 4d
template
<
class
TDesc
>
__host__
__device__
void
print_ConstantTensorDescriptor
(
TDesc
)
{
constexpr
auto
desc
=
TDesc
{};
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
static_assert
(
desc
.
GetDimension
()
==
4
,
"dim is not 4"
);
printf
(
"dim %u, lengths {%u %u %u %u}, strides {%u %u %u %u}
\n
"
,
desc
.
GetDimension
(),
desc
.
GetLength
(
I0
),
desc
.
GetLength
(
I1
),
desc
.
GetLength
(
I2
),
desc
.
GetLength
(
I3
),
desc
.
GetStride
(
I0
),
desc
.
GetStride
(
I1
),
desc
.
GetStride
(
I2
),
desc
.
GetStride
(
I3
));
}
src/include/device_tensor.cuh
→
src/include/device_tensor
_descriptor
.cuh
View file @
1b648f2f
#pragma once
#pragma once
#include <algorithm>
#include <algorithm>
#include "constant_tensor_descriptor.cuh"
#include "helper_cuda.h"
#include "helper_cuda.h"
#include "tensor.hpp"
#include "tensor.hpp"
...
@@ -19,7 +20,7 @@ struct DeviceTensorDescriptor
...
@@ -19,7 +20,7 @@ struct DeviceTensorDescriptor
__host__
__device__
unsigned
GetStride
(
unsigned
i
)
const
{
return
mpStrides
[
i
];
}
__host__
__device__
unsigned
GetStride
(
unsigned
i
)
const
{
return
mpStrides
[
i
];
}
// this is ugly
// this is ugly
, only for 4d
__host__
__device__
unsigned
Get1dIndex
(
unsigned
n
,
unsigned
c
,
unsigned
h
,
unsigned
w
)
const
__host__
__device__
unsigned
Get1dIndex
(
unsigned
n
,
unsigned
c
,
unsigned
h
,
unsigned
w
)
const
{
{
return
n
*
mpStrides
[
0
]
+
c
*
mpStrides
[
1
]
+
h
*
mpStrides
[
2
]
+
w
*
mpStrides
[
3
];
return
n
*
mpStrides
[
0
]
+
c
*
mpStrides
[
1
]
+
h
*
mpStrides
[
2
]
+
w
*
mpStrides
[
3
];
...
@@ -28,3 +29,32 @@ struct DeviceTensorDescriptor
...
@@ -28,3 +29,32 @@ struct DeviceTensorDescriptor
unsigned
mpLengths
[
NDim
];
unsigned
mpLengths
[
NDim
];
unsigned
mpStrides
[
NDim
];
unsigned
mpStrides
[
NDim
];
};
};
// this is ugly, only for 4d
template
<
class
TConstTensorDesc
>
__host__
__device__
auto
make_DeviceTensorDescriptor
(
TConstTensorDesc
)
{
static_assert
(
TConstTensorDesc
::
nDim
==
4
,
"nDim is not 4"
);
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
const_desc
=
TConstTensorDesc
{};
constexpr
auto
ndim
=
const_desc
.
GetDimension
();
auto
desc
=
DeviceTensorDescriptor
<
ndim
>
{};
desc
.
mpLengths
[
0
]
=
const_desc
.
GetLength
(
I0
);
desc
.
mpLengths
[
1
]
=
const_desc
.
GetLength
(
I1
);
desc
.
mpLengths
[
2
]
=
const_desc
.
GetLength
(
I2
);
desc
.
mpLengths
[
3
]
=
const_desc
.
GetLength
(
I3
);
desc
.
mpStrides
[
0
]
=
const_desc
.
GetStride
(
I0
);
desc
.
mpStrides
[
1
]
=
const_desc
.
GetStride
(
I1
);
desc
.
mpStrides
[
2
]
=
const_desc
.
GetStride
(
I2
);
desc
.
mpStrides
[
3
]
=
const_desc
.
GetStride
(
I3
);
return
desc
;
}
src/include/direct_convolution.cuh
View file @
1b648f2f
#pragma once
#pragma once
#include "device_tensor.cuh"
#include "device_tensor
_descriptor
.cuh"
template
<
class
TFloat
,
template
<
class
TFloat
,
unsigned
NWorkLen0
,
unsigned
NWorkLen0
,
...
@@ -13,7 +13,7 @@ __device__ void blockwise_4d_tensor_op(const DeviceTensorDescriptor<4>& src_desc
...
@@ -13,7 +13,7 @@ __device__ void blockwise_4d_tensor_op(const DeviceTensorDescriptor<4>& src_desc
TFloat
*
__restrict__
p_dst
,
TFloat
*
__restrict__
p_dst
,
F
f
)
F
f
)
{
{
#if
1
#if
0
if(threadIdx.x == 0)
if(threadIdx.x == 0)
{
{
printf("blockwise_4d_tensor_op: 0: \t"
printf("blockwise_4d_tensor_op: 0: \t"
...
@@ -80,7 +80,7 @@ __device__ void blockwise_4d_tensor_op(const DeviceTensorDescriptor<4>& src_desc
...
@@ -80,7 +80,7 @@ __device__ void blockwise_4d_tensor_op(const DeviceTensorDescriptor<4>& src_desc
f
(
p_src
[
dindex
],
p_dst
[
sindex
]);
f
(
p_src
[
dindex
],
p_dst
[
sindex
]);
#if
1
#if
0
// if(threadIdx.x == 0)
// if(threadIdx.x == 0)
{
{
printf("blockwise_4d_tensor_op: 1: thread id %u, \t"
printf("blockwise_4d_tensor_op: 1: thread id %u, \t"
...
@@ -106,7 +106,7 @@ __device__ void threadwise_4d_tensor_op(const DeviceTensorDescriptor<4>& src_des
...
@@ -106,7 +106,7 @@ __device__ void threadwise_4d_tensor_op(const DeviceTensorDescriptor<4>& src_des
TFloat
*
__restrict__
p_dst
,
TFloat
*
__restrict__
p_dst
,
F
f
)
F
f
)
{
{
#if
1
#if
0
if(threadIdx.x == 0)
if(threadIdx.x == 0)
{
{
printf("threadwise_4d_tensor_op: 0: \t"
printf("threadwise_4d_tensor_op: 0: \t"
...
@@ -151,7 +151,7 @@ __device__ void threadwise_4d_tensor_op(const DeviceTensorDescriptor<4>& src_des
...
@@ -151,7 +151,7 @@ __device__ void threadwise_4d_tensor_op(const DeviceTensorDescriptor<4>& src_des
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
#if
1
#if
0
if(threadIdx.x == 0)
if(threadIdx.x == 0)
{
{
printf("threadwise_4d_tensor_op: 1: thread id %u, \t"
printf("threadwise_4d_tensor_op: 1: thread id %u, \t"
...
@@ -178,7 +178,7 @@ __device__ void threadwise_direct_convolution(const DeviceTensorDescriptor<4>& i
...
@@ -178,7 +178,7 @@ __device__ void threadwise_direct_convolution(const DeviceTensorDescriptor<4>& i
const
DeviceTensorDescriptor
<
4
>&
out_desc
,
const
DeviceTensorDescriptor
<
4
>&
out_desc
,
TFloat
*
__restrict__
p_out
)
TFloat
*
__restrict__
p_out
)
{
{
#if
1
#if
0
if(threadIdx.x == 0)
if(threadIdx.x == 0)
{
{
printf("threadwise_direct_convolution: 0: \t"
printf("threadwise_direct_convolution: 0: \t"
...
@@ -212,7 +212,7 @@ __device__ void threadwise_direct_convolution(const DeviceTensorDescriptor<4>& i
...
@@ -212,7 +212,7 @@ __device__ void threadwise_direct_convolution(const DeviceTensorDescriptor<4>& i
out_desc.GetStride(2),
out_desc.GetStride(2),
out_desc.GetStride(3));
out_desc.GetStride(3));
}
}
#elif
1
#elif
0
{
{
printf
(
"threadwise_direct_convolution: 0:
\t
"
printf
(
"threadwise_direct_convolution: 0:
\t
"
"threadIdx.x %u
\t
"
"threadIdx.x %u
\t
"
...
@@ -275,7 +275,7 @@ __device__ void threadwise_direct_convolution(const DeviceTensorDescriptor<4>& i
...
@@ -275,7 +275,7 @@ __device__ void threadwise_direct_convolution(const DeviceTensorDescriptor<4>& i
p_out
[
out_index
]
+=
p_wei
[
wei_index
]
*
p_in
[
in_index
];
p_out
[
out_index
]
+=
p_wei
[
wei_index
]
*
p_in
[
in_index
];
#if
1
#if
0
if(threadIdx.x == 0)
if(threadIdx.x == 0)
{
{
printf("threadwise_direct_convolution: 1: \t"
printf("threadwise_direct_convolution: 1: \t"
...
@@ -320,7 +320,7 @@ __device__ void blockwise_convolution(const DeviceTensorDescriptor<4>& in_desc,
...
@@ -320,7 +320,7 @@ __device__ void blockwise_convolution(const DeviceTensorDescriptor<4>& in_desc,
const
DeviceTensorDescriptor
<
4
>&
out_desc
,
const
DeviceTensorDescriptor
<
4
>&
out_desc
,
TFloat
*
__restrict__
p_out
)
TFloat
*
__restrict__
p_out
)
{
{
#if
1
#if
0
if(threadIdx.x == 0)
if(threadIdx.x == 0)
{
{
printf("blockwise_convolution: 0: \t"
printf("blockwise_convolution: 0: \t"
...
@@ -501,7 +501,7 @@ __global__ void gridwise_convolution(const DeviceTensorDescriptor<4> in_desc,
...
@@ -501,7 +501,7 @@ __global__ void gridwise_convolution(const DeviceTensorDescriptor<4> in_desc,
const
DeviceTensorDescriptor
<
4
>
out_desc
,
const
DeviceTensorDescriptor
<
4
>
out_desc
,
TFloat
*
__restrict__
p_out
)
TFloat
*
__restrict__
p_out
)
{
{
#if
1
#if
0
if(threadIdx.x == 0)
if(threadIdx.x == 0)
{
{
printf("gridwise_convolution: 0: \t"
printf("gridwise_convolution: 0: \t"
...
...
src/include/tensor.hpp
View file @
1b648f2f
...
@@ -69,28 +69,25 @@ auto construct_f_unpack_args(F, T args)
...
@@ -69,28 +69,25 @@ auto construct_f_unpack_args(F, T args)
struct
TensorDescriptor
struct
TensorDescriptor
{
{
TensorDescriptor
()
=
delete
;
TensorDescriptor
()
=
delete
;
TensorDescriptor
(
DataType_t
t
,
std
::
initializer_list
<
std
::
size_t
>
lens
);
TensorDescriptor
(
std
::
initializer_list
<
std
::
size_t
>
lens
);
TensorDescriptor
(
DataType_t
t
,
TensorDescriptor
(
std
::
initializer_list
<
std
::
size_t
>
lens
,
std
::
initializer_list
<
std
::
size_t
>
lens
,
std
::
initializer_list
<
std
::
size_t
>
strides
);
std
::
initializer_list
<
std
::
size_t
>
strides
);
TensorDescriptor
(
DataType_t
t
,
std
::
vector
<
std
::
size_t
>
lens
,
std
::
vector
<
std
::
size_t
>
strides
);
TensorDescriptor
(
std
::
vector
<
std
::
size_t
>
lens
,
std
::
vector
<
std
::
size_t
>
strides
);
void
CalculateStrides
();
void
CalculateStrides
();
template
<
class
Range
>
template
<
class
Range
>
TensorDescriptor
(
DataType_t
t
,
const
Range
&
lens
)
TensorDescriptor
(
const
Range
&
lens
)
:
mLens
(
lens
.
begin
(),
lens
.
end
())
:
mLens
(
lens
.
begin
(),
lens
.
end
()),
mDataType
(
t
)
{
{
this
->
CalculateStrides
();
this
->
CalculateStrides
();
}
}
template
<
class
Range1
,
class
Range2
>
template
<
class
Range1
,
class
Range2
>
TensorDescriptor
(
DataType_t
t
,
const
Range1
&
lens
,
const
Range2
&
strides
)
TensorDescriptor
(
const
Range1
&
lens
,
const
Range2
&
strides
)
:
mLens
(
lens
.
begin
(),
lens
.
end
()),
mStrides
(
strides
.
begin
(),
strides
.
end
())
,
mDataType
(
t
)
:
mLens
(
lens
.
begin
(),
lens
.
end
()),
mStrides
(
strides
.
begin
(),
strides
.
end
())
{
{
}
}
DataType_t
GetDataType
()
const
;
std
::
size_t
GetDimension
()
const
;
std
::
size_t
GetDimension
()
const
;
std
::
size_t
GetElementSize
()
const
;
std
::
size_t
GetElementSize
()
const
;
std
::
size_t
GetElementSpace
()
const
;
std
::
size_t
GetElementSpace
()
const
;
...
@@ -107,7 +104,6 @@ struct TensorDescriptor
...
@@ -107,7 +104,6 @@ struct TensorDescriptor
}
}
private:
private:
DataType_t
mDataType
;
std
::
vector
<
std
::
size_t
>
mLens
;
std
::
vector
<
std
::
size_t
>
mLens
;
std
::
vector
<
std
::
size_t
>
mStrides
;
std
::
vector
<
std
::
size_t
>
mStrides
;
};
};
...
@@ -220,22 +216,23 @@ template <class T>
...
@@ -220,22 +216,23 @@ template <class T>
struct
Tensor
struct
Tensor
{
{
template
<
class
X
>
template
<
class
X
>
Tensor
(
std
::
initializer_list
<
X
>
lens
)
Tensor
(
std
::
initializer_list
<
X
>
lens
)
:
mDesc
(
lens
),
mData
(
mDesc
.
GetElementSpace
())
:
mDesc
(
DataType
<
T
>
{},
lens
),
mData
(
mDesc
.
GetElementSpace
())
{
{
}
}
template
<
class
X
>
template
<
class
X
>
Tensor
(
std
::
vector
<
X
>
lens
)
:
mDesc
(
DataType
<
T
>
{},
lens
),
mData
(
mDesc
.
GetElementSpace
())
Tensor
(
std
::
vector
<
X
>
lens
)
:
mDesc
(
lens
),
mData
(
mDesc
.
GetElementSpace
())
{
{
}
}
template
<
class
X
,
class
Y
>
template
<
class
X
,
class
Y
>
Tensor
(
std
::
vector
<
X
>
lens
,
std
::
vector
<
Y
>
strides
)
Tensor
(
std
::
vector
<
X
>
lens
,
std
::
vector
<
Y
>
strides
)
:
mDesc
(
DataType
<
T
>
{},
lens
,
strides
),
mData
(
mDesc
.
GetElementSpace
())
:
mDesc
(
lens
,
strides
),
mData
(
mDesc
.
GetElementSpace
())
{
{
}
}
Tensor
(
const
TensorDescriptor
&
desc
)
:
mDesc
(
desc
),
mData
(
mDesc
.
GetElementSpace
())
{}
template
<
class
G
>
template
<
class
G
>
void
GenerateTensorValue
(
G
g
,
std
::
size_t
num_thread
=
1
)
void
GenerateTensorValue
(
G
g
,
std
::
size_t
num_thread
=
1
)
{
{
...
...
src/tensor.cpp
View file @
1b648f2f
...
@@ -3,16 +3,13 @@
...
@@ -3,16 +3,13 @@
#include "tensor.hpp"
#include "tensor.hpp"
TensorDescriptor
::
TensorDescriptor
(
DataType_t
t
,
std
::
initializer_list
<
std
::
size_t
>
lens
)
TensorDescriptor
::
TensorDescriptor
(
std
::
initializer_list
<
std
::
size_t
>
lens
)
:
mLens
(
lens
)
:
mLens
(
lens
),
mDataType
(
t
)
{
{
this
->
CalculateStrides
();
this
->
CalculateStrides
();
}
}
TensorDescriptor
::
TensorDescriptor
(
DataType_t
t
,
TensorDescriptor
::
TensorDescriptor
(
std
::
vector
<
std
::
size_t
>
lens
,
std
::
vector
<
std
::
size_t
>
strides
)
std
::
vector
<
std
::
size_t
>
lens
,
:
mLens
(
lens
),
mStrides
(
strides
)
std
::
vector
<
std
::
size_t
>
strides
)
:
mLens
(
lens
),
mStrides
(
strides
),
mDataType
(
t
)
{
{
}
}
...
@@ -28,8 +25,6 @@ void TensorDescriptor::CalculateStrides()
...
@@ -28,8 +25,6 @@ void TensorDescriptor::CalculateStrides()
mLens
.
rbegin
(),
mLens
.
rend
()
-
1
,
mStrides
.
rbegin
()
+
1
,
std
::
multiplies
<
std
::
size_t
>
());
mLens
.
rbegin
(),
mLens
.
rend
()
-
1
,
mStrides
.
rbegin
()
+
1
,
std
::
multiplies
<
std
::
size_t
>
());
}
}
DataType_t
TensorDescriptor
::
GetDataType
()
const
{
return
mDataType
;
}
std
::
size_t
TensorDescriptor
::
GetDimension
()
const
{
return
mLens
.
size
();
}
std
::
size_t
TensorDescriptor
::
GetDimension
()
const
{
return
mLens
.
size
();
}
std
::
size_t
TensorDescriptor
::
GetElementSize
()
const
std
::
size_t
TensorDescriptor
::
GetElementSize
()
const
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment