Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
6a45afba
"...composable_kernel-1.git" did not exist on "acbd7bd7c5efd17b7061157a5868e28acc04d33e"
Commit
6a45afba
authored
Nov 04, 2018
by
Chao Liu
Browse files
conv: update tensorDesc calculation
parent
1b648f2f
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
125 additions
and
55 deletions
+125
-55
driver/conv.cu
driver/conv.cu
+82
-9
src/include/constant_direct_convolution.cuh
src/include/constant_direct_convolution.cuh
+43
-46
No files found.
driver/conv.cu
View file @
6a45afba
...
@@ -111,9 +111,10 @@ void host_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& out)
...
@@ -111,9 +111,10 @@ void host_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& out)
f_par
(
std
::
thread
::
hardware_concurrency
());
f_par
(
std
::
thread
::
hardware_concurrency
());
}
}
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
#if 0
template <class T>
void device_convolution(
void device_convolution(
InDesc
,
const
Tensor
<
T
>&
in
,
WeiDesc
,
const
Tensor
<
T
>&
wei
,
OutDesc
,
Tensor
<
T
>&
out
)
const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& out)
{
{
DeviceTensorDescriptor<4> in_desc_device(in.mDesc);
DeviceTensorDescriptor<4> in_desc_device(in.mDesc);
DeviceTensorDescriptor<4> wei_desc_device(wei.mDesc);
DeviceTensorDescriptor<4> wei_desc_device(wei.mDesc);
...
@@ -144,7 +145,7 @@ void device_convolution(
...
@@ -144,7 +145,7 @@ void device_convolution(
dim3 block_dim(64, 1, 1);
dim3 block_dim(64, 1, 1);
dim3 grid_dim(1, 1, 1);
dim3 grid_dim(1, 1, 1);
#if 0
gridwise_convolution<T, 3, 3, 4, 4, 2, 2, 1, 1, 8, 8, 1>
gridwise_convolution<T, 3, 3, 4, 4, 2, 2, 1, 1, 8, 8, 1>
<<<grid_dim, block_dim>>>(in_desc_device,
<<<grid_dim, block_dim>>>(in_desc_device,
static_cast<T*>(in_device_buf.GetDeviceBuffer()),
static_cast<T*>(in_device_buf.GetDeviceBuffer()),
...
@@ -152,19 +153,78 @@ void device_convolution(
...
@@ -152,19 +153,78 @@ void device_convolution(
static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
out_desc_device,
out_desc_device,
static_cast<T*>(out_device_buf.GetDeviceBuffer()));
static_cast<T*>(out_device_buf.GetDeviceBuffer()));
checkCudaErrors(cudaGetLastError());
out_device_buf.FromDevice(out.mData.data());
}
#else
#else
gridwise_convolution
<
T
,
InDesc
,
WeiDesc
,
OutDesc
,
4
,
4
,
2
,
2
,
1
,
1
,
8
,
8
,
1
>
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
const_device_convolution
(
InDesc
,
const
Tensor
<
T
>&
in
,
WeiDesc
,
const
Tensor
<
T
>&
wei
,
OutDesc
,
Tensor
<
T
>&
out
)
{
std
::
size_t
data_sz
=
sizeof
(
T
);
DeviceMem
in_device_buf
(
data_sz
*
in
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_device_buf
(
data_sz
*
wei
.
mDesc
.
GetElementSpace
());
DeviceMem
out_device_buf
(
data_sz
*
out
.
mDesc
.
GetElementSpace
());
int
num_thread
=
std
::
thread
::
hardware_concurrency
();
out
.
GenerateTensorValue
(
GeneratorConstant
<
float
>
{
0
},
num_thread
);
in_device_buf
.
ToDevice
(
in
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
out_device_buf
.
ToDevice
(
out
.
mData
.
data
());
dim3
block_dim
(
64
,
1
,
1
);
dim3
grid_dim
(
1
,
1
,
1
);
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
out_desc
=
OutDesc
{};
constexpr
unsigned
NPerBlock
=
1
;
constexpr
unsigned
KPerBlock
=
1
;
constexpr
unsigned
CPerBlockLoop
=
1
;
constexpr
unsigned
OutTileSizeH
=
2
;
constexpr
unsigned
OutTileSizeW
=
2
;
constexpr
unsigned
YPerBlock
=
(
out_desc
.
GetLength
(
I2
)
+
OutTileSizeH
-
1
)
/
OutTileSizeH
;
constexpr
unsigned
XPerBlock
=
(
out_desc
.
GetLength
(
I3
)
+
OutTileSizeW
-
1
)
/
OutTileSizeW
;
constexpr
unsigned
NBlockCopyLen0
=
1
;
constexpr
unsigned
NBlockCopyLen1
=
1
;
constexpr
unsigned
NBlockCopyLen2
=
1
;
constexpr
unsigned
NBlockCopyLen3
=
64
;
gridwise_convolution
<
T
,
InDesc
,
WeiDesc
,
OutDesc
,
NPerBlock
,
KPerBlock
,
CPerBlockLoop
,
OutTileSizeH
,
OutTileSizeW
,
YPerBlock
,
XPerBlock
,
NBlockCopyLen0
,
NBlockCopyLen1
,
NBlockCopyLen2
,
NBlockCopyLen3
>
<<<
grid_dim
,
block_dim
>>>
(
InDesc
{},
<<<
grid_dim
,
block_dim
>>>
(
InDesc
{},
static_cast
<
T
*>
(
in_device_buf
.
GetDeviceBuffer
()),
static_cast
<
T
*>
(
in_device_buf
.
GetDeviceBuffer
()),
WeiDesc
{},
WeiDesc
{},
static_cast
<
T
*>
(
wei_device_buf
.
GetDeviceBuffer
()),
static_cast
<
T
*>
(
wei_device_buf
.
GetDeviceBuffer
()),
OutDesc
{},
OutDesc
{},
static_cast
<
T
*>
(
out_device_buf
.
GetDeviceBuffer
()));
static_cast
<
T
*>
(
out_device_buf
.
GetDeviceBuffer
()));
#endif
checkCudaErrors
(
cudaGetLastError
());
checkCudaErrors
(
cudaGetLastError
());
out_device_buf
.
FromDevice
(
out
.
mData
.
data
());
out_device_buf
.
FromDevice
(
out
.
mData
.
data
());
}
}
#endif
int
main
()
int
main
()
{
{
...
@@ -176,14 +236,22 @@ int main()
...
@@ -176,14 +236,22 @@ int main()
constexpr
unsigned
K
=
1
;
constexpr
unsigned
K
=
1
;
constexpr
unsigned
S
=
3
;
constexpr
unsigned
S
=
3
;
constexpr
unsigned
R
=
3
;
constexpr
unsigned
R
=
3
;
#elif
0
#elif
1
constexpr
unsigned
N
=
1
;
constexpr
unsigned
N
=
1
;
constexpr
unsigned
C
=
1
;
constexpr
unsigned
C
=
1
;
constexpr
unsigned
HI
=
130
;
constexpr
unsigned
HI
=
36
;
constexpr
unsigned
WI
=
130
;
constexpr
unsigned
WI
=
36
;
constexpr
unsigned
K
=
1
;
constexpr
unsigned
K
=
1
;
constexpr
unsigned
S
=
3
;
constexpr
unsigned
S
=
3
;
constexpr
unsigned
R
=
3
;
constexpr
unsigned
R
=
3
;
#elif 0
constexpr
unsigned
N
=
1
;
constexpr
unsigned
C
=
1
;
constexpr
unsigned
HI
=
130
;
constexpr
unsigned
WI
=
130
;
constexpr
unsigned
K
=
1
;
constexpr
unsigned
S
=
3
;
constexpr
unsigned
R
=
3
;
#elif 0
#elif 0
constexpr
unsigned
N
=
3
;
constexpr
unsigned
N
=
3
;
constexpr
unsigned
C
=
16
;
constexpr
unsigned
C
=
16
;
...
@@ -214,7 +282,12 @@ int main()
...
@@ -214,7 +282,12 @@ int main()
wei
.
GenerateTensorValue
(
GeneratorTensor
<
float
>
{},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor
<
float
>
{},
num_thread
);
host_convolution
(
in
,
wei
,
out_host
);
host_convolution
(
in
,
wei
,
out_host
);
device_convolution
(
in_desc
,
in
,
wei_desc
,
wei
,
out_desc
,
out_device
);
#if 0
device_convolution(in, wei, out_device);
#else
const_device_convolution
(
in_desc
,
in
,
wei_desc
,
wei
,
out_desc
,
out_device
);
#endif
std
::
cout
<<
__func__
<<
": done"
<<
std
::
endl
;
std
::
cout
<<
__func__
<<
": done"
<<
std
::
endl
;
...
...
src/include/constant_direct_convolution.cuh
View file @
6a45afba
...
@@ -231,17 +231,8 @@ template <class TFloat,
...
@@ -231,17 +231,8 @@ template <class TFloat,
class
InDesc
,
class
InDesc
,
class
WeiDesc
,
class
WeiDesc
,
class
OutDesc
,
class
OutDesc
,
unsigned
S
,
unsigned
R
,
unsigned
InTileSizeH
,
unsigned
InTileSizeW
,
unsigned
OutTileSizeH
,
unsigned
OutTileSizeH
,
unsigned
OutTileSizeW
,
unsigned
OutTileSizeW
>
unsigned
NPerBlock
,
unsigned
KPerBlock
,
unsigned
YPerBlock
,
unsigned
XPerBlock
,
unsigned
CPerBlockLoop
>
__device__
void
blockwise_convolution
(
InDesc
,
__device__
void
blockwise_convolution
(
InDesc
,
TFloat
*
const
__restrict__
p_in
,
TFloat
*
const
__restrict__
p_in
,
WeiDesc
,
WeiDesc
,
...
@@ -258,6 +249,19 @@ __device__ void blockwise_convolution(InDesc,
...
@@ -258,6 +249,19 @@ __device__ void blockwise_convolution(InDesc,
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
out_desc
=
OutDesc
{};
constexpr
auto
out_desc
=
OutDesc
{};
constexpr
unsigned
S
=
wei_desc
.
GetLength
(
I2
);
constexpr
unsigned
R
=
wei_desc
.
GetLength
(
I3
);
constexpr
unsigned
NPerBlock
=
out_desc
.
GetLength
(
I0
);
constexpr
unsigned
KPerBlock
=
out_desc
.
GetLength
(
I1
);
constexpr
unsigned
YPerBlock
=
(
out_desc
.
GetLength
(
I2
)
+
OutTileSizeH
-
1
)
/
OutTileSizeH
;
constexpr
unsigned
XPerBlock
=
(
out_desc
.
GetLength
(
I3
)
+
OutTileSizeW
-
1
)
/
OutTileSizeW
;
constexpr
unsigned
CPerBlockLoop
=
in_desc
.
GetLength
(
I1
);
constexpr
unsigned
InTileSizeH
=
OutTileSizeH
+
S
-
1
;
constexpr
unsigned
InTileSizeW
=
OutTileSizeW
+
R
-
1
;
#if 1
#if 1
if
(
threadIdx
.
x
==
0
)
if
(
threadIdx
.
x
==
0
)
{
{
...
@@ -383,15 +387,17 @@ template <class TFloat,
...
@@ -383,15 +387,17 @@ template <class TFloat,
class
InDesc
,
class
InDesc
,
class
WeiDesc
,
class
WeiDesc
,
class
OutDesc
,
class
OutDesc
,
unsigned
InTileSizeH
,
unsigned
InTileSizeW
,
unsigned
OutTileSizeH
,
unsigned
OutTileSizeW
,
unsigned
NPerBlock
,
unsigned
NPerBlock
,
unsigned
KPerBlock
,
unsigned
KPerBlock
,
unsigned
CPerBlockLoop
,
unsigned
OutTileSizeH
,
unsigned
OutTileSizeW
,
unsigned
YPerBlock
,
unsigned
YPerBlock
,
unsigned
XPerBlock
,
unsigned
XPerBlock
,
unsigned
CPerBlockLoop
>
unsigned
NBlockCopyLen0
,
unsigned
NBlockCopyLen1
,
unsigned
NBlockCopyLen2
,
unsigned
NBlockCopyLen3
>
__global__
void
gridwise_convolution
(
InDesc
,
__global__
void
gridwise_convolution
(
InDesc
,
TFloat
*
const
__restrict__
p_in
,
TFloat
*
const
__restrict__
p_in
,
WeiDesc
,
WeiDesc
,
...
@@ -420,11 +426,10 @@ __global__ void gridwise_convolution(InDesc,
...
@@ -420,11 +426,10 @@ __global__ void gridwise_convolution(InDesc,
}
}
#endif
#endif
constexpr
unsigned
NBlockWork
=
(
in_desc
.
GetLength
(
I0
)
+
NPerBlock
-
1
)
/
NPerBlock
;
constexpr
unsigned
NBlockWork
=
(
out_desc
.
GetLength
(
I0
)
+
NPerBlock
-
1
)
/
NPerBlock
;
constexpr
unsigned
YBlockWork
=
(
in_desc
.
GetLength
(
I2
)
+
YPerBlock
-
1
)
/
YPerBlock
;
constexpr
unsigned
KBlockWork
=
(
out_desc
.
GetLength
(
I1
)
+
KPerBlock
-
1
)
/
KPerBlock
;
constexpr
unsigned
XBlockWork
=
(
in_desc
.
GetLength
(
I3
)
+
XPerBlock
-
1
)
/
XPerBlock
;
constexpr
unsigned
YBlockWork
=
(
out_desc
.
GetLength
(
I2
)
+
YPerBlock
-
1
)
/
YPerBlock
;
constexpr
unsigned
XBlockWork
=
(
out_desc
.
GetLength
(
I3
)
+
XPerBlock
-
1
)
/
XPerBlock
;
constexpr
unsigned
KBlockWork
=
(
wei_desc
.
GetLength
(
I1
)
+
KPerBlock
-
1
)
/
KPerBlock
;
const
unsigned
block_id
=
const
unsigned
block_id
=
blockIdx
.
x
+
blockIdx
.
y
*
gridDim
.
x
+
blockIdx
.
z
*
(
gridDim
.
y
*
gridDim
.
x
);
blockIdx
.
x
+
blockIdx
.
y
*
gridDim
.
x
+
blockIdx
.
z
*
(
gridDim
.
y
*
gridDim
.
x
);
...
@@ -434,6 +439,7 @@ __global__ void gridwise_convolution(InDesc,
...
@@ -434,6 +439,7 @@ __global__ void gridwise_convolution(InDesc,
CPerBlockLoop
,
CPerBlockLoop
,
YPerBlock
*
OutTileSizeH
+
S
-
1
,
YPerBlock
*
OutTileSizeH
+
S
-
1
,
XPerBlock
*
OutTileSizeW
+
R
-
1
>
{});
XPerBlock
*
OutTileSizeW
+
R
-
1
>
{});
constexpr
auto
wei_block_desc
=
constexpr
auto
wei_block_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
KPerBlock
,
CPerBlockLoop
,
S
,
R
>
{});
make_ConstantTensorDescriptor
(
Sequence
<
KPerBlock
,
CPerBlockLoop
,
S
,
R
>
{});
...
@@ -474,10 +480,10 @@ __global__ void gridwise_convolution(InDesc,
...
@@ -474,10 +480,10 @@ __global__ void gridwise_convolution(InDesc,
blockwise_4d_tensor_op
<
TFloat
,
blockwise_4d_tensor_op
<
TFloat
,
decltype
(
in_desc
),
decltype
(
in_desc
),
decltype
(
in_block_desc
),
decltype
(
in_block_desc
),
1
,
NBlockCopyLen0
,
1
,
NBlockCopyLen
1
,
1
,
NBlockCopyLen2
,
64
,
NBlockCopyLen3
,
decltype
(
f_copy
)
>
(
in_desc
,
decltype
(
f_copy
)
>
(
in_desc
,
p_in
+
in_desc
.
Get1dIndex
(
n_block_work_begin
,
p_in
+
in_desc
.
Get1dIndex
(
n_block_work_begin
,
c_block_work_begin
,
c_block_work_begin
,
...
@@ -491,10 +497,10 @@ __global__ void gridwise_convolution(InDesc,
...
@@ -491,10 +497,10 @@ __global__ void gridwise_convolution(InDesc,
blockwise_4d_tensor_op
<
TFloat
,
blockwise_4d_tensor_op
<
TFloat
,
decltype
(
wei_desc
),
decltype
(
wei_desc
),
decltype
(
wei_block_desc
),
decltype
(
wei_block_desc
),
1
,
NBlockCopyLen0
,
1
,
NBlockCopyLen
1
,
1
,
NBlockCopyLen2
,
64
,
NBlockCopyLen3
,
decltype
(
f_copy
)
>
(
decltype
(
f_copy
)
>
(
wei_desc
,
wei_desc
,
p_wei
+
wei_desc
.
Get1dIndex
(
k_block_work_begin
,
c_block_work_begin
,
0
,
0
),
p_wei
+
wei_desc
.
Get1dIndex
(
k_block_work_begin
,
c_block_work_begin
,
0
,
0
),
...
@@ -506,10 +512,10 @@ __global__ void gridwise_convolution(InDesc,
...
@@ -506,10 +512,10 @@ __global__ void gridwise_convolution(InDesc,
blockwise_4d_tensor_op
<
TFloat
,
blockwise_4d_tensor_op
<
TFloat
,
decltype
(
out_desc
),
decltype
(
out_desc
),
decltype
(
out_block_desc
),
decltype
(
out_block_desc
),
1
,
NBlockCopyLen0
,
1
,
NBlockCopyLen
1
,
1
,
NBlockCopyLen2
,
64
,
NBlockCopyLen3
,
decltype
(
f_copy
)
>
(
out_desc
,
decltype
(
f_copy
)
>
(
out_desc
,
p_out
+
out_desc
.
Get1dIndex
(
n_block_work_begin
,
p_out
+
out_desc
.
Get1dIndex
(
n_block_work_begin
,
k_block_work_begin
,
k_block_work_begin
,
...
@@ -526,17 +532,8 @@ __global__ void gridwise_convolution(InDesc,
...
@@ -526,17 +532,8 @@ __global__ void gridwise_convolution(InDesc,
decltype
(
in_block_desc
),
decltype
(
in_block_desc
),
decltype
(
wei_block_desc
),
decltype
(
wei_block_desc
),
decltype
(
out_block_desc
),
decltype
(
out_block_desc
),
S
,
R
,
InTileSizeH
,
InTileSizeW
,
OutTileSizeH
,
OutTileSizeH
,
OutTileSizeW
,
OutTileSizeW
>
(
NPerBlock
,
KPerBlock
,
YPerBlock
,
XPerBlock
,
CPerBlockLoop
>
(
in_block_desc
,
p_in_block
,
wei_block_desc
,
p_wei_block
,
out_block_desc
,
p_out_block
);
in_block_desc
,
p_in_block
,
wei_block_desc
,
p_wei_block
,
out_block_desc
,
p_out_block
);
__syncthreads
();
__syncthreads
();
...
@@ -545,10 +542,10 @@ __global__ void gridwise_convolution(InDesc,
...
@@ -545,10 +542,10 @@ __global__ void gridwise_convolution(InDesc,
blockwise_4d_tensor_op
<
TFloat
,
blockwise_4d_tensor_op
<
TFloat
,
decltype
(
out_block_desc
),
decltype
(
out_block_desc
),
decltype
(
out_desc
),
decltype
(
out_desc
),
1
,
NBlockCopyLen0
,
1
,
NBlockCopyLen
1
,
1
,
NBlockCopyLen2
,
64
,
NBlockCopyLen3
,
decltype
(
f_copy
)
>
(
out_block_desc
,
decltype
(
f_copy
)
>
(
out_block_desc
,
p_out_block
,
p_out_block
,
out_desc
,
out_desc
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment