Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
fee92fb6
Commit
fee92fb6
authored
Nov 26, 2018
by
Chao Liu
Browse files
changed direct conv
parent
24d2f034
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
454 additions
and
173 deletions
+454
-173
driver/conv.cu
driver/conv.cu
+10
-16
driver/device_direct_convolution_1.cuh
driver/device_direct_convolution_1.cuh
+9
-11
src/include/blockwise_direct_convolution.cuh
src/include/blockwise_direct_convolution.cuh
+88
-80
src/include/gridwise_direct_convolution_1.cuh
src/include/gridwise_direct_convolution_1.cuh
+16
-13
src/include/gridwise_direct_convolution_2.cuh
src/include/gridwise_direct_convolution_2.cuh
+41
-45
src/include/gridwise_winograd_convolution.cuh
src/include/gridwise_winograd_convolution.cuh
+237
-0
src/include/threadwise_direct_convolution.cuh
src/include/threadwise_direct_convolution.cuh
+4
-4
src/include/threadwise_tensor_op.cuh
src/include/threadwise_tensor_op.cuh
+49
-4
No files found.
driver/conv.cu
View file @
fee92fb6
...
...
@@ -7,26 +7,16 @@
#include "constant_tensor_descriptor.cuh"
#include "device_direct_convolution_1.cuh"
#include "device_direct_convolution_2.cuh"
//#include "device_winograd_convolution.cuh"
struct
GeneratorConstant
{
double
value
=
0
;
template
<
class
...
Is
>
double
operator
()(
Is
...)
{
return
value
;
}
};
struct
GeneratorTensor
struct
GeneratorTensor_1
{
template
<
class
...
Is
>
double
operator
()(
Is
...
is
)
{
#if
1
#if
0
return double(std::rand()) / double(RAND_MAX);
#elif
0
#elif
1
return
1
;
#elif 0
std
::
initializer_list
<
std
::
size_t
>
ls
=
{
static_cast
<
std
::
size_t
>
(
is
)...};
...
...
@@ -395,7 +385,11 @@ int main()
Tensor
<
float
>
out_host
(
make_TensorDescriptor
(
out_desc
));
Tensor
<
float
>
out_device
(
make_TensorDescriptor
(
out_desc
));
#if 1
#if 0
std::size_t num_thread = std::thread::hardware_concurrency();
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#elif
0
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
();
in
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
...
...
@@ -410,7 +404,7 @@ int main()
#endif
}
#if
1
#if
0
host_winograd_3x3_convolution(in, wei, out_host);
check_error(out_host, out_device);
#elif
0
...
...
driver/device_direct_convolution_1.cuh
View file @
fee92fb6
...
...
@@ -27,15 +27,14 @@ void device_direct_convolution_1(
constexpr
unsigned
OutTileSizeH
=
2
;
constexpr
unsigned
OutTileSizeW
=
2
;
constexpr
unsigned
NPerBlock
=
1
;
constexpr
unsigned
KPerBlock
=
4
;
constexpr
unsigned
CPerBlock
=
2
;
constexpr
unsigned
YPerBlock
=
8
;
constexpr
unsigned
KPerBlock
=
16
;
constexpr
unsigned
CPerBlock
=
4
;
constexpr
unsigned
YPerBlock
=
4
;
constexpr
unsigned
XPerBlock
=
16
;
constexpr
unsigned
NBlockOpLen0
=
1
;
constexpr
unsigned
NBlockOpLen1
=
1
;
constexpr
unsigned
NBlockOpLen2
=
4
;
constexpr
unsigned
NBlockOpLen3
=
32
;
constexpr
unsigned
NPerThread
=
1
;
constexpr
unsigned
KPerThread
=
4
;
constexpr
unsigned
CPerThread
=
2
;
constexpr
unsigned
BlockSize
=
128
;
...
...
@@ -66,10 +65,9 @@ void device_direct_convolution_1(
CPerBlock
,
YPerBlock
,
XPerBlock
,
NBlockOpLen0
,
NBlockOpLen1
,
NBlockOpLen2
,
NBlockOpLen3
,
NPerThread
,
KPerThread
,
CPerThread
,
BlockSize
,
GridSize
>
<<<
grid_dim
,
block_dim
>>>
(
InDesc
{},
...
...
src/include/blockwise_direct_convolution.cuh
View file @
fee92fb6
...
...
@@ -9,8 +9,11 @@ template <class TFloat,
class
OutBlockDesc
,
unsigned
OutTileSizeH
,
unsigned
OutTileSizeW
,
unsigned
NPerThread
,
unsigned
KPerThread
,
unsigned
CPerThread
,
unsigned
BlockSize
>
__device__
void
blockwise_convolution
(
InBlockDesc
,
__device__
void
blockwise_
direct_
convolution
(
InBlockDesc
,
TFloat
*
const
__restrict__
p_in_block
,
WeiBlockDesc
,
TFloat
*
const
__restrict__
p_wei_block
,
...
...
@@ -29,16 +32,17 @@ __device__ void blockwise_convolution(InBlockDesc,
constexpr
unsigned
S
=
wei_block_desc
.
GetLength
(
I2
);
constexpr
unsigned
R
=
wei_block_desc
.
GetLength
(
I3
);
constexpr
unsigned
NPerBlock
=
out_block_desc
.
GetLength
(
I0
);
constexpr
unsigned
KPerBlock
=
out_block_desc
.
GetLength
(
I1
);
constexpr
unsigned
YPerBlock
=
(
out_block_desc
.
GetLength
(
I2
)
+
OutTileSizeH
-
1
)
/
OutTileSizeH
;
constexpr
unsigned
XPerBlock
=
(
out_block_desc
.
GetLength
(
I3
)
+
OutTileSizeW
-
1
)
/
OutTileSizeW
;
constexpr
unsigned
CPerBlock
=
in_block_desc
.
GetLength
(
I1
);
constexpr
unsigned
InTileSizeH
=
OutTileSizeH
+
S
-
1
;
constexpr
unsigned
InTileSizeW
=
OutTileSizeW
+
R
-
1
;
// divide thread work
constexpr
unsigned
NThreadWork
=
(
out_block_desc
.
GetLength
(
I0
)
+
NPerThread
-
1
)
/
NPerThread
;
constexpr
unsigned
KThreadWork
=
(
out_block_desc
.
GetLength
(
I1
)
+
KPerThread
-
1
)
/
KPerThread
;
constexpr
unsigned
YThreadWork
=
(
out_block_desc
.
GetLength
(
I2
)
+
OutTileSizeH
-
1
)
/
OutTileSizeH
;
constexpr
unsigned
XThreadWork
=
(
out_block_desc
.
GetLength
(
I3
)
+
OutTileSizeW
-
1
)
/
OutTileSizeW
;
#if 0
if(threadIdx.x == 0)
{
...
...
@@ -48,90 +52,94 @@ __device__ void blockwise_convolution(InBlockDesc,
}
#endif
constexpr
auto
in_thread_
src_
desc
=
make_ConstantTensorDescriptor
(
Sequence
<
1
,
CPerBlock
,
InTileSizeH
,
InTileSizeW
>
{}
,
in_block_desc
.
GetStrides
()
);
constexpr
auto
in_thread_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
NPerThread
,
CPerThread
,
InTileSizeH
,
InTileSizeW
>
{});
constexpr
auto
wei_thread_
src_
desc
=
make_ConstantTensorDescriptor
(
Sequence
<
1
,
CPerBlock
,
S
,
R
>
{},
wei_block_desc
.
GetStrides
()
);
constexpr
auto
wei_thread_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
KPerThread
,
CPerThread
,
S
,
R
>
{}
);
constexpr
auto
out_thread_
src_
desc
=
make_ConstantTensorDescriptor
(
Sequence
<
1
,
1
,
OutTileSizeH
,
OutTileSizeW
>
{}
,
out_block_desc
.
GetStrides
()
);
constexpr
auto
out_thread_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
NPerThread
,
KPerThread
,
OutTileSizeH
,
OutTileSizeW
>
{});
constexpr
auto
in_thread_
dst
_desc
=
make_ConstantTensorDescriptor
(
in_thread_
src_
desc
.
GetLengths
());
constexpr
auto
in_thread_
block
_desc
=
make_ConstantTensorDescriptor
(
in_thread_desc
.
GetLengths
()
,
in_block_desc
.
GetStrides
()
);
constexpr
auto
wei_thread_
dst
_desc
=
make_ConstantTensorDescriptor
(
wei_thread_
src_
desc
.
GetLengths
());
constexpr
auto
wei_thread_
block
_desc
=
make_ConstantTensorDescriptor
(
wei_thread_desc
.
GetLengths
()
,
wei_block_desc
.
GetStrides
()
);
constexpr
auto
out_thread_
dst
_desc
=
make_ConstantTensorDescriptor
(
out_thread_
src_
desc
.
GetLengths
());
constexpr
auto
out_thread_
block
_desc
=
make_ConstantTensorDescriptor
(
out_thread_desc
.
GetLengths
()
,
out_block_desc
.
GetStrides
()
);
const
unsigned
thread_id
=
threadIdx
.
x
;
for
(
unsigned
thread_work_id
=
thread_id
;
thread_work_id
<
NPerBlock
*
YPerBlock
*
XPerBlock
;
for
(
unsigned
thread_work_id
=
thread_id
;
thread_work_id
<
NThreadWork
*
KThreadWork
*
YThreadWork
*
XThreadWork
;
thread_work_id
+=
BlockSize
)
{
unsigned
itmp
=
thread_work_id
;
unsigned
n_thread_work_id
=
itmp
/
(
YPerBlock
*
XPerBlock
);
itmp
-=
n_thread_work_id
*
(
YPerBlock
*
XPerBlock
);
unsigned
y_thread_work_id
=
itmp
/
XPerBlock
;
unsigned
x_thread_work_id
=
itmp
-
y_thread_work_id
*
XPerBlock
;
unsigned
n_thread_work_begin
=
n_thread_work_id
*
1
;
unsigned
ho_thread_work_begin
=
y_thread_work_id
*
OutTileSizeH
;
unsigned
wo_thread_work_begin
=
x_thread_work_id
*
OutTileSizeW
;
unsigned
hi_thread_work_begin
=
ho_thread_work_begin
;
// minus padding
unsigned
wi_thread_work_begin
=
wo_thread_work_begin
;
// minus padding
TFloat
p_in_thread
[
in_thread_src_desc
.
GetElementSpace
()];
TFloat
p_wei_thread
[
wei_thread_src_desc
.
GetElementSpace
()];
TFloat
p_out_thread
[
out_thread_src_desc
.
GetElementSpace
()];
unsigned
n_thread_work_id
=
itmp
/
(
KThreadWork
*
YThreadWork
*
XThreadWork
);
itmp
-=
n_thread_work_id
*
(
KThreadWork
*
YThreadWork
*
XThreadWork
);
unsigned
k_thread_work_id
=
itmp
/
(
YThreadWork
*
XThreadWork
);
itmp
-=
k_thread_work_id
*
(
YThreadWork
*
XThreadWork
);
unsigned
y_thread_work_id
=
itmp
/
XThreadWork
;
unsigned
x_thread_work_id
=
itmp
-
y_thread_work_id
*
XThreadWork
;
unsigned
n_thread_data_begin
=
n_thread_work_id
*
NPerThread
;
unsigned
k_thread_data_begin
=
k_thread_work_id
*
KPerThread
;
unsigned
ho_thread_data_begin
=
y_thread_work_id
*
OutTileSizeH
;
unsigned
wo_thread_data_begin
=
x_thread_work_id
*
OutTileSizeW
;
unsigned
hi_thread_data_begin
=
ho_thread_data_begin
;
// minus padding
unsigned
wi_thread_data_begin
=
wo_thread_data_begin
;
// minus padding
TFloat
p_in_thread
[
in_thread_desc
.
GetElementSpace
()];
TFloat
p_wei_thread
[
wei_thread_desc
.
GetElementSpace
()];
TFloat
p_out_thread
[
out_thread_desc
.
GetElementSpace
()];
threadwise_4d_tensor_copy
(
out_thread_block_desc
,
p_out_block
+
out_block_desc
.
Get1dIndex
(
n_thread_data_begin
,
k_thread_data_begin
,
ho_thread_data_begin
,
wo_thread_data_begin
),
out_thread_desc
,
p_out_thread
);
// copy input tensor into register
threadwise_4d_tensor_copy
(
in_thread_src_desc
,
p_in_block
+
in_block_desc
.
Get1dIndex
(
n_thread_work_begin
,
0
,
hi_thread_work_begin
,
wi_thread_work_begin
),
in_thread_dst_desc
,
for
(
unsigned
c_thread_data_begin
=
0
;
c_thread_data_begin
<
in_block_desc
.
GetLength
(
I1
);
c_thread_data_begin
+=
CPerThread
)
{
// copy input into register
threadwise_4d_tensor_copy
(
in_thread_block_desc
,
p_in_block
+
in_block_desc
.
Get1dIndex
(
n_thread_data_begin
,
c_thread_data_begin
,
hi_thread_data_begin
,
wi_thread_data_begin
),
in_thread_desc
,
p_in_thread
);
for
(
unsigned
k_thread_work_begin
=
0
;
k_thread_work_begin
<
KPerBlock
;
++
k_thread_work_begin
)
{
// copy weight tensor into register
threadwise_4d_tensor_copy
(
wei_thread_src_desc
,
// copy weight into register
threadwise_4d_tensor_copy
(
wei_thread_block_desc
,
p_wei_block
+
wei_block_desc
.
Get1dIndex
(
k_thread_
work
_begin
,
0
,
0
,
0
),
wei_thread_
dst_
desc
,
wei_block_desc
.
Get1dIndex
(
k_thread_
data
_begin
,
c_thread_data_begin
,
0
,
0
),
wei_thread_desc
,
p_wei_thread
);
// copy output tensor into register
threadwise_4d_tensor_copy
(
out_thread_src_desc
,
p_out_block
+
out_block_desc
.
Get1dIndex
(
n_thread_work_begin
,
k_thread_work_begin
,
ho_thread_work_begin
,
wo_thread_work_begin
),
out_thread_dst_desc
,
p_out_thread
);
// threadwise convolution
threadwise_direct_convolution
(
in_thread_
dst_
desc
,
threadwise_direct_convolution
(
in_thread_desc
,
p_in_thread
,
wei_thread_
dst_
desc
,
wei_thread_desc
,
p_wei_thread
,
out_thread_
dst_
desc
,
out_thread_desc
,
p_out_thread
);
}
// accumulate output tensor
into LDS
threadwise_4d_tensor_copy
(
out_thread_
dst_
desc
,
// copy output
into LDS
threadwise_4d_tensor_copy
(
out_thread_desc
,
p_out_thread
,
out_thread_src_desc
,
p_out_block
+
out_block_desc
.
Get1dIndex
(
n_thread_work_begin
,
k_thread_work_begin
,
ho_thread_work_begin
,
wo_thread_work_begin
));
}
out_thread_block_desc
,
p_out_block
+
out_block_desc
.
Get1dIndex
(
n_thread_data_begin
,
k_thread_data_begin
,
ho_thread_data_begin
,
wo_thread_data_begin
));
}
}
src/include/gridwise_direct_convolution_1.cuh
View file @
fee92fb6
...
...
@@ -14,10 +14,9 @@ template <class TFloat,
unsigned
CPerBlock
,
unsigned
YPerBlock
,
unsigned
XPerBlock
,
unsigned
NBlockOpLen0
,
unsigned
NBlockOpLen1
,
unsigned
NBlockOpLen2
,
unsigned
NBlockOpLen3
,
unsigned
NPerThread
,
unsigned
KPerThread
,
unsigned
CPerThread
,
unsigned
BlockSize
,
unsigned
GridSize
>
__global__
void
gridwise_direct_convolution_1
(
InGlobalDesc
,
...
...
@@ -125,9 +124,8 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
p_out_block
);
for
(
unsigned
c_block_work_begin
=
0
;
c_block_work_begin
<
in_global_desc
.
GetLength
(
I1
);
c_block_work_begin
+=
CPerBlock
,
__syncthreads
()
)
c_block_work_begin
+=
CPerBlock
)
{
// copy input tensor to LDS
blockwise_4d_tensor_copy
<
TFloat
,
decltype
(
in_block_src_desc
),
...
...
@@ -154,14 +152,19 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
__syncthreads
();
// blockwise convolution
blockwise_convolution
<
TFloat
,
blockwise_
direct_
convolution
<
TFloat
,
decltype
(
in_block_desc
),
decltype
(
wei_block_desc
),
decltype
(
out_block_desc
),
OutTileSizeH
,
OutTileSizeW
,
NPerThread
,
KPerThread
,
CPerThread
,
BlockSize
>
(
in_block_desc
,
p_in_block
,
wei_block_desc
,
p_wei_block
,
out_block_desc
,
p_out_block
);
__syncthreads
();
}
// copy output tensor from LDS to device mem
...
...
src/include/gridwise_direct_convolution_2.cuh
View file @
fee92fb6
...
...
@@ -108,16 +108,16 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
const
unsigned
y_block_work_id
=
itmp
/
XBlockWork
;
const
unsigned
x_block_work_id
=
itmp
-
y_block_work_id
*
XBlockWork
;
const
unsigned
n_block_data_
offset
=
n_block_work_id
*
NPerBlock
;
const
unsigned
k_block_data_
offset
=
k_block_work_id
*
KPerBlock
;
const
unsigned
y_block_data_
offset
=
y_block_work_id
*
YPerBlock
;
const
unsigned
x_block_data_
offset
=
x_block_work_id
*
XPerBlock
;
const
unsigned
n_block_data_
begin
=
n_block_work_id
*
NPerBlock
;
const
unsigned
k_block_data_
begin
=
k_block_work_id
*
KPerBlock
;
const
unsigned
y_block_data_
begin
=
y_block_work_id
*
YPerBlock
;
const
unsigned
x_block_data_
begin
=
x_block_work_id
*
XPerBlock
;
const
unsigned
ho_block_data_
offset
=
y_block_data_
offset
*
OutTileSizeH
;
const
unsigned
wo_block_data_
offset
=
x_block_data_
offset
*
OutTileSizeW
;
const
unsigned
ho_block_data_
begin
=
y_block_data_
begin
*
OutTileSizeH
;
const
unsigned
wo_block_data_
begin
=
x_block_data_
begin
*
OutTileSizeW
;
const
unsigned
hi_block_data_
offset
=
ho_block_data_
offset
;
// minus padding
const
unsigned
wi_block_data_
offset
=
wo_block_data_
offset
;
// minus padding
const
unsigned
hi_block_data_
begin
=
ho_block_data_
begin
;
// minus padding
const
unsigned
wi_block_data_
begin
=
wo_block_data_
begin
;
// minus padding
// divide thread work
constexpr
unsigned
NThreadWork
=
(
NPerBlock
+
NPerThread
-
1
)
/
NPerThread
;
...
...
@@ -135,13 +135,13 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
const
unsigned
y_thread_work_id
=
itmp
/
XThreadWork
;
const
unsigned
x_thread_work_id
=
itmp
-
y_thread_work_id
*
XThreadWork
;
const
unsigned
n_thread_data_
offset
=
n_thread_work_id
*
NPerThread
;
const
unsigned
k_thread_data_
offset
=
k_thread_work_id
*
KPerThread
;
const
unsigned
ho_thread_data_
offset
=
y_thread_work_id
*
OutTileSizeH
;
const
unsigned
wo_thread_data_
offset
=
x_thread_work_id
*
OutTileSizeW
;
const
unsigned
n_thread_data_
begin
=
n_thread_work_id
*
NPerThread
;
const
unsigned
k_thread_data_
begin
=
k_thread_work_id
*
KPerThread
;
const
unsigned
ho_thread_data_
begin
=
y_thread_work_id
*
OutTileSizeH
;
const
unsigned
wo_thread_data_
begin
=
x_thread_work_id
*
OutTileSizeW
;
const
unsigned
hi_thread_data_
offset
=
ho_thread_data_
offset
;
const
unsigned
wi_thread_data_
offset
=
wo_thread_data_
offset
;
const
unsigned
hi_thread_data_
begin
=
ho_thread_data_
begin
;
const
unsigned
wi_thread_data_
begin
=
wo_thread_data_
begin
;
#if 0
if(threadIdx.x == 0)
...
...
@@ -152,20 +152,20 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
}
printf("threadIdx.x %u \t"
"n_thread_data_
offset
%u, k_thread_data_
offset
%u, ho_thread_data_
offset
%u, "
"wo_thread_data_
offset
%u\n",
"n_thread_data_
begin
%u, k_thread_data_
begin
%u, ho_thread_data_
begin
%u, "
"wo_thread_data_
begin
%u\n",
threadIdx.x,
n_thread_data_
offset
,
k_thread_data_
offset
,
ho_thread_data_
offset
,
wo_thread_data_
offset
);
n_thread_data_
begin
,
k_thread_data_
begin
,
ho_thread_data_
begin
,
wo_thread_data_
begin
);
#endif
// set threadwise output tensor to 0
threadwise_4d_tensor_set_zero
(
out_thread_desc
,
p_out_thread
);
for
(
unsigned
c_block_data_
offset
=
0
;
c_block_data_
offset
<
in_global_desc
.
GetLength
(
I1
);
c_block_data_
offset
+=
CPerBlock
,
__syncthreads
())
for
(
unsigned
c_block_data_
begin
=
0
;
c_block_data_
begin
<
in_global_desc
.
GetLength
(
I1
);
c_block_data_
begin
+=
CPerBlock
,
__syncthreads
())
{
// copy input tensor to LDS
blockwise_4d_tensor_copy
<
TFloat
,
...
...
@@ -173,10 +173,10 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
decltype
(
in_block_desc
),
BlockSize
>
(
in_block_global_desc
,
p_in_global
+
in_global_desc
.
Get1dIndex
(
n_block_data_
offset
,
c_block_data_
offset
,
hi_block_data_
offset
,
wi_block_data_
offset
),
in_global_desc
.
Get1dIndex
(
n_block_data_
begin
,
c_block_data_
begin
,
hi_block_data_
begin
,
wi_block_data_
begin
),
in_block_desc
,
p_in_block
);
...
...
@@ -186,8 +186,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
decltype
(
wei_block_desc
),
BlockSize
>
(
wei_block_global_desc
,
p_wei_global
+
wei_global_desc
.
Get1dIndex
(
k_block_data_offset
,
c_block_data_offset
,
0
,
0
),
p_wei_global
+
wei_global_desc
.
Get1dIndex
(
k_block_data_begin
,
c_block_data_begin
,
0
,
0
),
wei_block_desc
,
p_wei_block
);
...
...
@@ -197,25 +196,22 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
{
// copy input tensor into register
threadwise_4d_tensor_copy
(
in_thread_block_desc
,
p_in_block
+
in_block_desc
.
Get1dIndex
(
n_thread_data_
offset
,
p_in_block
+
in_block_desc
.
Get1dIndex
(
n_thread_data_
begin
,
c_thread_data
,
hi_thread_data_
offset
,
wi_thread_data_
offset
),
hi_thread_data_
begin
,
wi_thread_data_
begin
),
in_thread_desc
,
p_in_thread
);
// copy weight tensor into register
threadwise_4d_tensor_copy
(
wei_thread_block_desc
,
p_wei_block
+
wei_block_desc
.
Get1dIndex
(
k_thread_data_
offset
,
c_thread_data
,
0
,
0
),
p_wei_block
+
wei_block_desc
.
Get1dIndex
(
k_thread_data_
begin
,
c_thread_data
,
0
,
0
),
wei_thread_desc
,
p_wei_thread
);
// threadwise convolution
threadwise_direct_convolution
<
TFloat
,
decltype
(
in_thread_desc
),
decltype
(
wei_thread_desc
),
decltype
(
out_thread_desc
)
>
(
in_thread_desc
,
threadwise_direct_convolution
(
in_thread_desc
,
p_in_thread
,
wei_thread_desc
,
p_wei_thread
,
...
...
@@ -229,8 +225,8 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
out_thread_desc
,
p_out_thread
,
out_thread_global_desc
,
p_out_global
+
out_global_desc
.
Get1dIndex
(
n_block_data_
offset
+
n_thread_data_
offset
,
k_block_data_
offset
+
k_thread_data_
offset
,
ho_block_data_
offset
+
ho_thread_data_
offset
,
wo_block_data_
offset
+
wo_thread_data_
offset
));
p_out_global
+
out_global_desc
.
Get1dIndex
(
n_block_data_
begin
+
n_thread_data_
begin
,
k_block_data_
begin
+
k_thread_data_
begin
,
ho_block_data_
begin
+
ho_thread_data_
begin
,
wo_block_data_
begin
+
wo_thread_data_
begin
));
}
src/include/gridwise_winograd_convolution.cuh
0 → 100644
View file @
fee92fb6
#pragma once
#include "constant_tensor_descriptor.cuh"
#include "blockwise_winograd_transform.cuh"
#include "threadwise_winograd_transform.cuh"
template
<
class
TFloat
,
class
InGlobalDesc
,
class
WeiGlobalDesc
,
class
OutGlobalDesc
,
unsigned
OutTileSizeH
,
unsigned
OutTileSizeW
,
unsigned
NPerBlock
,
unsigned
KPerBlock
,
unsigned
CPerBlock
,
unsigned
YPerBlock
,
unsigned
XPerBlock
,
unsigned
NPerThread
,
unsigned
KPerThread
,
unsigned
CPerThread
,
unsigned
BlockSize
,
unsigned
GridSize
>
__global__
void
gridwise_winograd_convolution
(
InGlobalDesc
,
TFloat
*
const
__restrict__
p_in_global
,
WeiGlobalDesc
,
TFloat
*
const
__restrict__
p_wei_global
,
OutGlobalDesc
,
TFloat
*
__restrict__
p_out_global
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
in_global_desc
=
InGlobalDesc
{};
constexpr
auto
wei_global_desc
=
WeiGlobalDesc
{};
constexpr
auto
out_global_desc
=
OutGlobalDesc
{};
constexpr
unsigned
S
=
wei_global_desc
.
GetLength
(
I2
);
constexpr
unsigned
R
=
wei_global_desc
.
GetLength
(
I3
);
constexpr
unsigned
HoPerBlock
=
OutTileSizeH
*
YPerBlock
;
constexpr
unsigned
WoPerBlock
=
OutTileSizeW
*
XPerBlock
;
constexpr
unsigned
HiPerBlock
=
YPerBlock
*
OutTileSizeH
+
S
-
1
;
constexpr
unsigned
WiPerBlock
=
XPerBlock
*
OutTileSizeW
+
R
-
1
;
constexpr
unsigned
InTileSizeH
=
OutTileSizeH
+
S
-
1
;
constexpr
unsigned
InTileSizeW
=
OutTileSizeW
+
R
-
1
;
// divide block work
constexpr
unsigned
NBlockWork
=
(
out_global_desc
.
GetLength
(
I0
)
+
NPerBlock
-
1
)
/
NPerBlock
;
constexpr
unsigned
KBlockWork
=
(
out_global_desc
.
GetLength
(
I1
)
+
KPerBlock
-
1
)
/
KPerBlock
;
constexpr
unsigned
YBlockWork
=
(
out_global_desc
.
GetLength
(
I2
)
+
HoPerBlock
-
1
)
/
HoPerBlock
;
constexpr
unsigned
XBlockWork
=
(
out_global_desc
.
GetLength
(
I3
)
+
WoPerBlock
-
1
)
/
WoPerBlock
;
const
unsigned
block_id
=
blockIdx
.
x
;
unsigned
itmp
=
block_id
;
const
unsigned
n_block_work_id
=
itmp
/
(
KBlockWork
*
YBlockWork
*
XBlockWork
);
itmp
-=
n_block_work_id
*
(
KBlockWork
*
YBlockWork
*
XBlockWork
);
const
unsigned
k_block_work_id
=
itmp
/
(
YBlockWork
*
XBlockWork
);
itmp
-=
k_block_work_id
*
(
YBlockWork
*
XBlockWork
);
const
unsigned
y_block_work_id
=
itmp
/
XBlockWork
;
const
unsigned
x_block_work_id
=
itmp
-
y_block_work_id
*
XBlockWork
;
const
unsigned
n_block_data_begin
=
n_block_work_id
*
NPerBlock
;
const
unsigned
k_block_data_begin
=
k_block_work_id
*
KPerBlock
;
const
unsigned
y_block_data_begin
=
y_block_work_id
*
YPerBlock
;
const
unsigned
x_block_data_begin
=
x_block_work_id
*
XPerBlock
;
const
unsigned
ho_block_data_begin
=
y_block_data_begin
*
OutTileSizeH
;
const
unsigned
wo_block_data_begin
=
x_block_data_begin
*
OutTileSizeW
;
const
unsigned
hi_block_data_begin
=
ho_block_data_begin
;
// minus padding
const
unsigned
wi_block_data_begin
=
wo_block_data_begin
;
// minus padding
// divide thread work
constexpr
unsigned
NThreadWork
=
(
NPerBlock
+
NPerThread
-
1
)
/
NPerThread
;
constexpr
unsigned
KThreadWork
=
(
KPerBlock
+
KPerThread
-
1
)
/
KPerThread
;
constexpr
unsigned
YThreadWork
=
YPerBlock
;
constexpr
unsigned
XThreadWork
=
XPerBlock
;
const
unsigned
thread_id
=
threadIdx
.
x
;
itmp
=
thread_id
;
const
unsigned
n_thread_work_id
=
itmp
/
(
KThreadWork
*
YThreadWork
*
XThreadWork
);
itmp
-=
n_thread_work_id
*
(
KThreadWork
*
YThreadWork
*
XThreadWork
);
const
unsigned
k_thread_work_id
=
itmp
/
(
YThreadWork
*
XThreadWork
);
itmp
-=
k_thread_work_id
*
(
YThreadWork
*
XThreadWork
);
const
unsigned
y_thread_work_id
=
itmp
/
XThreadWork
;
const
unsigned
x_thread_work_id
=
itmp
-
y_thread_work_id
*
XThreadWork
;
const
unsigned
n_thread_data_begin
=
n_thread_work_id
*
NPerThread
;
const
unsigned
k_thread_data_begin
=
k_thread_work_id
*
KPerThread
;
const
unsigned
y_thread_data_begin
=
y_thread_work_id
;
const
unsigned
x_thread_data_begin
=
x_thread_work_id
;
// block data
constexpr
auto
in_transform_block_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
NPerBlock
,
CPerBlock
,
YPerBlock
*
InTileSizeH
,
XPerBlock
*
InTileSizeW
>
{});
constexpr
auto
wei_transform_block_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
KPerBlock
,
CPerBlock
,
InTileSizeH
,
InTileSizeW
>
{});
constexpr
unsigned
in_transform_block_size
=
in_transform_block_desc
.
GetElementSpace
();
constexpr
unsigned
wei_transform_block_size
=
wei_transform_block_desc
.
GetElementSpace
();
__shared__
TFloat
p_in_transform_block
[
in_transform_block_size
];
__shared__
TFloat
p_wei_transform_block
[
wei_transform_block_size
];
// thread data
constexpr
auto
in_transform_thread_block_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
NPerThread
,
CPerThread
,
InTileSizeH
,
InTileSizeW
>
{},
in_transform_block_desc
.
GetStrides
());
constexpr
auto
wei_transform_thread_block_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
KPerThread
,
CPerThread
,
InTileSizeH
,
InTileSizeW
>
{},
wei_transform_block_desc
.
GetStrides
());
constexpr
auto
out_transform_thread_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
NPerThread
,
KPerThread
,
InTileSizeH
,
InTileSizeW
>
{});
constexpr
auto
out_thread_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
NPerThread
,
KPerThread
,
OutTileSizeH
,
OutTileSizeW
>
{});
constexpr
auto
out_thread_global_desc
=
make_ConstantTensorDescriptor
(
out_thread_desc
.
GetLengths
(),
out_global_desc
.
GetStrides
());
constexpr
unsigned
out_transform_thread_size
=
out_transform_thread_desc
.
GetElementSpace
();
constexpr
unsigned
out_thread_size
=
out_thread_desc
.
GetElementSpace
();
TFloat
p_out_transform_thread
[
out_transform_thread_size
];
TFloat
p_out_thread
[
out_thread_size
];
#if 0
if(blockIdx.x == 0 && threadIdx.x == 0)
{
printf("in_transform_block_size %u, wei_transform_block_size %u, out_transform_thread_size "
"%u, out_thread_size %u \n",
in_transform_block_size,
wei_transform_block_size,
out_transform_thread_size,
out_thread_size);
}
#endif
// set threadwise output transform tensor to 0
threadwise_4d_tensor_set_zero
(
out_transform_thread_desc
,
p_out_transform_thread
);
for
(
unsigned
c_block_data_begin
=
0
;
c_block_data_begin
<
in_global_desc
.
GetLength
(
I1
);
c_block_data_begin
+=
CPerBlock
,
__syncthreads
())
{
#if 0
// blockwise transform input
blockwise_winograd_transform_input<TFloat,
InTileSizeH,
InTileSizeW,
S,
R,
OutTileSizeH,
OutTileSizeW,
NPerBlock,
CPerBlock,
YPerBlock,
XPerBlock,
BlockSize>(
p_in_global + in_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin,
hi_block_data_begin,
wi_block_data_begin),
p_in_transform_block);
#endif
// blockwise transform weights
blockwise_winograd_transform_weight
<
TFloat
,
InTileSizeH
,
InTileSizeW
,
S
,
R
,
OutTileSizeH
,
OutTileSizeW
,
KPerBlock
,
CPerBlock
,
BlockSize
>
(
p_wei_global
+
wei_global_desc
.
Get1dIndex
(
k_block_data_begin
,
c_block_data_begin
,
0
,
0
),
p_wei_transform_block
);
for
(
unsigned
c_thread_data
=
0
;
c_thread_data
<
CPerBlock
;
c_thread_data
+=
CPerThread
)
{
// threadwise point multiplication
threadwise_winograd_calculate_transformed_output
<
TFloat
,
decltype
(
in_transform_thread_block_desc
),
decltype
(
wei_transform_thread_block_desc
),
decltype
(
out_transform_thread_desc
),
InTileSizeH
,
InTileSizeW
,
S
,
R
,
OutTileSizeH
,
OutTileSizeW
>
(
in_transform_thread_block_desc
,
p_in_transform_block
+
in_transform_block_desc
.
Get1dIndex
(
n_thread_data_begin
,
c_thread_data
,
y_thread_data_begin
*
InTileSizeH
,
x_thread_data_begin
*
InTileSizeW
),
wei_transform_thread_block_desc
,
p_wei_transform_block
+
wei_transform_block_desc
.
Get1dIndex
(
k_thread_data_begin
,
c_thread_data
,
0
,
0
),
out_transform_thread_desc
,
p_out_transform_thread
);
}
};
// transform back
threadwise_winograd_reverse_transform_output
<
TFloat
,
decltype
(
out_transform_thread_desc
),
decltype
(
out_thread_desc
),
InTileSizeH
,
InTileSizeW
,
S
,
R
,
OutTileSizeH
,
OutTileSizeW
>
(
out_transform_thread_desc
,
p_out_transform_thread
,
out_thread_desc
,
p_out_thread
);
// copy output tensor from register to global mem
threadwise_4d_tensor_copy
(
out_thread_desc
,
p_out_thread
,
out_thread_global_desc
,
p_out_global
+
out_global_desc
.
Get1dIndex
(
n_block_data_begin
+
n_thread_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
y_thread_data_begin
*
OutTileSizeH
,
wo_block_data_begin
+
x_thread_data_begin
*
OutTileSizeW
));
}
\ No newline at end of file
src/include/threadwise_direct_convolution.cuh
View file @
fee92fb6
...
...
@@ -19,11 +19,11 @@ __device__ void threadwise_direct_convolution(InDesc,
constexpr
auto
out_desc
=
OutDesc
{};
#if 0
if(threadIdx.x == 0)
if(
blockIdx.x == 0 &&
threadIdx.x == 0)
{
print_ConstantTensorDescriptor(in_desc, "threadwise_direct_convolution: ");
print_ConstantTensorDescriptor(wei_desc, "threadwise_direct_convolution: ");
print_ConstantTensorDescriptor(out_desc, "threadwise_direct_convolution: ");
print_ConstantTensorDescriptor(in_desc, "threadwise_direct_convolution:
in_desc:
");
print_ConstantTensorDescriptor(wei_desc, "threadwise_direct_convolution:
wei_desc:
");
print_ConstantTensorDescriptor(out_desc, "threadwise_direct_convolution:
out_desc:
");
}
#endif
...
...
src/include/threadwise_tensor_op.cuh
View file @
fee92fb6
...
...
@@ -2,7 +2,7 @@
#include "constant_tensor_descriptor.cuh"
template
<
class
TFloat
,
class
Desc
,
class
F
>
__device__
void
threadwise_4d_tensor_pointwise_op_unary
(
Desc
,
TFloat
*
__restrict__
p
_dst
,
F
f
)
__device__
void
threadwise_4d_tensor_pointwise_op_unary
(
Desc
,
TFloat
*
__restrict__
p
,
F
f
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
...
@@ -28,7 +28,7 @@ __device__ void threadwise_4d_tensor_pointwise_op_unary(Desc, TFloat* __restrict
{
const
unsigned
dindex
=
desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
f
(
p
_dst
[
dindex
]);
f
(
p
[
dindex
]);
}
}
}
...
...
@@ -77,12 +77,12 @@ __device__ void threadwise_4d_tensor_pointwise_op_binary(
}
template
<
class
TFloat
,
class
Desc
>
__device__
void
threadwise_4d_tensor_set_zero
(
Desc
,
TFloat
*
__restrict__
p
_dst
)
__device__
void
threadwise_4d_tensor_set_zero
(
Desc
,
TFloat
*
__restrict__
p
)
{
auto
f_set_zero
=
[](
TFloat
&
v
)
{
v
=
TFloat
(
0
);
};
threadwise_4d_tensor_pointwise_op_unary
<
TFloat
,
Desc
,
decltype
(
f_set_zero
)
>
(
Desc
{},
p
_dst
,
f_set_zero
);
Desc
{},
p
,
f_set_zero
);
}
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
>
...
...
@@ -96,3 +96,48 @@ __device__ void threadwise_4d_tensor_copy(SrcDesc,
threadwise_4d_tensor_pointwise_op_binary
<
TFloat
,
SrcDesc
,
DstDesc
,
decltype
(
f_copy
)
>
(
SrcDesc
{},
p_src
,
DstDesc
{},
p_dst
,
f_copy
);
}
template
<
class
TFloat
,
class
Desc
,
class
IDim
>
__device__
void
threadwise_4d_tensor_shift_down
(
Desc
,
TFloat
*
__restrict__
p
,
IDim
,
unsigned
shift
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
desc
=
Desc
{};
#if 0
if(threadIdx.x == 0)
{
print_ConstantTensorDescriptor(desc, "threadwise_4d_tensor_shift_down: ");
}
#endif
const
unsigned
did0_end
=
is_same
<
decltype
(
I0
),
IDim
>::
value
?
desc
.
GetLength
(
I0
)
-
shift
:
desc
.
GetLength
(
I0
);
const
unsigned
did1_end
=
is_same
<
decltype
(
I1
),
IDim
>::
value
?
desc
.
GetLength
(
I1
)
-
shift
:
desc
.
GetLength
(
I1
);
const
unsigned
did2_end
=
is_same
<
decltype
(
I2
),
IDim
>::
value
?
desc
.
GetLength
(
I2
)
-
shift
:
desc
.
GetLength
(
I2
);
const
unsigned
did3_end
=
is_same
<
decltype
(
I3
),
IDim
>::
value
?
desc
.
GetLength
(
I3
)
-
shift
:
desc
.
GetLength
(
I3
);
for
(
unsigned
did0
=
0
;
did0
<
did0_end
;
++
did0
)
{
for
(
unsigned
did1
=
0
;
did1
<
did1_end
;
++
did1
)
{
for
(
unsigned
did2
=
0
;
did2
<
did2_end
;
++
did2
)
{
for
(
unsigned
did3
=
0
;
did3
<
did3_end
;
++
did3
)
{
const
unsigned
dindex
=
desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
sindex
=
dindex
+
shift
*
desc
.
GetStride
(
IDim
{});
p
[
dindex
]
=
p
[
sindex
];
}
}
}
}
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment