Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
yangql
composable_kernel-1
Commits
0b8e67ef
Commit
0b8e67ef
authored
Jan 08, 2019
by
Chao Liu
Browse files
refactor
parent
ac1f62be
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
81 additions
and
89 deletions
+81
-89
src/include/blockwise_direct_convolution.cuh
src/include/blockwise_direct_convolution.cuh
+5
-5
src/include/blockwise_tensor_op.cuh
src/include/blockwise_tensor_op.cuh
+14
-18
src/include/gridwise_direct_convolution_1.cuh
src/include/gridwise_direct_convolution_1.cuh
+8
-8
src/include/gridwise_direct_convolution_2.cuh
src/include/gridwise_direct_convolution_2.cuh
+7
-7
src/include/gridwise_winograd_convolution.cuh
src/include/gridwise_winograd_convolution.cuh
+12
-12
src/include/threadwise_direct_convolution.cuh
src/include/threadwise_direct_convolution.cuh
+16
-16
src/include/threadwise_tensor_op.cuh
src/include/threadwise_tensor_op.cuh
+19
-23
No files found.
src/include/blockwise_direct_convolution.cuh
View file @
0b8e67ef
...
@@ -4,7 +4,7 @@
...
@@ -4,7 +4,7 @@
#include "threadwise_direct_convolution.cuh"
#include "threadwise_direct_convolution.cuh"
template
<
unsigned
BlockSize
,
template
<
unsigned
BlockSize
,
class
T
Float
,
class
Float
,
class
InBlockDesc
,
class
InBlockDesc
,
class
WeiBlockDesc
,
class
WeiBlockDesc
,
class
OutBlockDesc
,
class
OutBlockDesc
,
...
@@ -14,11 +14,11 @@ template <unsigned BlockSize,
...
@@ -14,11 +14,11 @@ template <unsigned BlockSize,
unsigned
KPerThread
,
unsigned
KPerThread
,
unsigned
CPerThread
>
unsigned
CPerThread
>
__device__
void
blockwise_direct_convolution
(
InBlockDesc
,
__device__
void
blockwise_direct_convolution
(
InBlockDesc
,
T
Float
*
const
__restrict__
p_in_block
,
Float
*
const
__restrict__
p_in_block
,
WeiBlockDesc
,
WeiBlockDesc
,
T
Float
*
const
__restrict__
p_wei_block
,
Float
*
const
__restrict__
p_wei_block
,
OutBlockDesc
,
OutBlockDesc
,
T
Float
*
__restrict__
p_out_block
)
Float
*
__restrict__
p_out_block
)
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
@@ -92,7 +92,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
...
@@ -92,7 +92,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
unsigned
hi_thread_data_begin
=
ho_thread_data_begin
;
// minus padding
unsigned
hi_thread_data_begin
=
ho_thread_data_begin
;
// minus padding
unsigned
wi_thread_data_begin
=
wo_thread_data_begin
;
// minus padding
unsigned
wi_thread_data_begin
=
wo_thread_data_begin
;
// minus padding
T
Float
p_out_thread
[
out_thread_desc
.
GetElementSpace
()];
Float
p_out_thread
[
out_thread_desc
.
GetElementSpace
()];
threadwise_4d_tensor_copy
(
out_block_desc
,
threadwise_4d_tensor_copy
(
out_block_desc
,
p_out_block
+
out_block_desc
.
Get1dIndex
(
n_thread_data_begin
,
p_out_block
+
out_block_desc
.
Get1dIndex
(
n_thread_data_begin
,
...
...
src/include/blockwise_tensor_op.cuh
View file @
0b8e67ef
#pragma once
#pragma once
#include "constant_tensor_descriptor.cuh"
#include "constant_tensor_descriptor.cuh"
template
<
unsigned
BlockSize
,
class
T
Float
,
class
DstDesc
,
class
F
>
template
<
unsigned
BlockSize
,
class
Float
,
class
DstDesc
,
class
F
>
__device__
void
__device__
void
blockwise_4d_tensor_pointwise_operation_unary
(
DstDesc
,
T
Float
*
__restrict__
p_dst
,
F
f
)
blockwise_4d_tensor_pointwise_operation_unary
(
DstDesc
,
Float
*
__restrict__
p_dst
,
F
f
)
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
@@ -79,7 +79,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_ds
...
@@ -79,7 +79,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_ds
// TODO: in order to optimize mem access for different mem type,
// TODO: in order to optimize mem access for different mem type,
// need to write specialized version
// need to write specialized version
template
<
unsigned
BlockSize
,
template
<
unsigned
BlockSize
,
class
T
Float
,
class
Float
,
class
SrcDesc
,
class
SrcDesc
,
class
DstDesc
,
class
DstDesc
,
class
RefDesc
,
class
RefDesc
,
...
@@ -87,9 +87,9 @@ template <unsigned BlockSize,
...
@@ -87,9 +87,9 @@ template <unsigned BlockSize,
class
F
>
class
F
>
__device__
void
__device__
void
blockwise_4d_tensor_pointwise_operation_binary_reorder
(
SrcDesc
,
blockwise_4d_tensor_pointwise_operation_binary_reorder
(
SrcDesc
,
T
Float
*
const
__restrict__
p_src
,
Float
*
const
__restrict__
p_src
,
DstDesc
,
DstDesc
,
T
Float
*
__restrict__
p_dst
,
Float
*
__restrict__
p_dst
,
RefDesc
,
RefDesc
,
Reorder
,
Reorder
,
F
f
)
F
f
)
...
@@ -170,36 +170,32 @@ blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
...
@@ -170,36 +170,32 @@ blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
}
}
}
}
template
<
unsigned
BlockSize
,
class
T
Float
,
class
DstDesc
>
template
<
unsigned
BlockSize
,
class
Float
,
class
DstDesc
>
__device__
void
blockwise_4d_tensor_set_zero
(
DstDesc
,
T
Float
*
__restrict__
p_dst
)
__device__
void
blockwise_4d_tensor_set_zero
(
DstDesc
,
Float
*
__restrict__
p_dst
)
{
{
auto
f_set_zero
=
[](
T
Float
&
v
)
{
v
=
T
Float
(
0
);
};
auto
f_set_zero
=
[](
Float
&
v
)
{
v
=
Float
(
0
);
};
blockwise_4d_tensor_pointwise_operation_unary
<
BlockSize
>
(
DstDesc
{},
p_dst
,
f_set_zero
);
blockwise_4d_tensor_pointwise_operation_unary
<
BlockSize
>
(
DstDesc
{},
p_dst
,
f_set_zero
);
}
}
template
<
unsigned
BlockSize
,
template
<
unsigned
BlockSize
,
class
T
Float
,
class
Float
,
class
SrcDesc
,
class
SrcDesc
,
class
DstDesc
,
class
DstDesc
,
class
RefDesc
,
class
RefDesc
,
class
Reorder
>
class
Reorder
>
__device__
void
blockwise_4d_tensor_copy_reorder
(
SrcDesc
,
__device__
void
blockwise_4d_tensor_copy_reorder
(
TFloat
*
const
__restrict__
p_src
,
SrcDesc
,
Float
*
const
__restrict__
p_src
,
DstDesc
,
Float
*
__restrict__
p_dst
,
RefDesc
,
Reorder
)
DstDesc
,
TFloat
*
__restrict__
p_dst
,
RefDesc
,
Reorder
)
{
{
auto
f_copy
=
[](
const
T
Float
&
src
,
T
Float
&
dst
)
{
dst
=
src
;
};
auto
f_copy
=
[](
const
Float
&
src
,
Float
&
dst
)
{
dst
=
src
;
};
blockwise_4d_tensor_pointwise_operation_binary_reorder
<
BlockSize
>
(
blockwise_4d_tensor_pointwise_operation_binary_reorder
<
BlockSize
>
(
SrcDesc
{},
p_src
,
DstDesc
{},
p_dst
,
RefDesc
{},
Reorder
{},
f_copy
);
SrcDesc
{},
p_src
,
DstDesc
{},
p_dst
,
RefDesc
{},
Reorder
{},
f_copy
);
}
}
template
<
unsigned
BlockSize
,
class
T
Float
,
class
SrcDesc
,
class
DstDesc
,
class
RefDesc
>
template
<
unsigned
BlockSize
,
class
Float
,
class
SrcDesc
,
class
DstDesc
,
class
RefDesc
>
__device__
void
blockwise_4d_tensor_copy
(
__device__
void
blockwise_4d_tensor_copy
(
SrcDesc
,
T
Float
*
const
__restrict__
p_src
,
DstDesc
,
T
Float
*
__restrict__
p_dst
,
RefDesc
)
SrcDesc
,
Float
*
const
__restrict__
p_src
,
DstDesc
,
Float
*
__restrict__
p_dst
,
RefDesc
)
{
{
constexpr
auto
reorder
=
Sequence
<
0
,
1
,
2
,
3
>
{};
constexpr
auto
reorder
=
Sequence
<
0
,
1
,
2
,
3
>
{};
...
...
src/include/gridwise_direct_convolution_1.cuh
View file @
0b8e67ef
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#include "blockwise_tensor_op.cuh"
#include "blockwise_tensor_op.cuh"
#include "blockwise_direct_convolution.cuh"
#include "blockwise_direct_convolution.cuh"
template
<
class
T
Float
,
template
<
class
Float
,
class
InGlobalDesc
,
class
InGlobalDesc
,
class
WeiGlobalDesc
,
class
WeiGlobalDesc
,
class
OutGlobalDesc
,
class
OutGlobalDesc
,
...
@@ -20,11 +20,11 @@ template <class TFloat,
...
@@ -20,11 +20,11 @@ template <class TFloat,
unsigned
BlockSize
,
unsigned
BlockSize
,
unsigned
GridSize
>
unsigned
GridSize
>
__global__
void
gridwise_direct_convolution_1
(
InGlobalDesc
,
__global__
void
gridwise_direct_convolution_1
(
InGlobalDesc
,
T
Float
*
const
__restrict__
p_in_global
,
Float
*
const
__restrict__
p_in_global
,
WeiGlobalDesc
,
WeiGlobalDesc
,
T
Float
*
const
__restrict__
p_wei_global
,
Float
*
const
__restrict__
p_wei_global
,
OutGlobalDesc
,
OutGlobalDesc
,
T
Float
*
__restrict__
p_out_global
)
Float
*
__restrict__
p_out_global
)
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
@@ -68,9 +68,9 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
...
@@ -68,9 +68,9 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
constexpr
unsigned
wei_block_size
=
wei_block_desc
.
GetElementSpace
();
constexpr
unsigned
wei_block_size
=
wei_block_desc
.
GetElementSpace
();
constexpr
unsigned
out_block_size
=
out_block_desc
.
GetElementSpace
();
constexpr
unsigned
out_block_size
=
out_block_desc
.
GetElementSpace
();
__shared__
T
Float
p_in_block
[
in_block_size
];
__shared__
Float
p_in_block
[
in_block_size
];
__shared__
T
Float
p_wei_block
[
wei_block_size
];
__shared__
Float
p_wei_block
[
wei_block_size
];
__shared__
T
Float
p_out_block
[
out_block_size
];
__shared__
Float
p_out_block
[
out_block_size
];
const
unsigned
block_id
=
blockIdx
.
x
;
const
unsigned
block_id
=
blockIdx
.
x
;
...
@@ -150,7 +150,7 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
...
@@ -150,7 +150,7 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
// blockwise convolution
// blockwise convolution
blockwise_direct_convolution
<
BlockSize
,
blockwise_direct_convolution
<
BlockSize
,
T
Float
,
Float
,
decltype
(
in_block_desc
),
decltype
(
in_block_desc
),
decltype
(
wei_block_desc
),
decltype
(
wei_block_desc
),
decltype
(
out_block_desc
),
decltype
(
out_block_desc
),
...
...
src/include/gridwise_direct_convolution_2.cuh
View file @
0b8e67ef
...
@@ -5,7 +5,7 @@
...
@@ -5,7 +5,7 @@
#include "threadwise_tensor_op.cuh"
#include "threadwise_tensor_op.cuh"
#include "threadwise_direct_convolution.cuh"
#include "threadwise_direct_convolution.cuh"
template
<
class
T
Float
,
template
<
class
Float
,
class
InGlobalDesc
,
class
InGlobalDesc
,
class
WeiGlobalDesc
,
class
WeiGlobalDesc
,
class
OutGlobalDesc
,
class
OutGlobalDesc
,
...
@@ -22,11 +22,11 @@ template <class TFloat,
...
@@ -22,11 +22,11 @@ template <class TFloat,
unsigned
BlockSize
,
unsigned
BlockSize
,
unsigned
GridSize
>
unsigned
GridSize
>
__global__
void
gridwise_direct_convolution_2
(
InGlobalDesc
,
__global__
void
gridwise_direct_convolution_2
(
InGlobalDesc
,
T
Float
*
const
__restrict__
p_in_global
,
Float
*
const
__restrict__
p_in_global
,
WeiGlobalDesc
,
WeiGlobalDesc
,
T
Float
*
const
__restrict__
p_wei_global
,
Float
*
const
__restrict__
p_wei_global
,
OutGlobalDesc
,
OutGlobalDesc
,
T
Float
*
__restrict__
p_out_global
)
Float
*
__restrict__
p_out_global
)
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
@@ -56,8 +56,8 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
...
@@ -56,8 +56,8 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
constexpr
unsigned
in_block_size
=
in_block_desc
.
GetElementSpace
();
constexpr
unsigned
in_block_size
=
in_block_desc
.
GetElementSpace
();
constexpr
unsigned
wei_block_size
=
wei_block_desc
.
GetElementSpace
();
constexpr
unsigned
wei_block_size
=
wei_block_desc
.
GetElementSpace
();
__shared__
T
Float
p_in_block
[
in_block_size
];
__shared__
Float
p_in_block
[
in_block_size
];
__shared__
T
Float
p_wei_block
[
wei_block_size
];
__shared__
Float
p_wei_block
[
wei_block_size
];
// threadwise tensors
// threadwise tensors
constexpr
unsigned
InTileSizeH
=
OutTileSizeH
+
S
-
1
;
constexpr
unsigned
InTileSizeH
=
OutTileSizeH
+
S
-
1
;
...
@@ -73,7 +73,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
...
@@ -73,7 +73,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
get_convolution_output_4d_tensor_descriptor
(
in_thread_block_desc
,
wei_thread_block_desc
);
get_convolution_output_4d_tensor_descriptor
(
in_thread_block_desc
,
wei_thread_block_desc
);
// register
// register
T
Float
p_out_thread
[
out_thread_desc
.
GetElementSpace
()];
Float
p_out_thread
[
out_thread_desc
.
GetElementSpace
()];
// divide block work
// divide block work
constexpr
unsigned
NBlockWork
=
(
out_global_desc
.
GetLength
(
I0
)
+
NPerBlock
-
1
)
/
NPerBlock
;
constexpr
unsigned
NBlockWork
=
(
out_global_desc
.
GetLength
(
I0
)
+
NPerBlock
-
1
)
/
NPerBlock
;
...
...
src/include/gridwise_winograd_convolution.cuh
View file @
0b8e67ef
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#include "blockwise_winograd_transform.cuh"
#include "blockwise_winograd_transform.cuh"
#include "threadwise_winograd_transform.cuh"
#include "threadwise_winograd_transform.cuh"
template
<
class
T
Float
,
template
<
class
Float
,
class
InGlobalDesc
,
class
InGlobalDesc
,
class
WeiGlobalDesc
,
class
WeiGlobalDesc
,
class
OutGlobalDesc
,
class
OutGlobalDesc
,
...
@@ -20,11 +20,11 @@ template <class TFloat,
...
@@ -20,11 +20,11 @@ template <class TFloat,
unsigned
BlockSize
,
unsigned
BlockSize
,
unsigned
GridSize
>
unsigned
GridSize
>
__global__
void
gridwise_winograd_convolution
(
InGlobalDesc
,
__global__
void
gridwise_winograd_convolution
(
InGlobalDesc
,
T
Float
*
const
__restrict__
p_in_global
,
Float
*
const
__restrict__
p_in_global
,
WeiGlobalDesc
,
WeiGlobalDesc
,
T
Float
*
const
__restrict__
p_wei_global
,
Float
*
const
__restrict__
p_wei_global
,
OutGlobalDesc
,
OutGlobalDesc
,
T
Float
*
__restrict__
p_out_global
)
Float
*
__restrict__
p_out_global
)
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
@@ -102,8 +102,8 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
...
@@ -102,8 +102,8 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
constexpr
auto
wei_transform_block_desc
=
constexpr
auto
wei_transform_block_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
KPerBlock
,
CPerBlock
,
InTileSizeH
,
InTileSizeW
>
{});
make_ConstantTensorDescriptor
(
Sequence
<
KPerBlock
,
CPerBlock
,
InTileSizeH
,
InTileSizeW
>
{});
__shared__
T
Float
p_in_transform_block
[
in_transform_block_desc
.
GetElementSpace
()];
__shared__
Float
p_in_transform_block
[
in_transform_block_desc
.
GetElementSpace
()];
__shared__
T
Float
p_wei_transform_block
[
wei_transform_block_desc
.
GetElementSpace
()];
__shared__
Float
p_wei_transform_block
[
wei_transform_block_desc
.
GetElementSpace
()];
// thread data
// thread data
constexpr
auto
in_transform_thread_block_desc
=
constexpr
auto
in_transform_thread_block_desc
=
...
@@ -123,8 +123,8 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
...
@@ -123,8 +123,8 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
constexpr
auto
out_thread_global_desc
=
constexpr
auto
out_thread_global_desc
=
make_ConstantTensorDescriptor
(
out_thread_desc
.
GetLengths
(),
out_global_desc
.
GetStrides
());
make_ConstantTensorDescriptor
(
out_thread_desc
.
GetLengths
(),
out_global_desc
.
GetStrides
());
T
Float
p_out_transform_thread
[
out_transform_thread_desc
.
GetElementSpace
()];
Float
p_out_transform_thread
[
out_transform_thread_desc
.
GetElementSpace
()];
T
Float
p_out_thread
[
out_thread_desc
.
GetElementSpace
()];
Float
p_out_thread
[
out_thread_desc
.
GetElementSpace
()];
#if 0
#if 0
if(blockIdx.x == 0 && threadIdx.x == 0)
if(blockIdx.x == 0 && threadIdx.x == 0)
...
@@ -146,7 +146,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
...
@@ -146,7 +146,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
{
{
#if 0
#if 0
// blockwise transform input
// blockwise transform input
blockwise_winograd_transform_input<
T
Float,
blockwise_winograd_transform_input<Float,
InTileSizeH,
InTileSizeH,
InTileSizeW,
InTileSizeW,
S,
S,
...
@@ -166,7 +166,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
...
@@ -166,7 +166,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
#endif
#endif
// blockwise transform weights
// blockwise transform weights
blockwise_winograd_transform_weight
<
T
Float
,
blockwise_winograd_transform_weight
<
Float
,
InTileSizeH
,
InTileSizeH
,
InTileSizeW
,
InTileSizeW
,
S
,
S
,
...
@@ -183,7 +183,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
...
@@ -183,7 +183,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
{
{
// threadwise point multiplication
// threadwise point multiplication
threadwise_winograd_calculate_transformed_output
<
threadwise_winograd_calculate_transformed_output
<
T
Float
,
Float
,
decltype
(
in_transform_thread_block_desc
),
decltype
(
in_transform_thread_block_desc
),
decltype
(
wei_transform_thread_block_desc
),
decltype
(
wei_transform_thread_block_desc
),
decltype
(
out_transform_thread_desc
),
decltype
(
out_transform_thread_desc
),
...
@@ -207,7 +207,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
...
@@ -207,7 +207,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
};
};
// transform back
// transform back
threadwise_winograd_reverse_transform_output
<
T
Float
,
threadwise_winograd_reverse_transform_output
<
Float
,
decltype
(
out_transform_thread_desc
),
decltype
(
out_transform_thread_desc
),
decltype
(
out_thread_desc
),
decltype
(
out_thread_desc
),
InTileSizeH
,
InTileSizeH
,
...
...
src/include/threadwise_direct_convolution.cuh
View file @
0b8e67ef
...
@@ -2,13 +2,13 @@
...
@@ -2,13 +2,13 @@
#include "constant_tensor_descriptor.cuh"
#include "constant_tensor_descriptor.cuh"
// optimized for scenario if p_in, p_wei, p_out are in register
// optimized for scenario if p_in, p_wei, p_out are in register
template
<
class
T
Float
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
template
<
class
Float
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
__device__
void
threadwise_direct_convolution_1
(
InDesc
,
__device__
void
threadwise_direct_convolution_1
(
InDesc
,
T
Float
*
const
__restrict__
p_in
,
Float
*
const
__restrict__
p_in
,
WeiDesc
,
WeiDesc
,
T
Float
*
const
__restrict__
p_wei
,
Float
*
const
__restrict__
p_wei
,
OutDesc
,
OutDesc
,
T
Float
*
__restrict__
p_out
)
Float
*
__restrict__
p_out
)
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
@@ -81,13 +81,13 @@ __device__ void threadwise_direct_convolution_1(InDesc,
...
@@ -81,13 +81,13 @@ __device__ void threadwise_direct_convolution_1(InDesc,
// Optimized for scenario if p_in and p_wei are in LDS, p_out are in register
// Optimized for scenario if p_in and p_wei are in LDS, p_out are in register
// Copy in and wei into register before doing convolution
// Copy in and wei into register before doing convolution
template
<
class
T
Float
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
template
<
class
Float
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
__device__
void
threadwise_direct_convolution_2
(
InDesc
,
__device__
void
threadwise_direct_convolution_2
(
InDesc
,
T
Float
*
const
__restrict__
p_in
,
Float
*
const
__restrict__
p_in
,
WeiDesc
,
WeiDesc
,
T
Float
*
const
__restrict__
p_wei
,
Float
*
const
__restrict__
p_wei
,
OutDesc
,
OutDesc
,
T
Float
*
__restrict__
p_out
)
Float
*
__restrict__
p_out
)
{
{
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
...
@@ -97,8 +97,8 @@ __device__ void threadwise_direct_convolution_2(InDesc,
...
@@ -97,8 +97,8 @@ __device__ void threadwise_direct_convolution_2(InDesc,
constexpr
auto
wei_reg_desc
=
make_ConstantTensorDescriptor
(
wei_desc
.
GetLengths
());
constexpr
auto
wei_reg_desc
=
make_ConstantTensorDescriptor
(
wei_desc
.
GetLengths
());
// register
// register
T
Float
p_in_reg
[
in_reg_desc
.
GetElementSpace
()];
Float
p_in_reg
[
in_reg_desc
.
GetElementSpace
()];
T
Float
p_wei_reg
[
wei_reg_desc
.
GetElementSpace
()];
Float
p_wei_reg
[
wei_reg_desc
.
GetElementSpace
()];
// copy input tensor into register
// copy input tensor into register
threadwise_4d_tensor_copy
(
in_desc
,
p_in
,
in_reg_desc
,
p_in_reg
,
in_reg_desc
);
threadwise_4d_tensor_copy
(
in_desc
,
p_in
,
in_reg_desc
,
p_in_reg
,
in_reg_desc
);
...
@@ -114,13 +114,13 @@ __device__ void threadwise_direct_convolution_2(InDesc,
...
@@ -114,13 +114,13 @@ __device__ void threadwise_direct_convolution_2(InDesc,
// optimized for scenario where p_in and p_wei are in LDS, p_out is in register
// optimized for scenario where p_in and p_wei are in LDS, p_out is in register
// break down a non-1x1 convolution into a sequence of 1x1 convolutions,
// break down a non-1x1 convolution into a sequence of 1x1 convolutions,
// load 1x1 weight into register, and do 1x1 convolution in register.
// load 1x1 weight into register, and do 1x1 convolution in register.
template
<
class
T
Float
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
template
<
class
Float
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
__device__
void
threadwise_direct_convolution_3
(
InDesc
,
__device__
void
threadwise_direct_convolution_3
(
InDesc
,
T
Float
*
const
__restrict__
p_in
,
Float
*
const
__restrict__
p_in
,
WeiDesc
,
WeiDesc
,
T
Float
*
const
__restrict__
p_wei
,
Float
*
const
__restrict__
p_wei
,
OutDesc
,
OutDesc
,
T
Float
*
__restrict__
p_out
)
Float
*
__restrict__
p_out
)
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
@@ -139,8 +139,8 @@ __device__ void threadwise_direct_convolution_3(InDesc,
...
@@ -139,8 +139,8 @@ __device__ void threadwise_direct_convolution_3(InDesc,
constexpr
auto
wei_reg_desc
=
make_ConstantTensorDescriptor
(
constexpr
auto
wei_reg_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
wei_desc
.
GetLength
(
I0
),
wei_desc
.
GetLength
(
I1
),
1
,
1
>
{});
Sequence
<
wei_desc
.
GetLength
(
I0
),
wei_desc
.
GetLength
(
I1
),
1
,
1
>
{});
T
Float
p_in_reg
[
in_reg_desc
.
GetElementSpace
()];
Float
p_in_reg
[
in_reg_desc
.
GetElementSpace
()];
T
Float
p_wei_reg
[
wei_reg_desc
.
GetElementSpace
()];
Float
p_wei_reg
[
wei_reg_desc
.
GetElementSpace
()];
constexpr
unsigned
in_w_new_read
=
1
;
constexpr
unsigned
in_w_new_read
=
1
;
...
...
src/include/threadwise_tensor_op.cuh
View file @
0b8e67ef
#pragma once
#pragma once
#include "constant_tensor_descriptor.cuh"
#include "constant_tensor_descriptor.cuh"
template
<
class
T
Float
,
class
Desc
,
class
F
>
template
<
class
Float
,
class
Desc
,
class
F
>
__device__
void
threadwise_4d_tensor_pointwise_operation_unary
(
Desc
,
T
Float
*
__restrict__
p
,
F
f
)
__device__
void
threadwise_4d_tensor_pointwise_operation_unary
(
Desc
,
Float
*
__restrict__
p
,
F
f
)
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
@@ -37,12 +37,12 @@ __device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, TFloat* __r
...
@@ -37,12 +37,12 @@ __device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, TFloat* __r
// TODO: in order to optimize mem access for different mem type,
// TODO: in order to optimize mem access for different mem type,
// need to write specialized version
// need to write specialized version
template
<
class
T
Float
,
class
SrcDesc
,
class
DstDesc
,
class
RefDesc
,
class
Reorder
,
class
F
>
template
<
class
Float
,
class
SrcDesc
,
class
DstDesc
,
class
RefDesc
,
class
Reorder
,
class
F
>
__device__
void
__device__
void
threadwise_4d_tensor_pointwise_operation_binary_reorder
(
SrcDesc
,
threadwise_4d_tensor_pointwise_operation_binary_reorder
(
SrcDesc
,
T
Float
*
const
__restrict__
p_src
,
Float
*
const
__restrict__
p_src
,
DstDesc
,
DstDesc
,
T
Float
*
__restrict__
p_dst
,
Float
*
__restrict__
p_dst
,
RefDesc
,
RefDesc
,
Reorder
,
Reorder
,
F
f
)
F
f
)
...
@@ -83,26 +83,22 @@ threadwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
...
@@ -83,26 +83,22 @@ threadwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
}
}
}
}
template
<
class
T
Float
,
class
Desc
>
template
<
class
Float
,
class
Desc
>
__device__
void
threadwise_4d_tensor_set_zero
(
Desc
,
T
Float
*
__restrict__
p
)
__device__
void
threadwise_4d_tensor_set_zero
(
Desc
,
Float
*
__restrict__
p
)
{
{
auto
f_set_zero
=
[](
T
Float
&
v
)
{
v
=
T
Float
(
0
);
};
auto
f_set_zero
=
[](
Float
&
v
)
{
v
=
Float
(
0
);
};
threadwise_4d_tensor_pointwise_operation_unary
<
T
Float
,
Desc
,
decltype
(
f_set_zero
)
>
(
threadwise_4d_tensor_pointwise_operation_unary
<
Float
,
Desc
,
decltype
(
f_set_zero
)
>
(
Desc
{},
p
,
f_set_zero
);
Desc
{},
p
,
f_set_zero
);
}
}
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
class
RefDesc
,
class
Reorder
>
template
<
class
Float
,
class
SrcDesc
,
class
DstDesc
,
class
RefDesc
,
class
Reorder
>
__device__
void
threadwise_4d_tensor_copy_reorder
(
SrcDesc
,
__device__
void
threadwise_4d_tensor_copy_reorder
(
TFloat
*
const
__restrict__
p_src
,
SrcDesc
,
Float
*
const
__restrict__
p_src
,
DstDesc
,
Float
*
__restrict__
p_dst
,
RefDesc
,
Reorder
)
DstDesc
,
TFloat
*
__restrict__
p_dst
,
RefDesc
,
Reorder
)
{
{
auto
f_copy
=
[](
const
T
Float
&
src
,
T
Float
&
dst
)
{
dst
=
src
;
};
auto
f_copy
=
[](
const
Float
&
src
,
Float
&
dst
)
{
dst
=
src
;
};
threadwise_4d_tensor_pointwise_operation_binary_reorder
<
T
Float
,
threadwise_4d_tensor_pointwise_operation_binary_reorder
<
Float
,
SrcDesc
,
SrcDesc
,
DstDesc
,
DstDesc
,
RefDesc
,
RefDesc
,
...
@@ -111,18 +107,18 @@ __device__ void threadwise_4d_tensor_copy_reorder(SrcDesc,
...
@@ -111,18 +107,18 @@ __device__ void threadwise_4d_tensor_copy_reorder(SrcDesc,
SrcDesc
{},
p_src
,
DstDesc
{},
p_dst
,
RefDesc
{},
Reorder
{},
f_copy
);
SrcDesc
{},
p_src
,
DstDesc
{},
p_dst
,
RefDesc
{},
Reorder
{},
f_copy
);
}
}
template
<
class
T
Float
,
class
SrcDesc
,
class
DstDesc
,
class
RefDesc
>
template
<
class
Float
,
class
SrcDesc
,
class
DstDesc
,
class
RefDesc
>
__device__
void
threadwise_4d_tensor_copy
(
__device__
void
threadwise_4d_tensor_copy
(
SrcDesc
,
T
Float
*
const
__restrict__
p_src
,
DstDesc
,
T
Float
*
__restrict__
p_dst
,
RefDesc
)
SrcDesc
,
Float
*
const
__restrict__
p_src
,
DstDesc
,
Float
*
__restrict__
p_dst
,
RefDesc
)
{
{
auto
reorder
=
Sequence
<
0
,
1
,
2
,
3
>
{};
auto
reorder
=
Sequence
<
0
,
1
,
2
,
3
>
{};
threadwise_4d_tensor_copy_reorder
<
T
Float
,
SrcDesc
,
DstDesc
,
RefDesc
,
decltype
(
reorder
)
>
(
threadwise_4d_tensor_copy_reorder
<
Float
,
SrcDesc
,
DstDesc
,
RefDesc
,
decltype
(
reorder
)
>
(
SrcDesc
{},
p_src
,
DstDesc
{},
p_dst
,
RefDesc
{},
reorder
);
SrcDesc
{},
p_src
,
DstDesc
{},
p_dst
,
RefDesc
{},
reorder
);
}
}
template
<
class
T
Float
,
class
Desc
,
class
IDim
,
class
NShift
>
template
<
class
Float
,
class
Desc
,
class
IDim
,
class
NShift
>
__device__
void
threadwise_4d_tensor_shift_down
(
Desc
,
T
Float
*
__restrict__
p
,
IDim
,
NShift
)
__device__
void
threadwise_4d_tensor_shift_down
(
Desc
,
Float
*
__restrict__
p
,
IDim
,
NShift
)
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment