Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
yangql
composable_kernel-1
Commits
ac1f62be
Commit
ac1f62be
authored
Jan 07, 2019
by
Chao Liu
Browse files
refactor
parent
3dbd4725
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
79 additions
and
89 deletions
+79
-89
src/include/blockwise_direct_convolution.cuh
src/include/blockwise_direct_convolution.cuh
+3
-3
src/include/blockwise_tensor_op.cuh
src/include/blockwise_tensor_op.cuh
+12
-31
src/include/constant_tensor_descriptor.cuh
src/include/constant_tensor_descriptor.cuh
+53
-23
src/include/gridwise_direct_convolution_1.cuh
src/include/gridwise_direct_convolution_1.cuh
+8
-21
src/include/gridwise_direct_convolution_2.cuh
src/include/gridwise_direct_convolution_2.cuh
+3
-11
No files found.
src/include/blockwise_direct_convolution.cuh
View file @
ac1f62be
...
@@ -3,7 +3,8 @@
...
@@ -3,7 +3,8 @@
#include "threadwise_tensor_op.cuh"
#include "threadwise_tensor_op.cuh"
#include "threadwise_direct_convolution.cuh"
#include "threadwise_direct_convolution.cuh"
template
<
class
TFloat
,
template
<
unsigned
BlockSize
,
class
TFloat
,
class
InBlockDesc
,
class
InBlockDesc
,
class
WeiBlockDesc
,
class
WeiBlockDesc
,
class
OutBlockDesc
,
class
OutBlockDesc
,
...
@@ -11,8 +12,7 @@ template <class TFloat,
...
@@ -11,8 +12,7 @@ template <class TFloat,
unsigned
OutTileSizeW
,
unsigned
OutTileSizeW
,
unsigned
NPerThread
,
unsigned
NPerThread
,
unsigned
KPerThread
,
unsigned
KPerThread
,
unsigned
CPerThread
,
unsigned
CPerThread
>
unsigned
BlockSize
>
__device__
void
blockwise_direct_convolution
(
InBlockDesc
,
__device__
void
blockwise_direct_convolution
(
InBlockDesc
,
TFloat
*
const
__restrict__
p_in_block
,
TFloat
*
const
__restrict__
p_in_block
,
WeiBlockDesc
,
WeiBlockDesc
,
...
...
src/include/blockwise_tensor_op.cuh
View file @
ac1f62be
#pragma once
#pragma once
#include "constant_tensor_descriptor.cuh"
#include "constant_tensor_descriptor.cuh"
template
<
class
TFloat
,
class
DstDesc
,
class
F
,
unsigned
BlockSize
>
template
<
unsigned
BlockSize
,
class
TFloat
,
class
DstDesc
,
class
F
>
__device__
void
__device__
void
blockwise_4d_tensor_pointwise_operation_unary
(
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
blockwise_4d_tensor_pointwise_operation_unary
(
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
{
{
...
@@ -78,13 +78,13 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_ds
...
@@ -78,13 +78,13 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_ds
// TODO: in order to optimize mem access for different mem type,
// TODO: in order to optimize mem access for different mem type,
// need to write specialized version
// need to write specialized version
template
<
class
TFloat
,
template
<
unsigned
BlockSize
,
class
TFloat
,
class
SrcDesc
,
class
SrcDesc
,
class
DstDesc
,
class
DstDesc
,
class
RefDesc
,
class
RefDesc
,
class
Reorder
,
class
Reorder
,
class
F
,
class
F
>
unsigned
BlockSize
>
__device__
void
__device__
void
blockwise_4d_tensor_pointwise_operation_binary_reorder
(
SrcDesc
,
blockwise_4d_tensor_pointwise_operation_binary_reorder
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
TFloat
*
const
__restrict__
p_src
,
...
@@ -170,21 +170,20 @@ blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
...
@@ -170,21 +170,20 @@ blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
}
}
}
}
template
<
class
TFloat
,
class
DstDesc
,
unsigned
BlockSize
>
template
<
unsigned
BlockSize
,
class
TFloat
,
class
DstDesc
>
__device__
void
blockwise_4d_tensor_set_zero
(
DstDesc
,
TFloat
*
__restrict__
p_dst
)
__device__
void
blockwise_4d_tensor_set_zero
(
DstDesc
,
TFloat
*
__restrict__
p_dst
)
{
{
auto
f_set_zero
=
[](
TFloat
&
v
)
{
v
=
TFloat
(
0
);
};
auto
f_set_zero
=
[](
TFloat
&
v
)
{
v
=
TFloat
(
0
);
};
blockwise_4d_tensor_pointwise_operation_unary
<
TFloat
,
DstDesc
,
decltype
(
f_set_zero
),
BlockSize
>
(
blockwise_4d_tensor_pointwise_operation_unary
<
BlockSize
>
(
DstDesc
{},
p_dst
,
f_set_zero
);
DstDesc
{},
p_dst
,
f_set_zero
);
}
}
template
<
class
TFloat
,
template
<
unsigned
BlockSize
,
class
TFloat
,
class
SrcDesc
,
class
SrcDesc
,
class
DstDesc
,
class
DstDesc
,
class
RefDesc
,
class
RefDesc
,
class
Reorder
,
class
Reorder
>
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_copy_reorder
(
SrcDesc
,
__device__
void
blockwise_4d_tensor_copy_reorder
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
DstDesc
,
...
@@ -194,34 +193,16 @@ __device__ void blockwise_4d_tensor_copy_reorder(SrcDesc,
...
@@ -194,34 +193,16 @@ __device__ void blockwise_4d_tensor_copy_reorder(SrcDesc,
{
{
auto
f_copy
=
[](
const
TFloat
&
src
,
TFloat
&
dst
)
{
dst
=
src
;
};
auto
f_copy
=
[](
const
TFloat
&
src
,
TFloat
&
dst
)
{
dst
=
src
;
};
blockwise_4d_tensor_pointwise_operation_binary_reorder
<
TFloat
,
blockwise_4d_tensor_pointwise_operation_binary_reorder
<
BlockSize
>
(
SrcDesc
,
DstDesc
,
RefDesc
,
Reorder
,
decltype
(
f_copy
),
BlockSize
>
(
SrcDesc
{},
p_src
,
DstDesc
{},
p_dst
,
RefDesc
{},
Reorder
{},
f_copy
);
SrcDesc
{},
p_src
,
DstDesc
{},
p_dst
,
RefDesc
{},
Reorder
{},
f_copy
);
}
}
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
class
RefDesc
,
unsigned
BlockSize
>
template
<
unsigned
BlockSize
,
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
class
RefDesc
>
__device__
void
blockwise_4d_tensor_copy
(
__device__
void
blockwise_4d_tensor_copy
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
RefDesc
)
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
RefDesc
)
{
{
constexpr
auto
reorder
=
Sequence
<
0
,
1
,
2
,
3
>
{};
constexpr
auto
reorder
=
Sequence
<
0
,
1
,
2
,
3
>
{};
blockwise_4d_tensor_copy_reorder
<
TFloat
,
blockwise_4d_tensor_copy_reorder
<
BlockSize
>
(
SrcDesc
,
DstDesc
,
RefDesc
,
decltype
(
reorder
),
BlockSize
>
(
SrcDesc
{},
p_src
,
DstDesc
{},
p_dst
,
RefDesc
{},
reorder
);
SrcDesc
{},
p_src
,
DstDesc
{},
p_dst
,
RefDesc
{},
reorder
);
}
}
template
<
class
TFloat
,
class
ImDesc
,
class
WDesc
,
class
ColDesc
,
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_im2col
(
ImDesc
,
const
__restrict__
TFloat
*
p_im
,
WDesc
,
ColDesc
,
__restrict__
TFloat
*
p_col
)
{
// do nothing
}
src/include/constant_tensor_descriptor.cuh
View file @
ac1f62be
...
@@ -24,12 +24,55 @@ struct Sequence
...
@@ -24,12 +24,55 @@ struct Sequence
}
}
template
<
unsigned
I
>
template
<
unsigned
I
>
__host__
__device__
constexpr
auto
Get
Number
(
Number
<
I
>
)
const
__host__
__device__
constexpr
auto
Get
Constant
(
Number
<
I
>
)
const
{
{
constexpr
unsigned
N
=
Get
(
I
);
constexpr
unsigned
N
=
Get
(
I
);
return
Number
<
N
>
{};
return
Number
<
N
>
{};
}
}
template
<
unsigned
I0
,
unsigned
I1
>
__host__
__device__
constexpr
auto
Reorder
(
Number
<
I0
>
,
Number
<
I1
>
)
const
{
constexpr
unsigned
IR0
=
Get
(
Number
<
I0
>
{});
constexpr
unsigned
IR1
=
Get
(
Number
<
I1
>
{});
return
Sequence
<
IR0
,
IR1
>
{};
}
template
<
unsigned
I0
,
unsigned
I1
,
unsigned
I2
>
__host__
__device__
constexpr
auto
Reorder
(
Number
<
I0
>
,
Number
<
I1
>
,
Number
<
I2
>
)
const
{
constexpr
unsigned
IR0
=
Get
(
Number
<
I0
>
{});
constexpr
unsigned
IR1
=
Get
(
Number
<
I1
>
{});
constexpr
unsigned
IR2
=
Get
(
Number
<
I2
>
{});
return
Sequence
<
IR0
,
IR1
,
IR2
>
{};
}
template
<
unsigned
I0
,
unsigned
I1
,
unsigned
I2
,
unsigned
I3
>
__host__
__device__
constexpr
auto
Reorder
(
Number
<
I0
>
,
Number
<
I1
>
,
Number
<
I2
>
,
Number
<
I3
>
)
const
{
constexpr
unsigned
IR0
=
Get
(
Number
<
I0
>
{});
constexpr
unsigned
IR1
=
Get
(
Number
<
I1
>
{});
constexpr
unsigned
IR2
=
Get
(
Number
<
I2
>
{});
constexpr
unsigned
IR3
=
Get
(
Number
<
I3
>
{});
return
Sequence
<
IR0
,
IR1
,
IR2
,
IR3
>
{};
}
template
<
unsigned
I0
,
unsigned
I1
,
unsigned
I2
,
unsigned
I3
,
unsigned
I4
>
__host__
__device__
constexpr
auto
Reorder
(
Number
<
I0
>
,
Number
<
I1
>
,
Number
<
I2
>
,
Number
<
I3
>
,
Number
<
I4
>
)
const
{
constexpr
unsigned
IR0
=
Get
(
Number
<
I0
>
{});
constexpr
unsigned
IR1
=
Get
(
Number
<
I1
>
{});
constexpr
unsigned
IR2
=
Get
(
Number
<
I2
>
{});
constexpr
unsigned
IR3
=
Get
(
Number
<
I3
>
{});
constexpr
unsigned
IR4
=
Get
(
Number
<
I4
>
{});
return
Sequence
<
IR0
,
IR1
,
IR2
,
IR3
,
IR4
>
{};
}
};
};
template
<
class
Lengths
,
class
Strides
>
template
<
class
Lengths
,
class
Strides
>
...
@@ -99,6 +142,15 @@ struct ConstantTensorDescriptor
...
@@ -99,6 +142,15 @@ struct ConstantTensorDescriptor
static_assert
(
nDim
==
4
,
"nDim is not 4"
);
static_assert
(
nDim
==
4
,
"nDim is not 4"
);
return
n
*
GetStride
(
I0
)
+
c
*
GetStride
(
I1
)
+
h
*
GetStride
(
I2
)
+
w
*
GetStride
(
I3
);
return
n
*
GetStride
(
I0
)
+
c
*
GetStride
(
I1
)
+
h
*
GetStride
(
I2
)
+
w
*
GetStride
(
I3
);
}
}
template
<
class
...
Is
>
__host__
__device__
constexpr
auto
Reorder
(
Is
...
is
)
const
{
constexpr
auto
lengths
=
Lengths
{}.
Reorder
(
is
...);
constexpr
auto
strides
=
Strides
{}.
Reorder
(
is
...);
return
ConstantTensorDescriptor
<
decltype
(
lengths
),
decltype
(
strides
)
>
{};
}
};
};
// this is ugly, only for 4d
// this is ugly, only for 4d
...
@@ -121,28 +173,6 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride
...
@@ -121,28 +173,6 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride
return
ConstantTensorDescriptor
<
Lengths
,
Strides
>
{};
return
ConstantTensorDescriptor
<
Lengths
,
Strides
>
{};
}
}
// this is ugly, only for 4d
template
<
class
Desc
,
class
Reorder
>
__host__
__device__
constexpr
auto
get_reordered_4d_tensor_descriptor
(
Desc
,
Reorder
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
IT0
=
Reorder
{}.
GetNumber
(
I0
);
constexpr
auto
IT1
=
Reorder
{}.
GetNumber
(
I1
);
constexpr
auto
IT2
=
Reorder
{}.
GetNumber
(
I2
);
constexpr
auto
IT3
=
Reorder
{}.
GetNumber
(
I3
);
constexpr
unsigned
L0
=
Desc
{}.
GetLength
(
IT0
);
constexpr
unsigned
L1
=
Desc
{}.
GetLength
(
IT1
);
constexpr
unsigned
L2
=
Desc
{}.
GetLength
(
IT2
);
constexpr
unsigned
L3
=
Desc
{}.
GetLength
(
IT3
);
return
make_ConstantTensorDescriptor
(
Sequence
<
L0
,
L1
,
L2
,
L3
>
{});
}
// this is ugly, only for 4d
// this is ugly, only for 4d
template
<
class
InDesc
,
class
WeiDesc
>
template
<
class
InDesc
,
class
WeiDesc
>
__host__
__device__
constexpr
auto
get_convolution_output_4d_tensor_descriptor
(
InDesc
,
WeiDesc
)
__host__
__device__
constexpr
auto
get_convolution_output_4d_tensor_descriptor
(
InDesc
,
WeiDesc
)
...
...
src/include/gridwise_direct_convolution_1.cuh
View file @
ac1f62be
...
@@ -122,18 +122,13 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
...
@@ -122,18 +122,13 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
#endif
#endif
// set output tensor in LDS to 0
// set output tensor in LDS to 0
blockwise_4d_tensor_set_zero
<
TFloat
,
decltype
(
out_block_desc
),
BlockSize
>
(
out_block_desc
,
blockwise_4d_tensor_set_zero
<
BlockSize
>
(
out_block_desc
,
p_out_block
);
p_out_block
);
for
(
unsigned
c_block_work_begin
=
0
;
c_block_work_begin
<
in_global_desc
.
GetLength
(
I1
);
for
(
unsigned
c_block_work_begin
=
0
;
c_block_work_begin
<
in_global_desc
.
GetLength
(
I1
);
c_block_work_begin
+=
CPerBlock
)
c_block_work_begin
+=
CPerBlock
)
{
{
// copy input tensor to LDS
// copy input tensor to LDS
blockwise_4d_tensor_copy
<
TFloat
,
blockwise_4d_tensor_copy
<
BlockSize
>
(
in_block_global_desc
,
decltype
(
in_block_global_desc
),
decltype
(
in_block_desc
),
decltype
(
in_block_desc
),
BlockSize
>
(
in_block_global_desc
,
p_in_global
+
p_in_global
+
in_global_desc
.
Get1dIndex
(
n_block_work_begin
,
in_global_desc
.
Get1dIndex
(
n_block_work_begin
,
c_block_work_begin
,
c_block_work_begin
,
...
@@ -144,11 +139,7 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
...
@@ -144,11 +139,7 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
in_block_desc
);
in_block_desc
);
// copy weight tensor to LDS
// copy weight tensor to LDS
blockwise_4d_tensor_copy
<
TFloat
,
blockwise_4d_tensor_copy
<
BlockSize
>
(
decltype
(
wei_block_global_desc
),
decltype
(
wei_block_desc
),
decltype
(
wei_block_desc
),
BlockSize
>
(
wei_block_global_desc
,
wei_block_global_desc
,
p_wei_global
+
wei_global_desc
.
Get1dIndex
(
k_block_work_begin
,
c_block_work_begin
,
0
,
0
),
p_wei_global
+
wei_global_desc
.
Get1dIndex
(
k_block_work_begin
,
c_block_work_begin
,
0
,
0
),
wei_block_desc
,
wei_block_desc
,
...
@@ -158,7 +149,8 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
...
@@ -158,7 +149,8 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
__syncthreads
();
__syncthreads
();
// blockwise convolution
// blockwise convolution
blockwise_direct_convolution
<
TFloat
,
blockwise_direct_convolution
<
BlockSize
,
TFloat
,
decltype
(
in_block_desc
),
decltype
(
in_block_desc
),
decltype
(
wei_block_desc
),
decltype
(
wei_block_desc
),
decltype
(
out_block_desc
),
decltype
(
out_block_desc
),
...
@@ -166,19 +158,14 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
...
@@ -166,19 +158,14 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
OutTileSizeW
,
OutTileSizeW
,
NPerThread
,
NPerThread
,
KPerThread
,
KPerThread
,
CPerThread
,
CPerThread
>
(
BlockSize
>
(
in_block_desc
,
p_in_block
,
wei_block_desc
,
p_wei_block
,
out_block_desc
,
p_out_block
);
in_block_desc
,
p_in_block
,
wei_block_desc
,
p_wei_block
,
out_block_desc
,
p_out_block
);
__syncthreads
();
__syncthreads
();
}
}
// copy output tensor from LDS to device mem
// copy output tensor from LDS to device mem
blockwise_4d_tensor_copy
<
TFloat
,
blockwise_4d_tensor_copy
<
BlockSize
>
(
decltype
(
out_block_desc
),
decltype
(
out_block_global_desc
),
decltype
(
out_block_desc
),
BlockSize
>
(
out_block_desc
,
out_block_desc
,
p_out_block
,
p_out_block
,
out_block_global_desc
,
out_block_global_desc
,
...
...
src/include/gridwise_direct_convolution_2.cuh
View file @
ac1f62be
...
@@ -151,11 +151,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
...
@@ -151,11 +151,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
c_block_data_begin
+=
CPerBlock
,
__syncthreads
())
c_block_data_begin
+=
CPerBlock
,
__syncthreads
())
{
{
// copy input tensor to LDS
// copy input tensor to LDS
blockwise_4d_tensor_copy
<
TFloat
,
blockwise_4d_tensor_copy
<
BlockSize
>
(
in_global_desc
,
decltype
(
in_global_desc
),
decltype
(
in_block_desc
),
decltype
(
in_block_desc
),
BlockSize
>
(
in_global_desc
,
p_in_global
+
p_in_global
+
in_global_desc
.
Get1dIndex
(
n_block_data_begin
,
in_global_desc
.
Get1dIndex
(
n_block_data_begin
,
c_block_data_begin
,
c_block_data_begin
,
...
@@ -166,11 +162,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
...
@@ -166,11 +162,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
in_block_desc
);
in_block_desc
);
// copy weight tensor to LDS
// copy weight tensor to LDS
blockwise_4d_tensor_copy
<
TFloat
,
blockwise_4d_tensor_copy
<
BlockSize
>
(
decltype
(
wei_global_desc
),
decltype
(
wei_block_desc
),
decltype
(
wei_block_desc
),
BlockSize
>
(
wei_global_desc
,
wei_global_desc
,
p_wei_global
+
wei_global_desc
.
Get1dIndex
(
k_block_data_begin
,
c_block_data_begin
,
0
,
0
),
p_wei_global
+
wei_global_desc
.
Get1dIndex
(
k_block_data_begin
,
c_block_data_begin
,
0
,
0
),
wei_block_desc
,
wei_block_desc
,
...
@@ -182,7 +174,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
...
@@ -182,7 +174,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
for
(
unsigned
c_thread_data
=
0
;
c_thread_data
<
CPerBlock
;
c_thread_data
+=
CPerThread
)
for
(
unsigned
c_thread_data
=
0
;
c_thread_data
<
CPerBlock
;
c_thread_data
+=
CPerThread
)
{
{
// threadwise convolution
// threadwise convolution
#if
0
#if
1
threadwise_direct_convolution_2
(
threadwise_direct_convolution_2
(
in_thread_block_desc
,
in_thread_block_desc
,
p_in_block
+
in_block_desc
.
Get1dIndex
(
n_thread_data_begin
,
p_in_block
+
in_block_desc
.
Get1dIndex
(
n_thread_data_begin
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment