Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
21c91816
Commit
21c91816
authored
Jan 04, 2019
by
Chao Liu
Browse files
added blockwise tensor reorder operation
parent
057c10e5
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
129 additions
and
51 deletions
+129
-51
driver/conv.cu
driver/conv.cu
+4
-1
src/include/blockwise_direct_convolution.cuh
src/include/blockwise_direct_convolution.cuh
+1
-1
src/include/blockwise_tensor_op.cuh
src/include/blockwise_tensor_op.cuh
+92
-47
src/include/constant_tensor_descriptor.cuh
src/include/constant_tensor_descriptor.cuh
+31
-1
src/include/gridwise_direct_convolution_2.cuh
src/include/gridwise_direct_convolution_2.cuh
+1
-1
No files found.
driver/conv.cu
View file @
21c91816
...
@@ -7,6 +7,7 @@
...
@@ -7,6 +7,7 @@
#include "constant_tensor_descriptor.cuh"
#include "constant_tensor_descriptor.cuh"
#include "device_direct_convolution_1.cuh"
#include "device_direct_convolution_1.cuh"
#include "device_direct_convolution_2.cuh"
#include "device_direct_convolution_2.cuh"
//#include "device_implicit_gemm_convolution.cuh"
//#include "device_winograd_convolution.cuh"
//#include "device_winograd_convolution.cuh"
struct
GeneratorTensor_1
struct
GeneratorTensor_1
...
@@ -366,7 +367,7 @@ int main()
...
@@ -366,7 +367,7 @@ int main()
auto
in_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
auto
in_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
auto
wei_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
K
,
C
,
S
,
R
>
{});
auto
wei_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
K
,
C
,
S
,
R
>
{});
auto
out_desc
=
get_output_4d_tensor_descriptor
(
in_desc
,
wei_desc
);
auto
out_desc
=
get_
convolution_
output_4d_tensor_descriptor
(
in_desc
,
wei_desc
);
ostream_ConstantTensorDescriptor
(
in_desc
,
std
::
cout
<<
"in_desc: "
);
ostream_ConstantTensorDescriptor
(
in_desc
,
std
::
cout
<<
"in_desc: "
);
ostream_ConstantTensorDescriptor
(
wei_desc
,
std
::
cout
<<
"wei_desc: "
);
ostream_ConstantTensorDescriptor
(
wei_desc
,
std
::
cout
<<
"wei_desc: "
);
...
@@ -393,6 +394,8 @@ int main()
...
@@ -393,6 +394,8 @@ int main()
device_direct_convolution_1(in_desc, in, wei_desc, wei, out_desc, out_device);
device_direct_convolution_1(in_desc, in, wei_desc, wei, out_desc, out_device);
#elif
1
#elif
1
device_direct_convolution_2
(
in_desc
,
in
,
wei_desc
,
wei
,
out_desc
,
out_device
);
device_direct_convolution_2
(
in_desc
,
in
,
wei_desc
,
wei
,
out_desc
,
out_device
);
#elif 0
device_implicit_gemm_convolution
(
in_desc
,
in
,
wei_desc
,
wei
,
out_desc
,
out_device
);
#elif 0
#elif 0
device_winograd_convolution
(
in_desc
,
in
,
wei_desc
,
wei
,
out_desc
,
out_device
);
device_winograd_convolution
(
in_desc
,
in
,
wei_desc
,
wei
,
out_desc
,
out_device
);
#endif
#endif
...
...
src/include/blockwise_direct_convolution.cuh
View file @
21c91816
...
@@ -59,7 +59,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
...
@@ -59,7 +59,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
make_ConstantTensorDescriptor
(
Sequence
<
KPerThread
,
CPerThread
,
S
,
R
>
{});
make_ConstantTensorDescriptor
(
Sequence
<
KPerThread
,
CPerThread
,
S
,
R
>
{});
constexpr
auto
out_thread_desc
=
constexpr
auto
out_thread_desc
=
get_output_4d_tensor_descriptor
(
in_thread_desc
,
wei_thread_desc
);
get_
convolution_
output_4d_tensor_descriptor
(
in_thread_desc
,
wei_thread_desc
);
constexpr
auto
in_thread_block_desc
=
constexpr
auto
in_thread_block_desc
=
make_ConstantTensorDescriptor
(
in_thread_desc
.
GetLengths
(),
in_block_desc
.
GetStrides
());
make_ConstantTensorDescriptor
(
in_thread_desc
.
GetLengths
(),
in_block_desc
.
GetStrides
());
...
...
src/include/blockwise_tensor_op.cuh
View file @
21c91816
...
@@ -2,7 +2,8 @@
...
@@ -2,7 +2,8 @@
#include "constant_tensor_descriptor.cuh"
#include "constant_tensor_descriptor.cuh"
template
<
class
TFloat
,
class
DstDesc
,
class
F
,
unsigned
BlockSize
>
template
<
class
TFloat
,
class
DstDesc
,
class
F
,
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_pointwise_op_unary
(
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
__device__
void
blockwise_4d_tensor_pointwise_operation_unary
(
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
@@ -75,82 +76,94 @@ __device__ void blockwise_4d_tensor_pointwise_op_unary(DstDesc, TFloat* __restri
...
@@ -75,82 +76,94 @@ __device__ void blockwise_4d_tensor_pointwise_op_unary(DstDesc, TFloat* __restri
}
}
}
}
template
<
class
TFloat
,
class
DescA
,
class
DescB
,
class
DescRef
,
class
F
,
unsigned
BlockSize
>
template
<
class
TFloat
,
__device__
void
blockwise_4d_tensor_pointwise_op_binary
(
class
SrcDesc
,
DescA
,
TFloat
*
const
__restrict__
p_a
,
DescB
,
TFloat
*
__restrict__
p_b
,
DescRef
,
F
f
)
class
DstDesc
,
class
RefDesc
,
class
Reorder
,
class
F
,
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_pointwise_operation_binary_reorder
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
RefDesc
,
Reorder
,
F
f
)
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
desc_a
=
DescA
{};
constexpr
unsigned
IT0
=
Reorder
{}.
Get
(
I0
);
constexpr
auto
desc_b
=
DescB
{};
constexpr
unsigned
IT1
=
Reorder
{}.
Get
(
I1
);
constexpr
auto
desc_ref
=
DescRef
{};
constexpr
unsigned
IT2
=
Reorder
{}.
Get
(
I2
);
constexpr
unsigned
IT3
=
Reorder
{}.
Get
(
I3
);
#if 0
constexpr
auto
src_desc
=
SrcDesc
{};
if(threadIdx.x == 0)
constexpr
auto
dst_desc
=
DstDesc
{};
{
constexpr
auto
ref_desc
=
RefDesc
{};
print_ConstantTensorDescriptor(desc_a, "blockwise_4d_tensor_op_binary: desc_a: ");
print_ConstantTensorDescriptor(desc_b, "blockwise_4d_tensor_op_binary: desc_b: ");
print_ConstantTensorDescriptor(desc_ref, "blockwise_4d_tensor_op_binary: desc_ref: ");
}
#endif
constexpr
unsigned
NLoop
=
desc
_ref
.
GetElementSize
()
/
BlockSize
;
constexpr
unsigned
NLoop
=
ref_
desc
.
GetElementSize
()
/
BlockSize
;
for
(
unsigned
iloop
=
0
;
iloop
<
NLoop
;
++
iloop
)
for
(
unsigned
iloop
=
0
;
iloop
<
NLoop
;
++
iloop
)
{
{
unsigned
is
=
threadIdx
.
x
+
iloop
*
BlockSize
;
unsigned
is
=
threadIdx
.
x
+
iloop
*
BlockSize
;
const
unsigned
did0
=
is
/
desc_ref
.
GetStride
(
I0
);
unsigned
did
[
4
];
did
[
0
]
=
is
/
ref_desc
.
GetStride
(
I0
);
is
-=
did
0
*
desc_ref
.
GetStride
(
I0
);
is
-=
did
[
0
]
*
ref_desc
.
GetStride
(
I0
);
const
unsigned
did1
=
is
/
desc_ref
.
GetStride
(
I1
);
did
[
1
]
=
is
/
ref_desc
.
GetStride
(
I1
);
is
-=
did
1
*
desc_ref
.
GetStride
(
I1
);
is
-=
did
[
1
]
*
ref_desc
.
GetStride
(
I1
);
const
unsigned
did2
=
is
/
desc_ref
.
GetStride
(
I2
);
did
[
2
]
=
is
/
ref_desc
.
GetStride
(
I2
);
is
-=
did
2
*
desc_ref
.
GetStride
(
I2
);
is
-=
did
[
2
]
*
ref_desc
.
GetStride
(
I2
);
const
unsigned
did3
=
is
/
desc_ref
.
GetStride
(
I3
);
did
[
3
]
=
is
/
ref_desc
.
GetStride
(
I3
);
const
unsigned
aindex
=
desc
_a
.
Get1dIndex
(
did
0
,
did
1
,
did
2
,
did
3
);
const
unsigned
aindex
=
src_
desc
.
Get1dIndex
(
did
[
0
]
,
did
[
1
]
,
did
[
2
]
,
did
[
3
]
);
const
unsigned
bindex
=
desc
_b
.
Get1dIndex
(
did
0
,
did1
,
did2
,
did3
);
const
unsigned
bindex
=
dst_
desc
.
Get1dIndex
(
did
[
IT0
],
did
[
IT1
],
did
[
IT2
],
did
[
IT3
]
);
f
(
p_
a
[
aindex
],
p_
b
[
bindex
]);
f
(
p_
src
[
aindex
],
p_
dst
[
bindex
]);
}
}
constexpr
bool
has_tail
=
(
desc
_ref
.
GetElementSize
()
>
NLoop
*
BlockSize
);
constexpr
bool
has_tail
=
(
ref_
desc
.
GetElementSize
()
>
NLoop
*
BlockSize
);
if
(
has_tail
)
if
(
has_tail
)
{
{
unsigned
is
=
threadIdx
.
x
+
NLoop
*
BlockSize
;
unsigned
is
=
threadIdx
.
x
+
NLoop
*
BlockSize
;
if
(
is
<
desc
_ref
.
GetElementSize
())
if
(
is
<
ref_
desc
.
GetElementSize
())
{
{
const
unsigned
did
0
=
is
/
desc_ref
.
GetStride
(
I0
)
;
unsigned
did
[
4
]
;
is
-=
did0
*
desc_ref
.
GetStride
(
I0
);
did
[
0
]
=
is
/
ref_desc
.
GetStride
(
I0
);
const
unsigned
did1
=
is
/
desc_ref
.
GetStride
(
I
1
);
is
-=
did
[
0
]
*
ref_desc
.
GetStride
(
I
0
);
is
-=
did1
*
desc_ref
.
GetStride
(
I1
);
did
[
1
]
=
is
/
ref_desc
.
GetStride
(
I1
);
const
unsigned
did2
=
is
/
desc_ref
.
GetStride
(
I
2
);
is
-=
did
[
1
]
*
ref_desc
.
GetStride
(
I
1
);
is
-=
did2
*
desc_ref
.
GetStride
(
I2
);
did
[
2
]
=
is
/
ref_desc
.
GetStride
(
I2
);
const
unsigned
did3
=
is
/
desc_ref
.
GetStride
(
I
3
);
is
-=
did
[
2
]
*
ref_desc
.
GetStride
(
I
2
);
const
unsigned
aindex
=
desc
_a
.
Get
1dIndex
(
did0
,
did1
,
did2
,
did
3
);
did
[
3
]
=
is
/
ref_
desc
.
Get
Stride
(
I
3
);
const
unsigned
b
index
=
desc
_b
.
Get1dIndex
(
did
0
,
did
1
,
did
2
,
did
3
);
const
unsigned
a
index
=
src_
desc
.
Get1dIndex
(
did
[
0
]
,
did
[
1
]
,
did
[
2
]
,
did
[
3
]
);
f
(
p_a
[
aindex
],
p_b
[
bindex
]);
const
unsigned
bindex
=
dst_desc
.
Get1dIndex
(
did
[
IT0
],
did
[
IT1
],
did
[
IT2
],
did
[
IT3
]);
f
(
p_src
[
aindex
],
p_dst
[
bindex
]);
}
}
}
}
}
}
...
@@ -160,21 +173,53 @@ __device__ void blockwise_4d_tensor_set_zero(DstDesc, TFloat* __restrict__ p_dst
...
@@ -160,21 +173,53 @@ __device__ void blockwise_4d_tensor_set_zero(DstDesc, TFloat* __restrict__ p_dst
{
{
auto
f_set_zero
=
[](
TFloat
&
v
)
{
v
=
TFloat
(
0
);
};
auto
f_set_zero
=
[](
TFloat
&
v
)
{
v
=
TFloat
(
0
);
};
blockwise_4d_tensor_pointwise_op_unary
<
TFloat
,
DstDesc
,
decltype
(
f_set_zero
),
BlockSize
>
(
blockwise_4d_tensor_pointwise_op
eration
_unary
<
TFloat
,
DstDesc
,
decltype
(
f_set_zero
),
BlockSize
>
(
DstDesc
{},
p_dst
,
f_set_zero
);
DstDesc
{},
p_dst
,
f_set_zero
);
}
}
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
class
RefDesc
,
class
Reorder
,
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_copy_reorder
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
RefDesc
,
Reorder
)
{
auto
f_copy
=
[](
const
TFloat
&
src
,
TFloat
&
dst
)
{
dst
=
src
;
};
blockwise_4d_tensor_pointwise_operation_binary_reorder
<
TFloat
,
SrcDesc
,
DstDesc
,
RefDesc
,
Reorder
,
decltype
(
f_copy
),
BlockSize
>
(
SrcDesc
{},
p_src
,
DstDesc
{},
p_dst
,
RefDesc
{},
Reorder
{},
f_copy
);
}
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
class
RefDesc
,
unsigned
BlockSize
>
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
class
RefDesc
,
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_copy
(
__device__
void
blockwise_4d_tensor_copy
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
RefDesc
)
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
RefDesc
)
{
{
auto
f_copy
=
[](
const
TFloat
&
src
,
TFloat
&
dst
)
{
dst
=
src
;
};
constexpr
auto
reorder
=
Sequence
<
0
,
1
,
2
,
3
>
{
};
blockwise_4d_tensor_
pointwise_op_binary
<
TFloat
,
blockwise_4d_tensor_
copy_reorder
<
TFloat
,
SrcDesc
,
SrcDesc
,
DstDesc
,
DstDesc
,
RefDesc
,
RefDesc
,
decltype
(
f_copy
),
decltype
(
reorder
),
BlockSize
>
(
BlockSize
>
(
SrcDesc
{},
p_src
,
DstDesc
{},
p_dst
,
RefDesc
{},
f_copy
);
SrcDesc
{},
p_src
,
DstDesc
{},
p_dst
,
RefDesc
{},
reorder
);
}
template
<
class
TFloat
,
class
ImDesc
,
class
WDesc
,
class
ColDesc
,
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_im2col
(
ImDesc
,
const
__restrict__
TFloat
*
p_im
,
WDesc
,
ColDesc
,
__restrict__
TFloat
*
p_col
)
{
// do nothing
}
}
src/include/constant_tensor_descriptor.cuh
View file @
21c91816
...
@@ -22,6 +22,14 @@ struct Sequence
...
@@ -22,6 +22,14 @@ struct Sequence
{
{
return
mData
[
I
];
return
mData
[
I
];
}
}
template
<
unsigned
I
>
__host__
__device__
constexpr
auto
GetNumber
(
Number
<
I
>
)
const
{
constexpr
unsigned
N
=
Get
(
I
);
return
Number
<
N
>
{};
}
};
};
template
<
class
Lengths
,
class
Strides
>
template
<
class
Lengths
,
class
Strides
>
...
@@ -113,9 +121,31 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride
...
@@ -113,9 +121,31 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride
return
ConstantTensorDescriptor
<
Lengths
,
Strides
>
{};
return
ConstantTensorDescriptor
<
Lengths
,
Strides
>
{};
}
}
// this is ugly, only for 4d
template
<
class
Desc
,
class
Reorder
>
__host__
__device__
constexpr
auto
get_reordered_4d_tensor_descriptor
(
Desc
,
Reorder
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
IT0
=
Reorder
{}.
GetNumber
(
I0
);
constexpr
auto
IT1
=
Reorder
{}.
GetNumber
(
I1
);
constexpr
auto
IT2
=
Reorder
{}.
GetNumber
(
I2
);
constexpr
auto
IT3
=
Reorder
{}.
GetNumber
(
I3
);
constexpr
unsigned
L0
=
Desc
{}.
GetLength
(
IT0
);
constexpr
unsigned
L1
=
Desc
{}.
GetLength
(
IT1
);
constexpr
unsigned
L2
=
Desc
{}.
GetLength
(
IT2
);
constexpr
unsigned
L3
=
Desc
{}.
GetLength
(
IT3
);
return
make_ConstantTensorDescriptor
(
Sequence
<
L0
,
L1
,
L2
,
L3
>
{});
}
// this is ugly, only for 4d
// this is ugly, only for 4d
template
<
class
InDesc
,
class
WeiDesc
>
template
<
class
InDesc
,
class
WeiDesc
>
__host__
__device__
constexpr
auto
get_output_4d_tensor_descriptor
(
InDesc
,
WeiDesc
)
__host__
__device__
constexpr
auto
get_
convolution_
output_4d_tensor_descriptor
(
InDesc
,
WeiDesc
)
{
{
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
...
...
src/include/gridwise_direct_convolution_2.cuh
View file @
21c91816
...
@@ -70,7 +70,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
...
@@ -70,7 +70,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
Sequence
<
KPerThread
,
CPerThread
,
S
,
R
>
{},
wei_block_desc
.
GetStrides
());
Sequence
<
KPerThread
,
CPerThread
,
S
,
R
>
{},
wei_block_desc
.
GetStrides
());
constexpr
auto
out_thread_desc
=
constexpr
auto
out_thread_desc
=
get_output_4d_tensor_descriptor
(
in_thread_block_desc
,
wei_thread_block_desc
);
get_
convolution_
output_4d_tensor_descriptor
(
in_thread_block_desc
,
wei_thread_block_desc
);
// register
// register
TFloat
p_out_thread
[
out_thread_desc
.
GetElementSpace
()];
TFloat
p_out_thread
[
out_thread_desc
.
GetElementSpace
()];
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment