yangql / composable_kernel-1

Commit e7b8705b
authored Jan 15, 2019 by Chao Liu

adding implicit gemm

parent 84d9802d
Showing 10 changed files with 510 additions and 231 deletions
driver/conv.cu                                                 +86  -68
driver/device_implicit_gemm_convolution.cuh                    +42  -23
src/include/ConstantTensorDescriptor.cuh                       +23  -17
src/include/blockwise_tensor_op.cuh                            +33  -28
src/include/common.cuh                                         +14  -32
src/include/gridwise_direct_convolution_2.cuh                   +3   -3
src/include/gridwise_implicit_gemm_convolution_nchw_kcsr.cuh   +43  -21
src/include/gridwise_implicit_gemm_convolution_nchw_srck.cuh  +219   -0
src/include/threadwise_direct_convolution.cuh                  +12   -9
src/include/threadwise_tensor_op.cuh                           +35  -30
driver/conv.cu
@@ -85,19 +85,19 @@ auto make_TensorDescriptor(TConstTensorDesc)
 }
 
 template <class T>
-void host_direct_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& out)
+void host_direct_convolution(const Tensor<T>& in_nchw, const Tensor<T>& wei_kcsr, Tensor<T>& out)
 {
     auto f = [&](auto n, auto k, auto ho, auto wo) {
         double v = 0;
-        for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
+        for(int c = 0; c < wei_kcsr.mDesc.GetLengths()[1]; ++c)
         {
-            for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
+            for(int y = 0; y < wei_kcsr.mDesc.GetLengths()[2]; ++y)
             {
                 int hi = ho + y;
-                for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
+                for(int x = 0; x < wei_kcsr.mDesc.GetLengths()[3]; ++x)
                 {
                     int wi = wo + x;
-                    v += in(n, c, hi, wi) * wei(k, c, y, x);
+                    v += in_nchw(n, c, hi, wi) * wei_kcsr(k, c, y, x);
                 }
             }
         }
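Note on the hunk above: the rename only makes the assumed layouts explicit (input is NCHW, weight is KCSR); the loop nest is the plain direct-convolution reference (stride 1, no padding). In math form (notation added here, not from the source):

$$\text{out}(n,k,h_o,w_o)=\sum_{c=0}^{C-1}\sum_{y=0}^{S-1}\sum_{x=0}^{R-1}\text{in\_nchw}(n,c,h_o+y,w_o+x)\cdot\text{wei\_kcsr}(k,c,y,x)$$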
@@ -114,19 +114,21 @@ void host_direct_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T
 }
 
 template <class T>
-void host_winograd_3x3_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& out)
+void host_winograd_3x3_convolution(const Tensor<T>& in_nchw, const Tensor<T>& wei_kcsr, Tensor<T>& out)
 {
     constexpr std::size_t OutTileSizeH = 2;
     constexpr std::size_t OutTileSizeW = 2;
 
-    std::size_t N  = in.mDesc.GetLengths()[0];
-    std::size_t C  = in.mDesc.GetLengths()[1];
-    std::size_t HI = in.mDesc.GetLengths()[2];
-    std::size_t WI = in.mDesc.GetLengths()[3];
-    std::size_t K  = wei.mDesc.GetLengths()[0];
-    std::size_t S  = wei.mDesc.GetLengths()[2];
-    std::size_t R  = wei.mDesc.GetLengths()[3];
+    std::size_t N  = in_nchw.mDesc.GetLengths()[0];
+    std::size_t C  = in_nchw.mDesc.GetLengths()[1];
+    std::size_t HI = in_nchw.mDesc.GetLengths()[2];
+    std::size_t WI = in_nchw.mDesc.GetLengths()[3];
+    std::size_t K  = wei_kcsr.mDesc.GetLengths()[0];
+    std::size_t S  = wei_kcsr.mDesc.GetLengths()[2];
+    std::size_t R  = wei_kcsr.mDesc.GetLengths()[3];
     std::size_t HO = out.mDesc.GetLengths()[2];
     std::size_t WO = out.mDesc.GetLengths()[3];
@@ -150,7 +152,7 @@ void host_winograd_3x3_convolution(const Tensor<T>& in, const Tensor<T>& wei, Te
                 for(int i = 0; i < InTileSizeW; ++i)
                 {
                     std::size_t wi = OutTileSizeW * x + i;
-                    in_hold(n, c, y, x, j, i) = in(n, c, hi, wi);
+                    in_hold(n, c, y, x, j, i) = in_nchw(n, c, hi, wi);
                 }
             }
         };
@@ -194,45 +196,49 @@ void host_winograd_3x3_convolution(const Tensor<T>& in, const Tensor<T>& wei, Te
     };
 
     auto f_wei_transform = [&](auto k, auto c) {
-        wei_transform(k, c, 0, 0) = wei(k, c, 0, 0);
+        wei_transform(k, c, 0, 0) = wei_kcsr(k, c, 0, 0);
         wei_transform(k, c, 0, 1) =
-            0.5 * wei(k, c, 0, 0) + 0.5 * wei(k, c, 0, 1) + 0.5 * wei(k, c, 0, 2);
+            0.5 * wei_kcsr(k, c, 0, 0) + 0.5 * wei_kcsr(k, c, 0, 1) + 0.5 * wei_kcsr(k, c, 0, 2);
         wei_transform(k, c, 0, 2) =
-            0.5 * wei(k, c, 0, 0) - 0.5 * wei(k, c, 0, 1) + 0.5 * wei(k, c, 0, 2);
+            0.5 * wei_kcsr(k, c, 0, 0) - 0.5 * wei_kcsr(k, c, 0, 1) + 0.5 * wei_kcsr(k, c, 0, 2);
-        wei_transform(k, c, 0, 3) = wei(k, c, 0, 2);
+        wei_transform(k, c, 0, 3) = wei_kcsr(k, c, 0, 2);
 
         wei_transform(k, c, 1, 0) =
-            0.5 * wei(k, c, 0, 0) + 0.5 * wei(k, c, 1, 0) + 0.5 * wei(k, c, 2, 0);
+            0.5 * wei_kcsr(k, c, 0, 0) + 0.5 * wei_kcsr(k, c, 1, 0) + 0.5 * wei_kcsr(k, c, 2, 0);
         wei_transform(k, c, 1, 1) =
-            0.25 * wei(k, c, 0, 0) + 0.25 * wei(k, c, 0, 1) + 0.25 * wei(k, c, 0, 2) +
-            0.25 * wei(k, c, 1, 0) + 0.25 * wei(k, c, 1, 1) + 0.25 * wei(k, c, 1, 2) +
-            0.25 * wei(k, c, 2, 0) + 0.25 * wei(k, c, 2, 1) + 0.25 * wei(k, c, 2, 2);
+            0.25 * wei_kcsr(k, c, 0, 0) + 0.25 * wei_kcsr(k, c, 0, 1) + 0.25 * wei_kcsr(k, c, 0, 2) +
+            0.25 * wei_kcsr(k, c, 1, 0) + 0.25 * wei_kcsr(k, c, 1, 1) + 0.25 * wei_kcsr(k, c, 1, 2) +
+            0.25 * wei_kcsr(k, c, 2, 0) + 0.25 * wei_kcsr(k, c, 2, 1) + 0.25 * wei_kcsr(k, c, 2, 2);
         wei_transform(k, c, 1, 2) =
-            0.25 * wei(k, c, 0, 0) - 0.25 * wei(k, c, 0, 1) + 0.25 * wei(k, c, 0, 2) +
-            0.25 * wei(k, c, 1, 0) - 0.25 * wei(k, c, 1, 1) + 0.25 * wei(k, c, 1, 2) +
-            0.25 * wei(k, c, 2, 0) - 0.25 * wei(k, c, 2, 1) + 0.25 * wei(k, c, 2, 2);
+            0.25 * wei_kcsr(k, c, 0, 0) - 0.25 * wei_kcsr(k, c, 0, 1) + 0.25 * wei_kcsr(k, c, 0, 2) +
+            0.25 * wei_kcsr(k, c, 1, 0) - 0.25 * wei_kcsr(k, c, 1, 1) + 0.25 * wei_kcsr(k, c, 1, 2) +
+            0.25 * wei_kcsr(k, c, 2, 0) - 0.25 * wei_kcsr(k, c, 2, 1) + 0.25 * wei_kcsr(k, c, 2, 2);
         wei_transform(k, c, 1, 3) =
-            0.5 * wei(k, c, 0, 2) + 0.5 * wei(k, c, 1, 2) + 0.5 * wei(k, c, 2, 2);
+            0.5 * wei_kcsr(k, c, 0, 2) + 0.5 * wei_kcsr(k, c, 1, 2) + 0.5 * wei_kcsr(k, c, 2, 2);
 
         wei_transform(k, c, 2, 0) =
-            0.5 * wei(k, c, 0, 0) - 0.5 * wei(k, c, 1, 0) + 0.5 * wei(k, c, 2, 0);
+            0.5 * wei_kcsr(k, c, 0, 0) - 0.5 * wei_kcsr(k, c, 1, 0) + 0.5 * wei_kcsr(k, c, 2, 0);
         wei_transform(k, c, 2, 1) =
-            0.25 * wei(k, c, 0, 0) + 0.25 * wei(k, c, 0, 1) + 0.25 * wei(k, c, 0, 2) -
-            0.25 * wei(k, c, 1, 0) - 0.25 * wei(k, c, 1, 1) - 0.25 * wei(k, c, 1, 2) +
-            0.25 * wei(k, c, 2, 0) + 0.25 * wei(k, c, 2, 1) + 0.25 * wei(k, c, 2, 2);
+            0.25 * wei_kcsr(k, c, 0, 0) + 0.25 * wei_kcsr(k, c, 0, 1) + 0.25 * wei_kcsr(k, c, 0, 2) -
+            0.25 * wei_kcsr(k, c, 1, 0) - 0.25 * wei_kcsr(k, c, 1, 1) - 0.25 * wei_kcsr(k, c, 1, 2) +
+            0.25 * wei_kcsr(k, c, 2, 0) + 0.25 * wei_kcsr(k, c, 2, 1) + 0.25 * wei_kcsr(k, c, 2, 2);
         wei_transform(k, c, 2, 2) =
-            0.25 * wei(k, c, 0, 0) - 0.25 * wei(k, c, 0, 1) + 0.25 * wei(k, c, 0, 2) -
-            0.25 * wei(k, c, 1, 0) + 0.25 * wei(k, c, 1, 1) - 0.25 * wei(k, c, 1, 2) +
-            0.25 * wei(k, c, 2, 0) - 0.25 * wei(k, c, 2, 1) + 0.25 * wei(k, c, 2, 2);
+            0.25 * wei_kcsr(k, c, 0, 0) - 0.25 * wei_kcsr(k, c, 0, 1) + 0.25 * wei_kcsr(k, c, 0, 2) -
+            0.25 * wei_kcsr(k, c, 1, 0) + 0.25 * wei_kcsr(k, c, 1, 1) - 0.25 * wei_kcsr(k, c, 1, 2) +
+            0.25 * wei_kcsr(k, c, 2, 0) - 0.25 * wei_kcsr(k, c, 2, 1) + 0.25 * wei_kcsr(k, c, 2, 2);
         wei_transform(k, c, 2, 3) =
-            0.5 * wei(k, c, 0, 2) - 0.5 * wei(k, c, 1, 2) + 0.5 * wei(k, c, 2, 2);
+            0.5 * wei_kcsr(k, c, 0, 2) - 0.5 * wei_kcsr(k, c, 1, 2) + 0.5 * wei_kcsr(k, c, 2, 2);
 
-        wei_transform(k, c, 3, 0) = wei(k, c, 2, 0);
+        wei_transform(k, c, 3, 0) = wei_kcsr(k, c, 2, 0);
         wei_transform(k, c, 3, 1) =
-            0.5 * wei(k, c, 2, 0) + 0.5 * wei(k, c, 2, 1) + 0.5 * wei(k, c, 2, 2);
+            0.5 * wei_kcsr(k, c, 2, 0) + 0.5 * wei_kcsr(k, c, 2, 1) + 0.5 * wei_kcsr(k, c, 2, 2);
         wei_transform(k, c, 3, 2) =
-            0.5 * wei(k, c, 2, 0) - 0.5 * wei(k, c, 2, 1) + 0.5 * wei(k, c, 2, 2);
+            0.5 * wei_kcsr(k, c, 2, 0) - 0.5 * wei_kcsr(k, c, 2, 1) + 0.5 * wei_kcsr(k, c, 2, 2);
-        wei_transform(k, c, 3, 3) = wei(k, c, 2, 2);
+        wei_transform(k, c, 3, 3) = wei_kcsr(k, c, 2, 2);
     };
 
     auto f_out_transform = [&](auto n, auto k, auto y, auto x) {
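Note on f_wei_transform: the 0.5/0.25 coefficient pattern above is the standard Winograd F(2x2, 3x3) filter transform. Written as a matrix product (notation added here for clarity, not from the source), each 3x3 filter slice g = wei_kcsr(k, c, :, :) maps to the 4x4 tile wei_transform(k, c, :, :) via

$$U = G\,g\,G^{\mathsf{T}},\qquad G=\begin{pmatrix}1&0&0\\ \tfrac12&\tfrac12&\tfrac12\\ \tfrac12&-\tfrac12&\tfrac12\\ 0&0&1\end{pmatrix}$$

Expanding U(1,1), for instance, reproduces the nine 0.25 terms assigned to wei_transform(k, c, 1, 1) above.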
@@ -366,54 +372,66 @@ int main()
     constexpr unsigned R = 3;
 #endif
 
-    auto in_desc  = make_ConstantTensorDescriptor(Sequence<N, C, HI, WI>{});
-    auto wei_desc = make_ConstantTensorDescriptor(Sequence<K, C, S, R>{});
-    auto out_desc = get_convolution_output_default_4d_tensor_descriptor(in_desc, wei_desc);
+    auto in_nchw_desc  = make_ConstantTensorDescriptor(Sequence<N, C, HI, WI>{});
+    auto wei_kcsr_desc = make_ConstantTensorDescriptor(Sequence<K, C, S, R>{});
+    auto wei_srck_desc = make_ConstantTensorDescriptor(Sequence<S, R, C, K>{});
+    auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor(in_nchw_desc, wei_kcsr_desc);
 
-    ostream_ConstantTensorDescriptor(in_desc, std::cout << "in_desc: ");
-    ostream_ConstantTensorDescriptor(wei_desc, std::cout << "wei_desc: ");
-    ostream_ConstantTensorDescriptor(out_desc, std::cout << "out_desc: ");
+    ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
+    ostream_ConstantTensorDescriptor(wei_kcsr_desc, std::cout << "wei_kcsr_desc: ");
+    ostream_ConstantTensorDescriptor(wei_srck_desc, std::cout << "wei_srck_desc: ");
+    ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
 
-    Tensor<float> in(make_TensorDescriptor(in_desc));
-    Tensor<float> wei(make_TensorDescriptor(wei_desc));
-    Tensor<float> out_host(make_TensorDescriptor(out_desc));
-    Tensor<float> out_device(make_TensorDescriptor(out_desc));
+    Tensor<float> in_nchw(make_TensorDescriptor(in_nchw_desc));
+    Tensor<float> wei_kcsr(make_TensorDescriptor(wei_kcsr_desc));
+    Tensor<float> wei_srck(make_TensorDescriptor(wei_srck_desc));
+    Tensor<float> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
+    Tensor<float> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc));
 
 #if 0
     std::size_t num_thread = std::thread::hardware_concurrency();
 
-    in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-    wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+    in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+    wei_kcsr.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+    wei_srck.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #elif 1
     std::size_t num_thread = std::thread::hardware_concurrency();
 
-    in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-    wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+    in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+    wei_kcsr.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+    wei_srck.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
 #endif
 
     for(int i = 0; i < 40; ++i)
     {
 #if 0
-        device_direct_convolution_1(in_desc, in, wei_desc, wei, out_desc, out_device);
+        device_direct_convolution_1(in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device);
 #elif 0
-        device_direct_convolution_2(in_desc, in, wei_desc, wei, out_desc, out_device);
+        device_direct_convolution_2(in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device);
+#elif 0
+        device_implicit_gemm_convolution(in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device);
 #elif 1
-        device_implicit_gemm_convolution(in_desc, in, wei_desc, wei, out_desc, out_device);
+        device_implicit_gemm_convolution(in_nchw_desc, in_nchw, wei_srck_desc, wei_srck, out_nkhw_desc, out_nkhw_device);
 #elif 0
-        device_winograd_convolution(in_desc, in, wei_desc, wei, out_desc, out_device);
+        device_winograd_convolution(in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device);
 #endif
     }
 
 #if 1
-    host_winograd_3x3_convolution(in, wei, out_host);
-    check_error(out_host, out_device);
+    host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host);
+    check_error(out_nkhw_host, out_nkhw_device);
 #elif 0
-    host_direct_convolution(in, wei, out_host);
-    check_error(out_host, out_device);
+    host_direct_convolution(in_nchw, wei_kcsr, out_nkhw_host);
+    check_error(out_nkhw_host, out_nkhw_device);
 #endif
 
 #if 0
-    LogRange(std::cout << "in : ", in.mData, ",") << std::endl;
-    LogRange(std::cout << "wei: ", wei.mData, ",") << std::endl;
-    LogRange(std::cout << "out_host  : ", out_host.mData, ",") << std::endl;
-    LogRange(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
+    LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
+    LogRange(std::cout << "wei_kcsr: ", wei_kcsr.mData, ",") << std::endl;
+    LogRange(std::cout << "out_nkhw_host  : ", out_nkhw_host.mData, ",") << std::endl;
+    LogRange(std::cout << "out_nkhw_device: ", out_nkhw_device.mData, ",") << std::endl;
 #endif
 }
driver/device_implicit_gemm_convolution.cuh

 #pragma once
-#include "gridwise_implicit_gemm_convolution.cuh"
+#include "gridwise_implicit_gemm_convolution_nchw_kcsr.cuh"
+#include "gridwise_implicit_gemm_convolution_nchw_srck.cuh"
 
 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_implicit_gemm_convolution(
@@ -25,7 +26,7 @@ void device_implicit_gemm_convolution(
     constexpr auto wei_desc = WeiDesc{};
     constexpr auto out_desc = OutDesc{};
 
-#if 1
+#if 0
     constexpr unsigned NPerBlock = 2;
     constexpr unsigned KPerBlock = 64;
     constexpr unsigned CPerBlock = 4;
@@ -39,6 +40,20 @@ void device_implicit_gemm_convolution(
     constexpr unsigned WoPerThread = 4;
 
     constexpr unsigned BlockSize = 256;
+#elif 1
+    constexpr unsigned NPerBlock  = 2;
+    constexpr unsigned KPerBlock  = 32;
+    constexpr unsigned CPerBlock  = 4;
+    constexpr unsigned HoPerBlock = 2;
+    constexpr unsigned WoPerBlock = 32;
+
+    constexpr unsigned NPerThread  = 2;
+    constexpr unsigned KPerThread  = 4;
+    constexpr unsigned CPerThread  = 2;
+    constexpr unsigned HoPerThread = 1;
+    constexpr unsigned WoPerThread = 2;
+
+    constexpr unsigned BlockSize = 128;
 #endif
 
     constexpr unsigned GridSize =
@@ -56,7 +71,12 @@ void device_implicit_gemm_convolution(
     cudaEventCreate(&start);
     cudaEventRecord(start, 0);
 
-    gridwise_implicit_gemm_convolution_nchw_kcsr<GridSize,
+#if 0
+    gridwise_implicit_gemm_convolution_nchw_kcsr
+#elif 1
+    gridwise_implicit_gemm_convolution_nchw_srck
+#endif
+    <GridSize,
         BlockSize,
         T,
         InDesc,
@@ -70,8 +90,7 @@ void device_implicit_gemm_convolution(
         KPerThread,
         CPerThread,
         HoPerThread,
-        WoPerThread>
-        <<<grid_dim, block_dim>>>(InDesc{},
+        WoPerThread><<<grid_dim, block_dim>>>(InDesc{},
             static_cast<T*>(in_device_buf.GetDeviceBuffer()),
             WeiDesc{},
             static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
src/include/ConstantTensorDescriptor.cuh

 #pragma once
 
 #include "common.cuh"
 
+// this is ugly, only for 4d
+template <unsigned L0, unsigned L1, unsigned L2, unsigned L3>
+__host__ __device__ constexpr auto calculate_default_strides(Sequence<L0, L1, L2, L3>)
+{
+    return Sequence<L1 * L2 * L3, L2 * L3, L3, 1>{};
+}
+
+// this is ugly, only for 4d
+template <unsigned S0, unsigned S1, unsigned S2, unsigned S3>
+__host__ __device__ constexpr auto calculate_full_lengths(Sequence<S0, S1, S2, S3>)
+{
+    static_assert((S0 % S1 == 0) && (S1 % S2 == 0) && (S2 % S3 == 0), "cannot be evenly divided!");
+    return Sequence<1, S0 / S1, S1 / S2, S2 / S3>{};
+}
+
 template <class Lengths, class Strides>
 struct ConstantTensorDescriptor
 {
@@ -69,23 +85,13 @@ struct ConstantTensorDescriptor
         static_assert(nDim == 4, "nDim is not 4");
         return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3);
     }
-};
 
-// this is ugly, only for 4d
-template <unsigned L0, unsigned L1, unsigned L2, unsigned L3>
-__host__ __device__ constexpr auto calculate_default_strides(Sequence<L0, L1, L2, L3>)
-{
-    return Sequence<L1 * L2 * L3, L2 * L3, L3, 1>{};
-}
+    __host__ __device__ constexpr auto Condense() const
+    {
+        constexpr auto default_strides = calculate_default_strides(Lengths{});
+        return ConstantTensorDescriptor<Lengths, decltype(default_strides)>{};
+    }
+};
 
-// this is ugly, only for 4d
-template <unsigned S0, unsigned S1, unsigned S2, unsigned S3>
-__host__ __device__ constexpr auto calculate_full_lengths(Sequence<S0, S1, S2, S3>)
-{
-    static_assert((S0 % S1 == 0) && (S1 % S2 == 0) && (S2 % S3 == 0), "cannot be evenly divided!");
-    return Sequence<1, S0 / S1, S1 / S2, S2 / S3>{};
-}
 
 template <class Lengths>
 __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths)
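Note on the new Condense() member: it rebuilds a descriptor whose strides are the packed defaults for its lengths, via the calculate_default_strides helper this commit hoists above the struct (the helpers must now precede the struct, since Condense() calls one of them). A small usage sketch with made-up lengths, not taken from the diff:

    // calculate_default_strides(Sequence<2, 3, 4, 5>{}) yields Sequence<60, 20, 5, 1>,
    // the row-major packed strides of a 2x3x4x5 tensor.
    constexpr auto desc   = make_ConstantTensorDescriptor(Sequence<2, 3, 4, 5>{});
    constexpr auto packed = desc.Condense(); // same lengths, packed strides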
src/include/blockwise_tensor_op.cuh

@@ -83,16 +83,16 @@ template <unsigned BlockSize,
           class Float,
           class SrcDesc,
           class DstDesc,
-          class RefDesc,
-          class Reorder,
+          class SrcOpLengths,
+          class DstFromSrcReorder,
           class F>
 __device__ void
-blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
-                                                       Float* const __restrict__ p_src,
-                                                       DstDesc,
-                                                       Float* __restrict__ p_dst,
-                                                       RefDesc,
-                                                       Reorder,
-                                                       F f)
+blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src(SrcDesc,
+                                                                           Float* const __restrict__ p_src,
+                                                                           DstDesc,
+                                                                           Float* __restrict__ p_dst,
+                                                                           SrcOpLengths,
+                                                                           DstFromSrcReorder,
+                                                                           F f)
 {
     constexpr auto I0 = Number<0>{};
@@ -100,14 +100,14 @@ blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
     constexpr auto I2 = Number<2>{};
     constexpr auto I3 = Number<3>{};
 
-    constexpr unsigned IT0 = Reorder{}.Get(I0);
-    constexpr unsigned IT1 = Reorder{}.Get(I1);
-    constexpr unsigned IT2 = Reorder{}.Get(I2);
-    constexpr unsigned IT3 = Reorder{}.Get(I3);
+    constexpr unsigned IR0 = DstFromSrcReorder{}.Get(I0);
+    constexpr unsigned IR1 = DstFromSrcReorder{}.Get(I1);
+    constexpr unsigned IR2 = DstFromSrcReorder{}.Get(I2);
+    constexpr unsigned IR3 = DstFromSrcReorder{}.Get(I3);
 
     constexpr auto src_desc = SrcDesc{};
     constexpr auto dst_desc = DstDesc{};
-    constexpr auto ref_desc = RefDesc{};
+    constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});
 
     constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;
@@ -133,7 +133,7 @@ blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
         const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
-        const unsigned bindex = dst_desc.Get1dIndex(did[IT0], did[IT1], did[IT2], did[IT3]);
+        const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
 
         f(p_src[aindex], p_dst[bindex]);
     }
@@ -164,7 +164,7 @@ blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
         const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
-        const unsigned bindex = dst_desc.Get1dIndex(did[IT0], did[IT1], did[IT2], did[IT3]);
+        const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
 
         f(p_src[aindex], p_dst[bindex]);
     }
@@ -183,23 +183,28 @@ template <unsigned BlockSize,
           class Float,
           class SrcDesc,
           class DstDesc,
-          class RefDesc,
-          class Reorder>
-__device__ void blockwise_4d_tensor_copy_reorder(
-    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc, Reorder)
+          class SrcOpLengths,
+          class DstFromSrcReorder>
+__device__ void blockwise_4d_tensor_copy_reorder_by_get_dst_from_src(
+    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, SrcOpLengths, DstFromSrcReorder)
 {
     auto f_copy = [](const Float& src, Float& dst) { dst = src; };
 
-    blockwise_4d_tensor_pointwise_operation_binary_reorder<BlockSize>(
-        SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, Reorder{}, f_copy);
+    blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src<BlockSize>(
+        SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, DstFromSrcReorder{}, f_copy);
 }
 
-template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class RefDesc>
+template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
 __device__ void blockwise_4d_tensor_copy(
-    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc)
+    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, SrcOpLengths)
 {
-    constexpr auto reorder = Sequence<0, 1, 2, 3>{};
+    constexpr auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{};
 
-    blockwise_4d_tensor_copy_reorder<BlockSize>(
-        SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, reorder);
+    blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
+        SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, dst_from_src_reorder);
 }
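Note on the renames in this file: the `_by_get_dst_from_src` suffix pins down which way the permutation is read. For every source index tuple did[0..3], the destination element is addressed as dst(did[r0], did[r1], did[r2], did[r3]) with r = DstFromSrcReorder. An illustrative host-side analogue of that indexing rule (sample values, not repository code):

    #include <cstdio>
    int main()
    {
        // reorder_chwn_from_nchw = Sequence<1, 2, 3, 0> from this commit:
        // an element read at NCHW index did is written at CHWN index.
        const unsigned did[4] = {6, 2, 3, 4}; // (n, c, h, w)
        const unsigned r[4]   = {1, 2, 3, 0};
        std::printf("(%u, %u, %u, %u)\n", did[r[0]], did[r[1]], did[r[2]], did[r[3]]);
        // prints (2, 3, 4, 6): the same tuple re-read in C, H, W, N order
        return 0;
    }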
src/include/common.cuh

@@ -30,6 +30,8 @@ using Number = Constant<unsigned, N>;
 template <unsigned... Is>
 struct Sequence
 {
+    using Type = Sequence<Is...>;
+
     static constexpr unsigned nDim = sizeof...(Is);
 
     const unsigned mData[nDim] = {Is...};
@@ -40,44 +42,24 @@ struct Sequence
         return mData[I];
     }
 
-    template <unsigned I0, unsigned I1>
-    __host__ __device__ constexpr auto Reorder(Number<I0>, Number<I1>) const
-    {
-        constexpr unsigned IR0 = Get(Number<I0>{});
-        constexpr unsigned IR1 = Get(Number<I1>{});
-        return Sequence<IR0, IR1>{};
-    }
-
-    template <unsigned I0, unsigned I1, unsigned I2>
-    __host__ __device__ constexpr auto Reorder(Number<I0>, Number<I1>, Number<I2>) const
-    {
-        constexpr unsigned IR0 = Get(Number<I0>{});
-        constexpr unsigned IR1 = Get(Number<I1>{});
-        constexpr unsigned IR2 = Get(Number<I2>{});
-        return Sequence<IR0, IR1, IR2>{};
-    }
-
     template <unsigned I0, unsigned I1, unsigned I2, unsigned I3>
-    __host__ __device__ constexpr auto Reorder(Number<I0>, Number<I1>, Number<I2>, Number<I3>) const
+    __host__ __device__ constexpr auto ReorderByGetNewFromOld(Sequence<I0, I1, I2, I3>) const
     {
-        constexpr unsigned IR0 = Get(Number<I0>{});
-        constexpr unsigned IR1 = Get(Number<I1>{});
-        constexpr unsigned IR2 = Get(Number<I2>{});
-        constexpr unsigned IR3 = Get(Number<I3>{});
-        return Sequence<IR0, IR1, IR2, IR3>{};
+        constexpr auto old_sequence = Type{};
+
+        constexpr unsigned NR0 = old_sequence.mData[I0];
+        constexpr unsigned NR1 = old_sequence.mData[I1];
+        constexpr unsigned NR2 = old_sequence.mData[I2];
+        constexpr unsigned NR3 = old_sequence.mData[I3];
+
+        return Sequence<NR0, NR1, NR2, NR3>{};
     }
 
     template <unsigned I0, unsigned I1, unsigned I2, unsigned I3>
-    __host__ __device__ constexpr auto Reorder(Sequence<I0, I1, I2, I3>) const
+    __host__ __device__ constexpr auto ReorderByPutOldToNew(Sequence<I0, I1, I2, I3>) const
     {
-        constexpr unsigned IR0 = Get(Number<I0>{});
-        constexpr unsigned IR1 = Get(Number<I1>{});
-        constexpr unsigned IR2 = Get(Number<I2>{});
-        constexpr unsigned IR3 = Get(Number<I3>{});
-        return Sequence<IR0, IR1, IR2, IR3>{};
+        // don't know how to implement this
+        printf("Sequence::ReorderByPutOldToNew not implemented");
+        assert(false);
     }
 };
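Note: ReorderByGetNewFromOld replaces the ambiguously named Reorder with an explicit convention: element i of the result is the old element at position map[i]. A usage sketch against the Sequence above (lengths invented for illustration):

    constexpr auto lengths = Sequence<64, 16, 30, 30>{}; // say N, C, H, W
    constexpr auto chwn    = lengths.ReorderByGetNewFromOld(Sequence<1, 2, 3, 0>{});
    // chwn is Sequence<16, 30, 30, 64>: the same lengths re-read in C, H, W, N order

The inverse convention, result[map[i]] = old[i], is ReorderByPutOldToNew, which this commit leaves as a runtime-trapping stub.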
src/include/gridwise_direct_convolution_2.cuh

@@ -159,7 +159,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
                                           wi_block_data_begin),
             in_block_desc,
             p_in_block,
-            in_block_desc);
+            in_block_desc.GetLengths());
 
         // copy weight tensor to LDS
         blockwise_4d_tensor_copy<BlockSize>(
@@ -167,7 +167,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
             p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
             wei_block_desc,
             p_wei_block,
-            wei_block_desc);
+            wei_block_desc.GetLengths());
 
         __syncthreads();
@@ -209,5 +209,5 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
                           k_block_data_begin + k_thread_data_begin,
                           ho_block_data_begin + ho_thread_data_begin,
                           wo_block_data_begin + wo_thread_data_begin),
-        out_thread_desc);
+        out_thread_desc.GetLengths());
 }
src/include/gridwise_implicit_gemm_convolution.cuh → src/include/gridwise_implicit_gemm_convolution_nchw_kcsr.cuh

@@ -74,17 +74,39 @@ __global__ void gridwise_implicit_gemm_convolution_nchw_kcsr(InGlobalDesc,
     const unsigned hi_block_data_begin = ho_block_data_begin;
     const unsigned wi_block_data_begin = wo_block_data_begin;
 
-    // tensor view of blockwise input and weight in LDS
-    constexpr auto wei_srck_block_desc =
-        make_ConstantTensorDescriptor(Sequence<S, R, CPerBlock, KPerBlock>{});
-
-    constexpr auto in_chwn_block_desc =
-        make_ConstantTensorDescriptor(Sequence<CPerBlock, HiPerBlock, WiPerBlock, NPerBlock>{});
+    // tensor view of un-reorderd blockwise input and weight (imaginary)
+    constexpr auto in_nchw_block_desc =
+        make_ConstantTensorDescriptor(Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{});
+
+    constexpr auto wei_kcsr_block_desc =
+        make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, S, R>{});
+
+    // tensor view of reordered blockwise input and weight in LDS
+    constexpr auto reorder_chwn_from_nchw = Sequence<1, 2, 3, 0>{};
+    constexpr auto in_chwn_block_desc     = make_ConstantTensorDescriptor(
+        in_nchw_block_desc.GetLengths().ReorderByGetNewFromOld(reorder_chwn_from_nchw));
+
+    constexpr auto reorder_srck_from_kcsr = Sequence<2, 3, 1, 0>{};
+    constexpr auto wei_srck_block_desc    = make_ConstantTensorDescriptor(
+        wei_kcsr_block_desc.GetLengths().ReorderByGetNewFromOld(reorder_srck_from_kcsr));
 
     // tensor view of threadwise output in register
     constexpr auto out_hkwn_thread_desc =
         make_ConstantTensorDescriptor(Sequence<HoPerThread, KPerThread, WoPerThread, NPerThread>{});
 
+#if 0
+    if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
+    {
+        print_ConstantTensorDescriptor(in_nchw_block_desc, "in_nchw_block_desc");
+        print_ConstantTensorDescriptor(in_chwn_block_desc, "in_chwn_block_desc");
+        print_ConstantTensorDescriptor(wei_kcsr_block_desc, "wei_kcsr_block_desc");
+        print_ConstantTensorDescriptor(wei_srck_block_desc, "wei_srck_block_desc");
+        print_ConstantTensorDescriptor(out_hkwn_thread_desc, "out_hkwn_thread_desc");
+    }
+#endif
+
     // a series of blockwise batched GEMM
     //   C_matrix += transpose(A_matrix) * B_matrix
     //   A_matrix and B_matrix saved in LDS, C_matrix saved in register
@@ -97,7 +119,7 @@ __global__ void gridwise_implicit_gemm_convolution_nchw_kcsr(InGlobalDesc,
     const auto b_cxwn_block_mtx_desc = make_ConstantMatrixDescriptor(
         Number<CPerBlock>{},
         Number<WoPerBlock * NPerBlock>{},
-        Number<in_chwn_block_desc.GetStride(I1)>{}); // constexpr doesn't compile
+        Number<in_chwn_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
 
     const auto c_kxwn_thread_mtx_desc = make_ConstantMatrixDescriptor(
         Number<KPerThread>{}, Number<WoPerThread * NPerThread>{}); // constexpr doesn't compile
@@ -137,11 +159,10 @@ __global__ void gridwise_implicit_gemm_convolution_nchw_kcsr(InGlobalDesc,
     for(unsigned c_block_data_begin = 0; c_block_data_begin < in_nchw_global_desc.GetLength(I1);
         c_block_data_begin += CPerBlock, __syncthreads())
    {
+#if 1
         // input: global mem to LDS,
         //   convert 4d-tensor in[N,C,Hi,Wi] to matrix in_matrix[C,Hi*Wi*N]
-        constexpr auto reorder_nchw2chwn = Sequence<3, 0, 1, 2>{};
-
-        blockwise_4d_tensor_copy_reorder<BlockSize>(
+        blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
             in_nchw_global_desc,
             p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin,
                                                          c_block_data_begin,
@@ -149,21 +170,22 @@ __global__ void gridwise_implicit_gemm_convolution_nchw_kcsr(InGlobalDesc,
                                                          wi_block_data_begin),
             in_chwn_block_desc,
             p_in_block,
-            in_chwn_block_desc,
-            reorder_nchw2chwn);
+            in_nchw_block_desc.GetLengths(),
+            reorder_chwn_from_nchw);
+#endif
 
+#if 1
         // weight: global mem to LDS,
         //   convert 4d-tensor wei[K,C,S,R] to matrix wei_matrix[S*R*C,K]
-        constexpr auto reorder_kcsr2srck = Sequence<3, 2, 0, 1>{};
-
-        blockwise_4d_tensor_copy_reorder<BlockSize>(
+        blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
             wei_kcsr_global_desc,
             p_wei_global +
                 wei_kcsr_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
             wei_srck_block_desc,
             p_wei_block,
-            wei_srck_block_desc,
-            reorder_kcsr2srck);
+            wei_kcsr_block_desc.GetLengths(),
+            reorder_srck_from_kcsr);
+#endif
 
         __syncthreads();
@@ -187,10 +209,10 @@ __global__ void gridwise_implicit_gemm_convolution_nchw_kcsr(InGlobalDesc,
     const unsigned wo_thread_data_begin = matrix_c_index.row_begin / NPerThread;
 
     // output: register to global mem,
-    //   convert matrix out_matrix[Ho*K,Wo*N] to 4d-tensor out[N,K,Ho,Wo]
-    constexpr auto reorder_hkwn2nkhw = Sequence<2, 1, 3, 0>{};
+    //   convert out_thread[Ho,K,Wo,N] to out_global[N,K,Ho,Wo]
+    constexpr auto reorder_nkhw_from_hkwn = Sequence<3, 1, 0, 2>{};
 
-    threadwise_4d_tensor_copy_reorder(
+    threadwise_4d_tensor_copy_reorder_by_get_dst_from_src(
         out_hkwn_thread_desc,
         p_out_thread,
         out_nkhw_global_desc,
@@ -198,6 +220,6 @@ __global__ void gridwise_implicit_gemm_convolution_nchw_kcsr(InGlobalDesc,
                           k_block_data_begin + k_thread_data_begin,
                           ho_block_data_begin + ho_thread_data_begin,
                           wo_block_data_begin + wo_thread_data_begin),
-        out_hkwn_thread_desc,
-        reorder_hkwn2nkhw);
+        out_hkwn_thread_desc.GetLengths(),
+        reorder_nkhw_from_hkwn);
 }
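Note on the GetStride(I1) → GetStride(I0) change in b_cxwn_block_mtx_desc: with the block input viewed through in_chwn_block_desc over (C, Hi, Wi, N), consecutive rows of B[C, Wo*N] are one full C step apart, so the leading dimension is apparently meant to be the stride of dimension 0, not dimension 1. A numeric sketch under assumed tile sizes (the #elif 1 set from driver/device_implicit_gemm_convolution.cuh, with a 3x3 filter; not values printed by the code):

    constexpr unsigned CPerBlock = 4, HoPerBlock = 2, WoPerBlock = 32, NPerBlock = 2;
    constexpr unsigned S = 3, R = 3;
    constexpr unsigned HiPerBlock = HoPerBlock + S - 1; // 4
    constexpr unsigned WiPerBlock = WoPerBlock + R - 1; // 34
    // packed CHWN strides are {Hi*Wi*N, Wi*N, N, 1}; the C (row) stride of B:
    constexpr unsigned b_row_stride = HiPerBlock * WiPerBlock * NPerBlock; // 272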
src/include/gridwise_implicit_gemm_convolution_nchw_srck.cuh (new file, mode 100644)

#pragma once
#include "common.cuh"
#include "ConstantTensorDescriptor.cuh"
#include "ConstantMatrixDescriptor.cuh"
#include "blockwise_tensor_op.cuh"
#include "threadwise_tensor_op.cuh"
#include "gemm.cuh"

template <unsigned GridSize,
          unsigned BlockSize,
          class Float,
          class InGlobalDesc,
          class WeiGlobalDesc,
          class OutGlobalDesc,
          unsigned NPerBlock,
          unsigned KPerBlock,
          unsigned CPerBlock,
          unsigned HoPerBlock,
          unsigned WoPerBlock,
          unsigned KPerThread,
          unsigned CPerThread,
          unsigned HoPerThread,
          unsigned WoPerThread>
__global__ void gridwise_implicit_gemm_convolution_nchw_srck(InGlobalDesc,
                                                             Float* const __restrict__ p_in_global,
                                                             WeiGlobalDesc,
                                                             Float* const __restrict__ p_wei_global,
                                                             OutGlobalDesc,
                                                             Float* __restrict__ p_out_global)
{
    // NPerThread == NPerBlock, because the format of input in LDS [C,Hi,Wi,N]
    //   for GEMM trans([C,K]) * [C,Wo*N], we need a thread to do all the "N"
    //   if we use [C,Hi,N,Wi,N] in LDS, then NPerThread can be different from NPerBlock
    constexpr unsigned NPerThread = NPerBlock;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto in_nchw_global_desc  = InGlobalDesc{};
    constexpr auto wei_srck_global_desc = WeiGlobalDesc{};
    constexpr auto out_nkhw_global_desc = OutGlobalDesc{};

    constexpr unsigned S = wei_srck_global_desc.GetLength(I0);
    constexpr unsigned R = wei_srck_global_desc.GetLength(I1);

    constexpr unsigned HiPerBlock = HoPerBlock + S - 1;
    constexpr unsigned WiPerBlock = WoPerBlock + R - 1;

    // divide block work: NCHW
    constexpr unsigned NBlockWork = (out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
    constexpr unsigned KBlockWork = (out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
    constexpr unsigned HBlockWork = (out_nkhw_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
    constexpr unsigned WBlockWork = (out_nkhw_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;

    unsigned itmp                  = get_block_1d_id();
    const unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
    itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork);
    const unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork);
    itmp -= k_block_work_id * (HBlockWork * WBlockWork);
    const unsigned h_block_work_id = itmp / WBlockWork;
    const unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork;

    const unsigned n_block_data_begin  = n_block_work_id * NPerBlock;
    const unsigned k_block_data_begin  = k_block_work_id * KPerBlock;
    const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
    const unsigned wo_block_data_begin = w_block_work_id * HoPerBlock;

    const unsigned hi_block_data_begin = ho_block_data_begin;
    const unsigned wi_block_data_begin = wo_block_data_begin;

    // tensor view of un-reorderd blockwise input and weight (imaginary)
    constexpr auto in_nchw_block_desc =
        make_ConstantTensorDescriptor(Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{});

    constexpr auto wei_srck_block_desc =
        make_ConstantTensorDescriptor(Sequence<S, R, CPerBlock, KPerBlock>{});

    // tensor view of reordered blockwise input and weight in LDS
    constexpr auto reorder_chwn_from_nchw = Sequence<1, 2, 3, 0>{};
    constexpr auto in_chwn_block_desc     = make_ConstantTensorDescriptor(
        in_nchw_block_desc.GetLengths().ReorderByGetNewFromOld(reorder_chwn_from_nchw));

    // tensor view of threadwise output in register
    constexpr auto out_hkwn_thread_desc =
        make_ConstantTensorDescriptor(Sequence<HoPerThread, KPerThread, WoPerThread, NPerThread>{});

#if 0
    if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
    {
        print_ConstantTensorDescriptor(in_nchw_block_desc, "in_nchw_block_desc");
        print_ConstantTensorDescriptor(in_chwn_block_desc, "in_chwn_block_desc");
        print_ConstantTensorDescriptor(wei_kcsr_block_desc, "wei_kcsr_block_desc");
        print_ConstantTensorDescriptor(wei_srck_block_desc, "wei_srck_block_desc");
        print_ConstantTensorDescriptor(out_hkwn_thread_desc, "out_hkwn_thread_desc");
    }
#endif

    // a series of blockwise batched GEMM
    //   C_matrix += transpose(A_matrix) * B_matrix
    //   A_matrix and B_matrix saved in LDS, C_matrix saved in register
    //   A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K]
    //   B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
    //   C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
    const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
        Number<CPerBlock>{}, Number<KPerBlock>{}); // constexpr doesn't compile

    const auto b_cxwn_block_mtx_desc = make_ConstantMatrixDescriptor(
        Number<CPerBlock>{},
        Number<WoPerBlock * NPerBlock>{},
        Number<in_chwn_block_desc.GetStride(I0)>{}); // constexpr doesn't compile

    const auto c_kxwn_thread_mtx_desc = make_ConstantMatrixDescriptor(
        Number<KPerThread>{}, Number<WoPerThread * NPerThread>{}); // constexpr doesn't compile

    auto f_accum = [](auto& c, auto& ab) { c += ab; };

    const auto blockwise_batch_gemm =
        blockwise_1d_strided_batched_gemm_block_a_block_b_thread_c<BlockSize,
                                                                   decltype(a_cxk_block_mtx_desc),
                                                                   decltype(b_cxwn_block_mtx_desc),
                                                                   decltype(c_kxwn_thread_mtx_desc),
                                                                   true,
                                                                   false,
                                                                   false,
                                                                   0,
                                                                   in_chwn_block_desc.GetStride(I1),
                                                                   out_hkwn_thread_desc.GetStride(I1),
                                                                   HoPerBlock,
                                                                   HoPerThread,
                                                                   CPerThread,
                                                                   decltype(f_accum)>{};

    // LDS
    constexpr unsigned in_block_size  = in_chwn_block_desc.GetElementSpace();
    constexpr unsigned wei_block_size = wei_srck_block_desc.GetElementSpace();

    __shared__ Float p_in_block[in_block_size];
    __shared__ Float p_wei_block[wei_block_size];

    // register
    Float p_out_thread[out_hkwn_thread_desc.GetElementSpace()];

    // set threadwise output tensor to 0
    threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread);

    for(unsigned c_block_data_begin = 0; c_block_data_begin < in_nchw_global_desc.GetLength(I1);
        c_block_data_begin += CPerBlock, __syncthreads())
    {
#if 1
        // input: global mem to LDS,
        //   convert 4d-tensor in[N,C,Hi,Wi] to matrix in_matrix[C,Hi*Wi*N]
        blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
            in_nchw_global_desc,
            p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin,
                                                         c_block_data_begin,
                                                         hi_block_data_begin,
                                                         wi_block_data_begin),
            in_chwn_block_desc,
            p_in_block,
            in_nchw_block_desc.GetLengths(),
            reorder_chwn_from_nchw);
#endif

#if 1
        // weight: global mem to LDS,
        blockwise_4d_tensor_copy<BlockSize>(
            wei_srck_global_desc,
            p_wei_global + wei_srck_global_desc.Get1dIndex(0, 0, c_block_data_begin, k_block_data_begin),
            wei_srck_block_desc,
            p_wei_block,
            wei_srck_block_desc.GetLengths());
#endif

        __syncthreads();

        // a series of batched GEMM
        for(unsigned s = 0; s < S; ++s)
        {
            for(unsigned r = 0; r < R; ++r)
            {
                blockwise_batch_gemm.run(p_wei_block + wei_srck_block_desc.Get1dIndex(s, r, 0, 0),
                                         p_in_block + in_chwn_block_desc.Get1dIndex(0, 0, r, 0),
                                         p_out_thread);
            }
        }
    }

    const auto matrix_c_index = blockwise_batch_gemm.CalculateThreadMatrixCIndex(get_thread_local_1d_id());

    const unsigned ho_thread_data_begin = matrix_c_index.batch_begin;
    const unsigned k_thread_data_begin  = matrix_c_index.col_begin;
    const unsigned wo_thread_data_begin = matrix_c_index.row_begin / NPerThread;

    // output: register to global mem,
    //   convert out_thread[Ho,K,Wo,N] to out_global[N,K,Ho,Wo]
    constexpr auto reorder_nkhw_from_hkwn = Sequence<3, 1, 0, 2>{};

    threadwise_4d_tensor_copy_reorder_by_get_dst_from_src(
        out_hkwn_thread_desc,
        p_out_thread,
        out_nkhw_global_desc,
        p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin,
                                                       k_block_data_begin + k_thread_data_begin,
                                                       ho_block_data_begin + ho_thread_data_begin,
                                                       wo_block_data_begin + wo_thread_data_begin),
        out_hkwn_thread_desc.GetLengths(),
        reorder_nkhw_from_hkwn);
}
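Note on the new kernel: "implicit GEMM" here means the convolution is never materialized as an im2col buffer; for each filter tap (s, r), the block runs one small GEMM on the reordered LDS tiles, matching the A/B/C sub-matrix comments in the source. In math form (notation added here, not from the source):

$$\text{out}(h_o,k,w_o,n)=\sum_{s=0}^{S-1}\sum_{r=0}^{R-1}\sum_{c}\text{wei}(s,r,c,k)\,\text{in}(c,h_o+s,w_o+r,n)$$

where each fixed (s, r) term is exactly one GEMM, C[K, Wo*N] += transpose(A[C,K]) * B[C, Wo*N], with A a sub-matrix of wei_block[S,R,C,K] and B a sub-matrix of in_block[C,Hi,Wi,N].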
src/include/threadwise_direct_convolution.cuh

@@ -101,10 +101,10 @@ __device__ void threadwise_direct_convolution_2(InDesc,
     Float p_wei_reg[wei_reg_desc.GetElementSpace()];
 
     // copy input tensor into register
-    threadwise_4d_tensor_copy(in_desc, p_in, in_reg_desc, p_in_reg, in_reg_desc);
+    threadwise_4d_tensor_copy(in_desc, p_in, in_reg_desc, p_in_reg, in_reg_desc.GetLengths());
 
     // copy input tensor into register
-    threadwise_4d_tensor_copy(wei_desc, p_wei, wei_reg_desc, p_wei_reg, wei_reg_desc);
+    threadwise_4d_tensor_copy(wei_desc, p_wei, wei_reg_desc, p_wei_reg, wei_reg_desc.GetLengths());
 
     // do convolution
     threadwise_direct_convolution_1(
@@ -159,14 +159,14 @@ __device__ void threadwise_direct_convolution_3(InDesc,
                               p_in + in_desc.Get1dIndex(0, 0, s, 0),
                               in_reg_desc,
                               p_in_reg,
-                              in_reg_desc);
+                              in_reg_desc.GetLengths());
 
     // read first 1x1 weight
     threadwise_4d_tensor_copy(wei_desc,
                               p_wei + wei_desc.Get1dIndex(0, 0, s, 0),
                               wei_reg_desc,
                               p_wei_reg,
-                              wei_reg_desc);
+                              wei_reg_desc.GetLengths());
 
     // do first 1x1 conv
     threadwise_direct_convolution_1(
@@ -180,7 +180,7 @@ __device__ void threadwise_direct_convolution_3(InDesc,
                                   p_wei + wei_desc.Get1dIndex(0, 0, s, r),
                                   wei_reg_desc,
                                   p_wei_reg,
-                                  wei_reg_desc);
+                                  wei_reg_desc.GetLengths());
 
         // shift old input to the left
         threadwise_4d_tensor_shift_down(in_reg_desc, p_in_reg, I3, Number<in_w_new_read>{});
@@ -192,7 +192,7 @@ __device__ void threadwise_direct_convolution_3(InDesc,
             in_reg_desc,
             p_in_reg +
                 in_reg_desc.Get1dIndex(0, 0, 0, in_reg_desc.GetLength(I3) - in_w_new_read),
-            in_desc_reg_new_read);
+            in_desc_reg_new_read.GetLengths());
 
         // do 1x1 conv
         threadwise_direct_convolution_1(
@@ -211,11 +211,14 @@ __device__ void threadwise_direct_convolution_3(InDesc,
                                   p_wei + wei_desc.Get1dIndex(0, 0, s, r),
                                   wei_reg_desc,
                                   p_wei_reg,
-                                  wei_reg_desc);
+                                  wei_reg_desc.GetLengths());
 
         // read new input
-        threadwise_4d_tensor_copy(
-            in_desc, p_in + in_desc.Get1dIndex(0, 0, s, r), in_reg_desc, p_in_reg, in_reg_desc);
+        threadwise_4d_tensor_copy(in_desc,
+                                  p_in + in_desc.Get1dIndex(0, 0, s, r),
+                                  in_reg_desc,
+                                  p_in_reg,
+                                  in_reg_desc.GetLengths());
 
         // do 1x1 conv
         threadwise_direct_convolution_1(
src/include/threadwise_tensor_op.cuh

@@ -37,14 +37,19 @@ __device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, Float* __re
 
 // TODO: in order to optimize mem access for different mem type,
 //   need to write specialized version
-template <class Float, class SrcDesc, class DstDesc, class RefDesc, class Reorder, class F>
+template <class Float,
+          class SrcDesc,
+          class DstDesc,
+          class SrcOpLengths,
+          class DstFromSrcReorder,
+          class F>
 __device__ void
-threadwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
-                                                        Float* const __restrict__ p_src,
-                                                        DstDesc,
-                                                        Float* __restrict__ p_dst,
-                                                        RefDesc,
-                                                        Reorder,
-                                                        F f)
+threadwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src(SrcDesc,
+                                                                            Float* const __restrict__ p_src,
+                                                                            DstDesc,
+                                                                            Float* __restrict__ p_dst,
+                                                                            SrcOpLengths,
+                                                                            DstFromSrcReorder,
+                                                                            F f)
 {
     constexpr auto I0 = Number<0>{};
@@ -52,14 +57,14 @@ threadwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
     constexpr auto I2 = Number<2>{};
     constexpr auto I3 = Number<3>{};
 
-    constexpr unsigned IT0 = Reorder{}.Get(I0);
-    constexpr unsigned IT1 = Reorder{}.Get(I1);
-    constexpr unsigned IT2 = Reorder{}.Get(I2);
-    constexpr unsigned IT3 = Reorder{}.Get(I3);
+    constexpr unsigned IR0 = DstFromSrcReorder{}.Get(I0);
+    constexpr unsigned IR1 = DstFromSrcReorder{}.Get(I1);
+    constexpr unsigned IR2 = DstFromSrcReorder{}.Get(I2);
+    constexpr unsigned IR3 = DstFromSrcReorder{}.Get(I3);
 
     constexpr auto src_desc = SrcDesc{};
     constexpr auto dst_desc = DstDesc{};
-    constexpr auto ref_desc = RefDesc{};
+    constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});
 
     for(unsigned did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
     {
@@ -74,7 +79,7 @@ threadwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
                     const unsigned did[4] = {did0, did1, did2, did3};
 
                     const unsigned bindex =
-                        dst_desc.Get1dIndex(did[IT0], did[IT1], did[IT2], did[IT3]);
+                        dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
 
                     f(p_src[aindex], p_dst[bindex]);
                 }
@@ -92,29 +97,29 @@ __device__ void threadwise_4d_tensor_set_zero(Desc, Float* __restrict__ p)
         Desc{}, p, f_set_zero);
 }
 
-template <class Float, class SrcDesc, class DstDesc, class RefDesc, class Reorder>
-__device__ void threadwise_4d_tensor_copy_reorder(
-    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc, Reorder)
+template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, class DstFromSrcReorder>
+__device__ void threadwise_4d_tensor_copy_reorder_by_get_dst_from_src(
+    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, SrcOpLengths, DstFromSrcReorder)
 {
     auto f_copy = [](const Float& src, Float& dst) { dst = src; };
 
-    threadwise_4d_tensor_pointwise_operation_binary_reorder<Float,
-                                                            SrcDesc,
-                                                            DstDesc,
-                                                            RefDesc,
-                                                            Reorder,
-                                                            decltype(f_copy)>(
-        SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, Reorder{}, f_copy);
+    threadwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src(
+        SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, DstFromSrcReorder{}, f_copy);
 }
 
-template <class Float, class SrcDesc, class DstDesc, class RefDesc>
+template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
 __device__ void threadwise_4d_tensor_copy(
-    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc)
+    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, SrcOpLengths)
 {
-    auto reorder = Sequence<0, 1, 2, 3>{};
+    auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{};
 
-    threadwise_4d_tensor_copy_reorder<Float, SrcDesc, DstDesc, RefDesc, decltype(reorder)>(
-        SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, reorder);
+    threadwise_4d_tensor_copy_reorder_by_get_dst_from_src(
+        SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, dst_from_src_reorder);
 }
 
 template <class Float, class Desc, class IDim, class NShift>