yangql / composable_kernel-1 / Commits / 5fd40ad7

Commit 5fd40ad7, authored Mar 02, 2019 by Chao Liu

    clean up

parent 4543d17a
Showing 20 changed files with 363 additions and 2103 deletions (+363, -2103)
driver/device_direct_convolution_1.hpp                                                        +17   -18
driver/device_direct_convolution_2.hpp                                                        +17   -33
driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.hpp                                  +4    -4
driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.hpp                           +24   -30
driver/device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.hpp                                  +0    -85
driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.hpp                                  +0    -139
driver/device_implicit_gemm_convolution_2_chwn_csrk_khwn.hpp                                  +39   -43
driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.hpp                                  +0    -264
driver/driver.hip.cpp                                                                         +145  -139
src/include/blockwise_direct_convolution.hip.hpp                                              +12   -14
src/include/conv_common.hip.hpp                                                               +8    -8
src/include/gridwise_direct_convolution_1.hip.hpp                                             +25   -59
src/include/gridwise_direct_convolution_2.hip.hpp                                             +36   -60
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.hip.hpp                       +11   -11
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.hip.hpp                +13   -13
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.hip.hpp   +0    -348
src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.hip.hpp                       +0    -270
src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.hip.hpp                       +0    -226
src/include/gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_buffer.hip.hpp     +12   -12
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.hip.hpp                       +0    -327
driver/device_direct_convolution_1.hpp

@@ -34,25 +34,24 @@ void device_direct_convolution_1(InDesc,
 #if 1
     // 3x3, 34x34
     constexpr unsigned OutTileSizeH = 2;
     constexpr unsigned OutTileSizeW = 2;

     constexpr unsigned NPerBlock = 2;
     constexpr unsigned KPerBlock = 16;
     constexpr unsigned CPerBlock = 2;
-    constexpr unsigned YPerBlock = 2;
-    constexpr unsigned XPerBlock = 16;
+    constexpr unsigned HoPerBlock = 4;
+    constexpr unsigned WoPerBlock = 32;

     constexpr unsigned NPerThread  = 2;
     constexpr unsigned KPerThread  = 4;
     constexpr unsigned CPerThread  = 2;
     constexpr unsigned HoPerThread = 2;
     constexpr unsigned WoPerThread = 2;

     constexpr unsigned BlockSize = 128;
 #endif

     constexpr unsigned GridSize =
         (out_desc.GetLength(I0) / NPerBlock) * (out_desc.GetLength(I1) / KPerBlock) *
-        (out_desc.GetLength(I2) / (OutTileSizeH * YPerBlock)) *
-        (out_desc.GetLength(I3) / (OutTileSizeW * XPerBlock));
+        (out_desc.GetLength(I2) / HoPerBlock) * (out_desc.GetLength(I3) / WoPerBlock);

     printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

@@ -62,16 +61,16 @@ void device_direct_convolution_1(InDesc,
                       InDesc,
                       WeiDesc,
                       OutDesc,
                       OutTileSizeH,
                       OutTileSizeW,
                       NPerBlock,
                       KPerBlock,
                       CPerBlock,
-                      YPerBlock,
-                      XPerBlock,
+                      HoPerBlock,
+                      WoPerBlock,
                       NPerThread,
                       KPerThread,
                       CPerThread,
                       HoPerThread,
                       WoPerThread,
                       BlockSize,
                       GridSize>,
         dim3(GridSize),
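The hunk above replaces the derived per-block tile sizes (OutTileSizeH * YPerBlock, OutTileSizeW * XPerBlock) with explicit HoPerBlock/WoPerBlock constants. A minimal standalone sketch of the same grid-size arithmetic, with a hypothetical output shape (the N, K, Ho, Wo values below are illustrative, not from the commit):

    // One workgroup per output tile: the output tensor [N, K, Ho, Wo] is tiled
    // by per-block sizes; dimensions are assumed to divide evenly, as the
    // kernel launch above assumes.
    #include <cstdio>

    int main()
    {
        constexpr unsigned N = 64, K = 64, Ho = 32, Wo = 32;  // hypothetical output shape
        constexpr unsigned NPerBlock = 2, KPerBlock = 16;     // values from the #if 1 branch
        constexpr unsigned HoPerBlock = 4, WoPerBlock = 32;

        constexpr unsigned GridSize =
            (N / NPerBlock) * (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock);

        std::printf("GridSize = %u\n", GridSize); // 32 * 4 * 8 * 1 = 1024
        return 0;
    }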
driver/device_direct_convolution_2.hpp

@@ -34,40 +34,24 @@ void device_direct_convolution_2(InDesc,
 #if 1
     // 3x3, 34x34, 128 thread
     constexpr unsigned OutTileSizeH = 2;
     constexpr unsigned OutTileSizeW = 2;

     constexpr unsigned NPerBlock = 2;
     constexpr unsigned KPerBlock = 32;
     constexpr unsigned CPerBlock = 4;
-    constexpr unsigned YPerBlock = 1;
-    constexpr unsigned XPerBlock = 16;
+    constexpr unsigned HoPerBlock = 2;
+    constexpr unsigned WoPerBlock = 32;

     constexpr unsigned NPerThread  = 2;
     constexpr unsigned KPerThread  = 4;
     constexpr unsigned CPerThread  = 2;
     constexpr unsigned HoPerThread = 2;
     constexpr unsigned WoPerThread = 2;

     constexpr unsigned BlockSize = 128;
-#elif 0
-    // 3x3, 34x34, 256 thread
-    constexpr unsigned OutTileSizeH = 2;
-    constexpr unsigned OutTileSizeW = 2;
-
-    constexpr unsigned NPerBlock = 2;
-    constexpr unsigned KPerBlock = 32;
-    constexpr unsigned CPerBlock = 4;
-    constexpr unsigned YPerBlock = 1;
-    constexpr unsigned XPerBlock = 32;
-
-    constexpr unsigned NPerThread = 2;
-    constexpr unsigned KPerThread = 4;
-    constexpr unsigned CPerThread = 2;
-
-    constexpr unsigned BlockSize = 256;
 #endif

     constexpr unsigned GridSize =
         (out_desc.GetLength(I0) / NPerBlock) * (out_desc.GetLength(I1) / KPerBlock) *
-        (out_desc.GetLength(I2) / (OutTileSizeH * YPerBlock)) *
-        (out_desc.GetLength(I3) / (OutTileSizeW * XPerBlock));
+        (out_desc.GetLength(I2) / HoPerBlock) * (out_desc.GetLength(I3) / WoPerBlock);

     printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

@@ -77,16 +61,16 @@ void device_direct_convolution_2(InDesc,
                       InDesc,
                       WeiDesc,
                       OutDesc,
                       OutTileSizeH,
                       OutTileSizeW,
                       NPerBlock,
                       KPerBlock,
                       CPerBlock,
-                      YPerBlock,
-                      XPerBlock,
+                      HoPerBlock,
+                      WoPerBlock,
                       NPerThread,
                       KPerThread,
                       CPerThread,
                       HoPerThread,
                       WoPerThread,
                       BlockSize,
                       GridSize>,
         dim3(GridSize),
driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.hpp

@@ -30,11 +30,11 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
     constexpr unsigned K = wei_kcsr_desc.GetLength(I0);
     constexpr unsigned C = wei_kcsr_desc.GetLength(I1);
-    constexpr unsigned S = wei_kcsr_desc.GetLength(I2);
-    constexpr unsigned R = wei_kcsr_desc.GetLength(I3);
+    constexpr unsigned Y = wei_kcsr_desc.GetLength(I2);
+    constexpr unsigned X = wei_kcsr_desc.GetLength(I3);

     // reorder weight
-    auto wei_csrk_desc = make_ConstantTensorDescriptor(Sequence<C, S, R, K>{});
+    auto wei_csrk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
     ostream_ConstantTensorDescriptor(wei_csrk_desc, std::cout << "wei_csrk_desc: ");
     Tensor<T> wei_csrk(make_TensorDescriptor(wei_csrk_desc));

@@ -43,7 +43,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
         wei_csrk(c, s, r, k) = wei_kcsr(k, c, s, r);
     };

-    make_ParallelTensorFunctor(f_reorder_kcsr2csrk, K, C, S, R)(
+    make_ParallelTensorFunctor(f_reorder_kcsr2csrk, K, C, Y, X)(
         std::thread::hardware_concurrency());

     // reorder input
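The functor above transposes the weight tensor from K-major (KCYX) order into CYXK order so that K becomes the fastest-varying dimension for the blockwise copy. A standalone sketch of the same reorder with plain loops; the shapes are hypothetical and a sequential loop nest stands in for make_ParallelTensorFunctor:

    // KCYX -> CYXK reorder, row-major offsets: [k][c][y][x] -> [c][y][x][k].
    #include <vector>

    int main()
    {
        constexpr unsigned K = 8, C = 4, Y = 3, X = 3; // hypothetical filter shape
        std::vector<float> wei_kcyx(K * C * Y * X, 1.0f), wei_cyxk(C * Y * X * K);

        for(unsigned k = 0; k < K; ++k)
            for(unsigned c = 0; c < C; ++c)
                for(unsigned y = 0; y < Y; ++y)
                    for(unsigned x = 0; x < X; ++x)
                        wei_cyxk[((c * Y + y) * X + x) * K + k] =
                            wei_kcyx[((k * C + c) * Y + y) * X + x];
        return 0;
    }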
driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.hpp

@@ -2,7 +2,6 @@
 #include <unistd.h>
 #include "device.hpp"
 #include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.hip.hpp"
-#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.hip.hpp"

 template <class T, class InDesc, class WeiDesc, class OutDesc, class LowerPads, class UpperPads>
 void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,

@@ -33,11 +32,11 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
     constexpr unsigned K = wei_kcsr_desc.GetLength(I0);
     constexpr unsigned C = wei_kcsr_desc.GetLength(I1);
-    constexpr unsigned S = wei_kcsr_desc.GetLength(I2);
-    constexpr unsigned R = wei_kcsr_desc.GetLength(I3);
+    constexpr unsigned Y = wei_kcsr_desc.GetLength(I2);
+    constexpr unsigned X = wei_kcsr_desc.GetLength(I3);

     // reorder weight
-    auto wei_csrk_desc = make_ConstantTensorDescriptor(Sequence<C, S, R, K>{});
+    auto wei_csrk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
     ostream_ConstantTensorDescriptor(wei_csrk_desc, std::cout << "wei_csrk_desc: ");
     Tensor<T> wei_csrk(make_TensorDescriptor(wei_csrk_desc));

@@ -46,7 +45,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
         wei_csrk(c, s, r, k) = wei_kcsr(k, c, s, r);
     };

-    make_ParallelTensorFunctor(f_reorder_kcsr2csrk, K, C, S, R)(
+    make_ParallelTensorFunctor(f_reorder_kcsr2csrk, K, C, Y, X)(
         std::thread::hardware_concurrency());

     // reorder input

@@ -251,31 +250,26 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
     for(unsigned i = 0; i < nrepeat; ++i)
     {
-        float time = launch_kernel(
-#if 0
-            gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded
-#elif 1
-            gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline
-#endif
-                <GridSize, BlockSize, T,
-                 decltype(in_chwn_desc), decltype(wei_csrk_desc), decltype(out_khwn_desc),
-                 LowerPads, UpperPads,
-                 NPerBlock, KPerBlock, CPerBlock, HoPerBlock, WoPerBlock,
-                 NPerThread, KPerThread, CPerThread, HoPerThread, WoPerThread,
-                 WeiBlockCopyThreadPerDim0, WeiBlockCopyThreadPerDim1>,
+        float time = launch_kernel(
+            gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded
+                <GridSize, BlockSize, T,
+                 decltype(in_chwn_desc), decltype(wei_csrk_desc), decltype(out_khwn_desc),
+                 LowerPads, UpperPads,
+                 NPerBlock, KPerBlock, CPerBlock, HoPerBlock, WoPerBlock,
+                 NPerThread, KPerThread, CPerThread, HoPerThread, WoPerThread,
+                 WeiBlockCopyThreadPerDim0, WeiBlockCopyThreadPerDim1>,
             dim3(GridSize),
             dim3(BlockSize),
driver/device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.hpp (deleted, 100644 → 0)

#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.hip.hpp"

template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_1_nchw_kcsr_nkhw(InDesc,
                                                       const Tensor<T>& in,
                                                       WeiDesc,
                                                       const Tensor<T>& wei,
                                                       OutDesc,
                                                       Tensor<T>& out,
                                                       unsigned nrepeat)
{
    std::size_t data_sz = sizeof(T);
    DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
    DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace());
    DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace());

    int num_thread = std::thread::hardware_concurrency();

    in_device_buf.ToDevice(in.mData.data());
    wei_device_buf.ToDevice(wei.mData.data());
    out_device_buf.ToDevice(out.mData.data());

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto in_desc  = InDesc{};
    constexpr auto wei_desc = WeiDesc{};
    constexpr auto out_desc = OutDesc{};

#if 1
    // 3x3, 34x34
    constexpr unsigned NPerBlock  = 1;
    constexpr unsigned KPerBlock  = 64;
    constexpr unsigned CPerBlock  = 2;
    constexpr unsigned HoPerBlock = 4;
    constexpr unsigned WoPerBlock = 32;

    constexpr unsigned KPerThread  = 16;
    constexpr unsigned CPerThread  = 1;
    constexpr unsigned HoPerThread = 2;
    constexpr unsigned WoPerThread = 2;

    constexpr unsigned BlockSize = 128;
#endif

    constexpr unsigned GridSize =
        (out_desc.GetLength(I0) / NPerBlock) * (out_desc.GetLength(I1) / KPerBlock) *
        (out_desc.GetLength(I2) / HoPerBlock) * (out_desc.GetLength(I3) / WoPerBlock);

    printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

    for(unsigned i = 0; i < nrepeat; ++i)
    {
        float time = launch_kernel(gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw
                                       <GridSize, BlockSize, T, InDesc, WeiDesc, OutDesc,
                                        NPerBlock, KPerBlock, CPerBlock, HoPerBlock, WoPerBlock,
                                        KPerThread, CPerThread, HoPerThread, WoPerThread>,
                                   dim3(GridSize),
                                   dim3(BlockSize),
                                   static_cast<T*>(in_device_buf.GetDeviceBuffer()),
                                   static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
                                   static_cast<T*>(out_device_buf.GetDeviceBuffer()));

        printf("Elapsed time : %f ms\n", time);
        usleep(std::min(time * 1000, float(10000)));
    }

    out_device_buf.FromDevice(out.mData.data());
}
driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.hpp (deleted, 100644 → 0)

#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.hip.hpp"

template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
                                                       const Tensor<T>& in_nchw,
                                                       WeiDesc,
                                                       const Tensor<T>& wei_kcsr,
                                                       OutDesc,
                                                       Tensor<T>& out_nkhw,
                                                       unsigned nrepeat)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto in_nchw_desc  = InDesc{};
    constexpr auto wei_kcsr_desc = WeiDesc{};
    constexpr auto out_nkhw_desc = OutDesc{};

    constexpr unsigned N  = out_nkhw_desc.GetLength(I0);
    constexpr unsigned Ho = out_nkhw_desc.GetLength(I2);
    constexpr unsigned Wo = out_nkhw_desc.GetLength(I3);

    constexpr unsigned K = wei_kcsr_desc.GetLength(I0);
    constexpr unsigned C = wei_kcsr_desc.GetLength(I1);
    constexpr unsigned S = wei_kcsr_desc.GetLength(I2);
    constexpr unsigned R = wei_kcsr_desc.GetLength(I3);

    auto wei_srck_desc = make_ConstantTensorDescriptor(Sequence<S, R, C, K>{});
    ostream_ConstantTensorDescriptor(wei_srck_desc, std::cout << "wei_srck_desc: ");
    Tensor<T> wei_srck(make_TensorDescriptor(wei_srck_desc));

    auto f_reorder_kcsr2srck = [&](auto k, auto c, auto s, auto r) {
        wei_srck(s, r, c, k) = wei_kcsr(k, c, s, r);
    };

    make_ParallelTensorFunctor(f_reorder_kcsr2srck, K, C, S, R)(
        std::thread::hardware_concurrency());

    std::size_t data_sz = sizeof(T);
    DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
    DeviceMem wei_srck_device_buf(data_sz * wei_srck.mDesc.GetElementSpace());
    DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());

    int num_thread = std::thread::hardware_concurrency();

    in_nchw_device_buf.ToDevice(in_nchw.mData.data());
    wei_srck_device_buf.ToDevice(wei_srck.mData.data());
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

#if 1
    // for 3x3, 34x34
    constexpr unsigned NPerBlock  = 1;
    constexpr unsigned KPerBlock  = 64;
    constexpr unsigned CPerBlock  = 2;
    constexpr unsigned HoPerBlock = 4;
    constexpr unsigned WoPerBlock = 32;

    constexpr unsigned NPerThread  = 1;
    constexpr unsigned KPerThread  = 16;
    constexpr unsigned CPerThread  = 1;
    constexpr unsigned HoPerThread = 2;
    constexpr unsigned WoPerThread = 2;

    constexpr unsigned BlockSize = 128;
#elif 0
    // for 3x3, 58x58
    constexpr unsigned NPerBlock  = 4;
    constexpr unsigned KPerBlock  = 64;
    constexpr unsigned CPerBlock  = 2;
    constexpr unsigned HoPerBlock = 4;
    constexpr unsigned WoPerBlock = 8;

    constexpr unsigned NPerThread  = 4;
    constexpr unsigned KPerThread  = 16;
    constexpr unsigned CPerThread  = 1;
    constexpr unsigned HoPerThread = 1;
    constexpr unsigned WoPerThread = 1;

    constexpr unsigned BlockSize = 128;
#elif 1
    // for 3x3, 56x56
    constexpr unsigned NPerBlock  = 32;
    constexpr unsigned KPerBlock  = 64;
    constexpr unsigned CPerBlock  = 4;
    constexpr unsigned HoPerBlock = 2;
    constexpr unsigned WoPerBlock = 2;

    constexpr unsigned NPerThread  = 4;
    constexpr unsigned KPerThread  = 16;
    constexpr unsigned CPerThread  = 1;
    constexpr unsigned HoPerThread = 1;
    constexpr unsigned WoPerThread = 1;

    constexpr unsigned BlockSize = 128;
#endif

    constexpr unsigned GridSize =
        ((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
        ((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);

    printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

    for(unsigned i = 0; i < nrepeat; ++i)
    {
        float time =
            launch_kernel(gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw
                              <GridSize, BlockSize, T,
                               decltype(in_nchw_desc), decltype(wei_srck_desc), decltype(out_nkhw_desc),
                               NPerBlock, KPerBlock, CPerBlock, HoPerBlock, WoPerBlock,
                               NPerThread, KPerThread, CPerThread, HoPerThread, WoPerThread>,
                          dim3(GridSize),
                          dim3(BlockSize),
                          static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
                          static_cast<T*>(wei_srck_device_buf.GetDeviceBuffer()),
                          static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));

        printf("Elapsed time : %f ms\n", time);
        usleep(std::min(time * 1000, float(10000)));
    }

    out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
driver/device_implicit_gemm_convolution_2_chwn_csrk_khwn.hpp

@@ -30,10 +30,10 @@ void device_implicit_gemm_convolution_2_chwn_csrk_khwn(InDesc,
     constexpr unsigned K = wei_kcsr_desc.GetLength(I0);
     constexpr unsigned C = wei_kcsr_desc.GetLength(I1);
-    constexpr unsigned S = wei_kcsr_desc.GetLength(I2);
-    constexpr unsigned R = wei_kcsr_desc.GetLength(I3);
+    constexpr unsigned Y = wei_kcsr_desc.GetLength(I2);
+    constexpr unsigned X = wei_kcsr_desc.GetLength(I3);

-    constexpr unsigned BGhostRead = (S - 1) * Wi + (R - 1);
+    constexpr unsigned BGhostRead = (Y - 1) * Wi + (X - 1);

     // convert in_nchw to in_cnhw
     auto in_chwn_desc = make_ConstantTensorDescriptor(Sequence<C, Hi, Wi, N>{});

@@ -49,7 +49,7 @@ void device_implicit_gemm_convolution_2_chwn_csrk_khwn(InDesc,
         Wi)(std::thread::hardware_concurrency());

     // convert wei_kcsr to wei_csrk
-    auto wei_csrk_desc = make_ConstantTensorDescriptor(Sequence<C, S, R, K>{});
+    auto wei_csrk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
     ostream_ConstantTensorDescriptor(wei_csrk_desc, std::cout << "wei_csrk_desc: ");
     Tensor<T> wei_csrk(make_TensorDescriptor(wei_csrk_desc));

@@ -58,8 +58,8 @@ void device_implicit_gemm_convolution_2_chwn_csrk_khwn(InDesc,
         [&](auto k, auto c, auto s, auto r) { wei_csrk(c, s, r, k) = wei_kcsr(k, c, s, r); },
         K,
         C,
-        S,
-        R)(std::thread::hardware_concurrency());
+        Y,
+        X)(std::thread::hardware_concurrency());

     // conver out_nkhw to out_knhw
     auto out_khwn_desc = make_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});

@@ -209,43 +209,39 @@ void device_implicit_gemm_convolution_2_chwn_csrk_khwn(InDesc,
     for(unsigned i = 0; i < nrepeat; ++i)
     {
-        float time = launch_kernel(
-#if 0
-            gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn
-#else
-            gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_buffer
-#endif
-                <GridSize, BlockSize, T,
-                 decltype(in_chwn_desc), decltype(wei_csrk_desc), decltype(out_khwn_desc),
-                 BPerBlock, KPerBlock, CPerBlock, BPerThread, KPerThread,
-                 GemmThreadPerColumnPerCluster, GemmThreadPerRowPerCluster,
-                 GemmMPerThreadSubC, GemmNPerThreadSubC,
-                 GemmMLevel0Cluster, GemmNLevel0Cluster, GemmMLevel1Cluster, GemmNLevel1Cluster,
-                 GemmKPerThreadLoop,
-                 InBlockCopyThreadPerDim0, InBlockCopyThreadPerDim1,
-                 WeiBlockCopyThreadPerDim0, WeiBlockCopyThreadPerDim1,
-                 InBlockCopyDataPerRead, WeiBlockCopyDataPerRead>,
-            dim3(GridSize),
-            dim3(BlockSize),
-            static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
-            static_cast<T*>(wei_csrk_device_buf.GetDeviceBuffer()),
-            static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
+        float time = launch_kernel(
+            gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_buffer
+                <GridSize, BlockSize, T,
+                 decltype(in_chwn_desc), decltype(wei_csrk_desc), decltype(out_khwn_desc),
+                 BPerBlock, KPerBlock, CPerBlock, BPerThread, KPerThread,
+                 GemmThreadPerColumnPerCluster, GemmThreadPerRowPerCluster,
+                 GemmMPerThreadSubC, GemmNPerThreadSubC,
+                 GemmMLevel0Cluster, GemmNLevel0Cluster, GemmMLevel1Cluster, GemmNLevel1Cluster,
+                 GemmKPerThreadLoop,
+                 InBlockCopyThreadPerDim0, InBlockCopyThreadPerDim1,
+                 WeiBlockCopyThreadPerDim0, WeiBlockCopyThreadPerDim1,
+                 InBlockCopyDataPerRead, WeiBlockCopyDataPerRead>,
+            dim3(GridSize),
+            dim3(BlockSize),
+            static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
+            static_cast<T*>(wei_csrk_device_buf.GetDeviceBuffer()),
+            static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));

         printf("Elapsed time : %f ms\n", time);
         usleep(std::min(time * 1000, float(10000)));
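The BGhostRead constant above sizes the extra input the kernel may touch when H and W are flattened into a single "B" dimension: the last output position still reads (Y-1) full input rows plus (X-1) trailing columns past the end of the tile. A standalone sketch of that arithmetic (the Wi value is hypothetical):

    #include <cstdio>

    int main()
    {
        constexpr unsigned Wi = 34, Y = 3, X = 3;     // hypothetical input width, 3x3 filter
        constexpr unsigned BGhostRead = (Y - 1) * Wi + (X - 1);
        std::printf("BGhostRead = %u\n", BGhostRead); // 2*34 + 2 = 70 extra elements
        return 0;
    }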
driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.hpp (deleted, 100644 → 0)

#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.hip.hpp"
#include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.hip.hpp"

template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
                                                       const Tensor<T>& in_nchw,
                                                       WeiDesc,
                                                       const Tensor<T>& wei_kcsr,
                                                       OutDesc,
                                                       Tensor<T>& out_nkhw,
                                                       unsigned nrepeat)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto in_nchw_desc  = InDesc{};
    constexpr auto wei_kcsr_desc = WeiDesc{};
    constexpr auto out_nkhw_desc = OutDesc{};

    constexpr unsigned N  = in_nchw_desc.GetLength(I0);
    constexpr unsigned Hi = in_nchw_desc.GetLength(I2);
    constexpr unsigned Wi = in_nchw_desc.GetLength(I3);

    constexpr unsigned Ho = out_nkhw_desc.GetLength(I2);
    constexpr unsigned Wo = out_nkhw_desc.GetLength(I3);

    constexpr unsigned K = wei_kcsr_desc.GetLength(I0);
    constexpr unsigned C = wei_kcsr_desc.GetLength(I1);
    constexpr unsigned S = wei_kcsr_desc.GetLength(I2);
    constexpr unsigned R = wei_kcsr_desc.GetLength(I3);

    constexpr unsigned BGhostRead = (S - 1) * Wi + (R - 1);

    // convert in_nchw to in_cnhw
    auto in_cnhw_desc = make_ConstantTensorDescriptor(Sequence<C, N, Hi, Wi>{});
    ostream_ConstantTensorDescriptor(in_cnhw_desc, std::cout << "in_cnhw_desc: ");
    Tensor<T> in_cnhw(make_TensorDescriptor(in_cnhw_desc));

    auto f_reorder_nchw2cnhw = [&](auto n, auto c, auto hi, auto wi) {
        in_cnhw(c, n, hi, wi) = in_nchw(n, c, hi, wi);
    };

    make_ParallelTensorFunctor(f_reorder_nchw2cnhw, N, C, Hi, Wi)(
        std::thread::hardware_concurrency());

    // convert wei_kcsr to wei_csrk
    auto wei_csrk_desc = make_ConstantTensorDescriptor(Sequence<C, S, R, K>{});
    ostream_ConstantTensorDescriptor(wei_csrk_desc, std::cout << "wei_csrk_desc: ");
    Tensor<T> wei_csrk(make_TensorDescriptor(wei_csrk_desc));

    auto f_reorder_kcsr2csrk = [&](auto k, auto c, auto s, auto r) {
        wei_csrk(c, s, r, k) = wei_kcsr(k, c, s, r);
    };

    make_ParallelTensorFunctor(f_reorder_kcsr2csrk, K, C, S, R)(
        std::thread::hardware_concurrency());

    // conver out_nkhw to out_knhw
    auto out_knhw_desc = make_ConstantTensorDescriptor(Sequence<K, N, Ho, Wo>{});
    ostream_ConstantTensorDescriptor(out_knhw_desc, std::cout << "out_knhw_desc: ");
    Tensor<T> out_knhw(make_TensorDescriptor(out_knhw_desc));

#if 0
    // 3x3, 34x34
    // need to use register double buffer for GEMM
    constexpr unsigned BPerBlock = 128;
    constexpr unsigned KPerBlock = 64;
    constexpr unsigned CPerBlock = 4;

    constexpr unsigned BPerThread = 8;
    constexpr unsigned KPerThread = 8;

    constexpr unsigned GemmMPerThreadSubC = 4;
    constexpr unsigned GemmNPerThreadSubC = 4;
    constexpr unsigned GemmMLevel0Cluster = 4;
    constexpr unsigned GemmNLevel0Cluster = 2;
    constexpr unsigned GemmMLevel1Cluster = 2;
    constexpr unsigned GemmNLevel1Cluster = 8;
    constexpr unsigned GemmKPerThreadLoop = 1;
    constexpr unsigned GemmThreadPerColumnPerCluster = 8;
    constexpr unsigned GemmThreadPerRowPerCluster    = 8;

    constexpr unsigned InBlockCopyThreadPerDim0  = 4;
    constexpr unsigned InBlockCopyThreadPerDim1  = 16;
    constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
    constexpr unsigned WeiBlockCopyThreadPerDim1 = 16;

    constexpr unsigned InBlockCopyDataPerRead  = 4;
    constexpr unsigned WeiBlockCopyDataPerRead = 4;

    constexpr unsigned BlockSize = 128;
#elif 0
    // 1x1, 28x28, 64 threads
    constexpr unsigned BPerBlock = 64;
    constexpr unsigned KPerBlock = 64;
    constexpr unsigned CPerBlock = 8;

    constexpr unsigned BPerThread = 8;
    constexpr unsigned KPerThread = 8;

    constexpr unsigned GemmMPerThreadSubC = 4;
    constexpr unsigned GemmNPerThreadSubC = 4;
    constexpr unsigned GemmMLevel0Cluster = 4;
    constexpr unsigned GemmNLevel0Cluster = 2;
    constexpr unsigned GemmMLevel1Cluster = 2;
    constexpr unsigned GemmNLevel1Cluster = 4;
    constexpr unsigned GemmKPerThreadLoop = 1;
    constexpr unsigned GemmThreadPerColumnPerCluster = 8;
    constexpr unsigned GemmThreadPerRowPerCluster    = 8;

    constexpr unsigned InBlockCopyThreadPerDim0  = 4;
    constexpr unsigned InBlockCopyThreadPerDim1  = 16;
    constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
    constexpr unsigned WeiBlockCopyThreadPerDim1 = 16;

    constexpr unsigned InBlockCopyDataPerRead  = 4;
    constexpr unsigned WeiBlockCopyDataPerRead = 4;

    constexpr unsigned BlockSize = 64;
#elif 1
    // 1x1, 28x28, 128 threads, no lds-double-buffer
    // 1x1, 28x28, 128 threads, with lds-double-buffer, max_register = 128
    constexpr unsigned BPerBlock = 64;
    constexpr unsigned KPerBlock = 128;
    constexpr unsigned CPerBlock = 8;

    constexpr unsigned BPerThread = 8;
    constexpr unsigned KPerThread = 8;

    constexpr unsigned GemmMPerThreadSubC = 4;
    constexpr unsigned GemmNPerThreadSubC = 4;
    constexpr unsigned GemmMLevel0Cluster = 4;
    constexpr unsigned GemmNLevel0Cluster = 2;
    constexpr unsigned GemmMLevel1Cluster = 4;
    constexpr unsigned GemmNLevel1Cluster = 4;
    constexpr unsigned GemmKPerThreadLoop = 1;
    constexpr unsigned GemmThreadPerColumnPerCluster = 8;
    constexpr unsigned GemmThreadPerRowPerCluster    = 8;

    constexpr unsigned InBlockCopyThreadPerDim0  = 4;
    constexpr unsigned InBlockCopyThreadPerDim1  = 16;
    constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
    constexpr unsigned WeiBlockCopyThreadPerDim1 = 16;

    constexpr unsigned InBlockCopyDataPerRead  = 4;
    constexpr unsigned WeiBlockCopyDataPerRead = 4;

    constexpr unsigned BlockSize = 128;
#elif 1
    // 1x1, 28x28, 256 thread
    constexpr unsigned BPerBlock = 128;
    constexpr unsigned KPerBlock = 128;
    constexpr unsigned CPerBlock = 8;

    constexpr unsigned BPerThread = 8;
    constexpr unsigned KPerThread = 8;

    constexpr unsigned GemmMPerThreadSubC = 4;
    constexpr unsigned GemmNPerThreadSubC = 4;
    constexpr unsigned GemmMLevel0Cluster = 4;
    constexpr unsigned GemmNLevel0Cluster = 4;
    constexpr unsigned GemmMLevel1Cluster = 4;
    constexpr unsigned GemmNLevel1Cluster = 4;
    constexpr unsigned GemmKPerThreadLoop = 1;
    constexpr unsigned GemmThreadPerColumnPerCluster = 8;
    constexpr unsigned GemmThreadPerRowPerCluster    = 8;

    constexpr unsigned InBlockCopyThreadPerDim0  = 4;
    constexpr unsigned InBlockCopyThreadPerDim1  = 16;
    constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
    constexpr unsigned WeiBlockCopyThreadPerDim1 = 16;

    constexpr unsigned InBlockCopyDataPerRead  = 4;
    constexpr unsigned WeiBlockCopyDataPerRead = 4;

    constexpr unsigned BlockSize = 256;
#endif

    constexpr unsigned GridSize =
        ((N * Hi * Wi + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);

    printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

    // mem
    std::size_t data_sz = sizeof(T);
    DeviceMem in_cnhw_device_buf(data_sz * (in_cnhw.mDesc.GetElementSpace() + BGhostRead +
                                            BPerBlock)); // reserve extra space for BGhostRead
    DeviceMem wei_csrk_device_buf(data_sz * wei_csrk.mDesc.GetElementSpace());
    DeviceMem out_knhw_device_buf(data_sz * out_knhw.mDesc.GetElementSpace());

    in_cnhw_device_buf.ToDevice(in_cnhw.mData.data());
    wei_csrk_device_buf.ToDevice(wei_csrk.mData.data());
    out_knhw_device_buf.ToDevice(out_knhw.mData.data());

    for(unsigned i = 0; i < nrepeat; ++i)
    {
        float time = launch_kernel(
#if 0
            gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw
#else
            gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer
#endif
                <GridSize, BlockSize, T,
                 decltype(in_cnhw_desc), decltype(wei_csrk_desc), decltype(out_knhw_desc),
                 BPerBlock, KPerBlock, CPerBlock, BPerThread, KPerThread,
                 GemmThreadPerColumnPerCluster, GemmThreadPerRowPerCluster,
                 GemmMPerThreadSubC, GemmNPerThreadSubC,
                 GemmMLevel0Cluster, GemmNLevel0Cluster, GemmMLevel1Cluster, GemmNLevel1Cluster,
                 GemmKPerThreadLoop,
                 InBlockCopyThreadPerDim0, InBlockCopyThreadPerDim1,
                 WeiBlockCopyThreadPerDim0, WeiBlockCopyThreadPerDim1,
                 InBlockCopyDataPerRead, WeiBlockCopyDataPerRead>,
            dim3(GridSize),
            dim3(BlockSize),
            static_cast<T*>(in_cnhw_device_buf.GetDeviceBuffer()),
            static_cast<T*>(wei_csrk_device_buf.GetDeviceBuffer()),
            static_cast<T*>(out_knhw_device_buf.GetDeviceBuffer()));

        printf("Elapsed time : %f ms\n", time);
        usleep(std::min(time * 1000, float(10000)));
    }

    out_knhw_device_buf.FromDevice(out_knhw.mData.data());

    // convert out_knhw to out_nkhw
    auto f_reorder_knhw2nkhw = [&](auto n, auto k, auto ho, auto wo) {
        out_nkhw(n, k, ho, wo) = out_knhw(k, n, ho, wo);
    };

    make_ParallelTensorFunctor(f_reorder_knhw2nkhw, N, K, Ho, Wo)(
        std::thread::hardware_concurrency());
}
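This deleted driver treats [N, Hi, Wi] as one flattened "B" dimension and tiles the resulting [K, B] output matrix, which is where its two-factor GridSize comes from. A standalone sketch of that grid-size math with hypothetical sizes (integer ceil division handles shapes that do not divide evenly):

    #include <cstdio>

    int main()
    {
        constexpr unsigned N = 16, Hi = 28, Wi = 28, K = 256; // hypothetical problem size
        constexpr unsigned BPerBlock = 64, KPerBlock = 128;   // from the "#elif 1" branch above

        constexpr unsigned B        = N * Hi * Wi;
        constexpr unsigned GridSize = ((B + BPerBlock - 1) / BPerBlock) *
                                      ((K + KPerBlock - 1) / KPerBlock);

        std::printf("B = %u, GridSize = %u\n", B, GridSize); // B = 12544, GridSize = 196 * 2
        return 0;
    }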
driver/driver.hip.cpp

This diff is collapsed.
src/include/blockwise_direct_convolution.hip.hpp

@@ -8,11 +8,11 @@ template <unsigned BlockSize,
           class InBlockDesc,
           class WeiBlockDesc,
           class OutBlockDesc,
-          unsigned OutTileSizeH,
-          unsigned OutTileSizeW,
           unsigned NPerThread,
           unsigned KPerThread,
-          unsigned CPerThread>
+          unsigned CPerThread,
+          unsigned HoPerThread,
+          unsigned WoPerThread>
 __device__ void blockwise_direct_convolution(InBlockDesc,
                                              Float* const __restrict__ p_in_block,
                                              WeiBlockDesc,

@@ -29,19 +29,17 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
     constexpr auto wei_block_desc = WeiBlockDesc{};
     constexpr auto out_block_desc = OutBlockDesc{};

-    constexpr unsigned S = wei_block_desc.GetLength(I2);
-    constexpr unsigned R = wei_block_desc.GetLength(I3);
+    constexpr unsigned Y = wei_block_desc.GetLength(I2);
+    constexpr unsigned X = wei_block_desc.GetLength(I3);

-    constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
-    constexpr unsigned InTileSizeW = OutTileSizeW + R - 1;
+    constexpr unsigned InTileSizeH = HoPerThread + Y - 1;
+    constexpr unsigned InTileSizeW = WoPerThread + X - 1;

     // divide thread work
     constexpr unsigned NThreadWork = (out_block_desc.GetLength(I0) + NPerThread - 1) / NPerThread;
     constexpr unsigned KThreadWork = (out_block_desc.GetLength(I1) + KPerThread - 1) / KPerThread;
-    constexpr unsigned YThreadWork = (out_block_desc.GetLength(I2) + OutTileSizeH - 1) / OutTileSizeH;
-    constexpr unsigned XThreadWork = (out_block_desc.GetLength(I3) + OutTileSizeW - 1) / OutTileSizeW;
+    constexpr unsigned YThreadWork = (out_block_desc.GetLength(I2) + HoPerThread - 1) / HoPerThread;
+    constexpr unsigned XThreadWork = (out_block_desc.GetLength(I3) + WoPerThread - 1) / WoPerThread;

 #if 0
     if(threadIdx.x == 0)

@@ -56,7 +54,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
         make_ConstantTensorDescriptor(Sequence<NPerThread, CPerThread, InTileSizeH, InTileSizeW>{});
     constexpr auto wei_thread_desc =
-        make_ConstantTensorDescriptor(Sequence<KPerThread, CPerThread, S, R>{});
+        make_ConstantTensorDescriptor(Sequence<KPerThread, CPerThread, Y, X>{});

     constexpr auto out_thread_desc =
         get_convolution_output_default_4d_tensor_descriptor(in_thread_desc, wei_thread_desc);

@@ -86,8 +84,8 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
     unsigned n_thread_data_begin = n_thread_work_id * NPerThread;
     unsigned k_thread_data_begin = k_thread_work_id * KPerThread;
-    unsigned ho_thread_data_begin = y_thread_work_id * OutTileSizeH;
-    unsigned wo_thread_data_begin = x_thread_work_id * OutTileSizeW;
+    unsigned ho_thread_data_begin = y_thread_work_id * HoPerThread;
+    unsigned wo_thread_data_begin = x_thread_work_id * WoPerThread;

     unsigned hi_thread_data_begin = ho_thread_data_begin; // minus padding
     unsigned wi_thread_data_begin = wo_thread_data_begin; // minus padding
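The InTileSize relation above says that to produce an HoPerThread x WoPerThread output tile with a Y x X filter (stride 1, no dilation), each thread must read a (HoPerThread + Y - 1) x (WoPerThread + X - 1) input tile. A standalone sketch with hypothetical values:

    #include <cstdio>

    int main()
    {
        constexpr unsigned HoPerThread = 2, WoPerThread = 2; // per-thread output tile
        constexpr unsigned Y = 3, X = 3;                     // filter size

        constexpr unsigned InTileSizeH = HoPerThread + Y - 1; // 4
        constexpr unsigned InTileSizeW = WoPerThread + X - 1; // 4

        std::printf("input tile: %u x %u\n", InTileSizeH, InTileSizeW);
        return 0;
    }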
src/include/conv_common.hip.hpp

@@ -24,11 +24,11 @@ __host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_desc
     constexpr auto WI = in_desc.GetLength(I3);

     constexpr auto K = wei_desc.GetLength(I0);
-    constexpr auto S = wei_desc.GetLength(I2);
-    constexpr auto R = wei_desc.GetLength(I3);
+    constexpr auto Y = wei_desc.GetLength(I2);
+    constexpr auto X = wei_desc.GetLength(I3);

-    constexpr auto HO = HI + 1 - S;
-    constexpr auto WO = WI + 1 - R;
+    constexpr auto HO = HI + 1 - Y;
+    constexpr auto WO = WI + 1 - X;

     return make_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
 }

@@ -55,8 +55,8 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4
     constexpr auto WI = in_desc.GetLength(I3);

     constexpr auto K = wei_desc.GetLength(I0);
-    constexpr auto S = wei_desc.GetLength(I2);
-    constexpr auto R = wei_desc.GetLength(I3);
+    constexpr auto Y = wei_desc.GetLength(I2);
+    constexpr auto X = wei_desc.GetLength(I3);

     constexpr auto HPadLow = LowerPads{}.Get(I0);
     constexpr auto WPadLow = LowerPads{}.Get(I1);

@@ -64,8 +64,8 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4
     constexpr auto HPadUp = UpperPads{}.Get(I0);
     constexpr auto WPadUp = UpperPads{}.Get(I1);

-    constexpr auto HO = HI + HPadLow + HPadUp + 1 - S;
-    constexpr auto WO = WI + WPadLow + WPadUp + 1 - R;
+    constexpr auto HO = HI + HPadLow + HPadUp + 1 - Y;
+    constexpr auto WO = WI + WPadLow + WPadUp + 1 - X;

     return make_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
 }
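Both descriptors implement the stride-1, no-dilation output-size formulas: HO = HI + 1 - Y without padding and HO = HI + HPadLow + HPadUp + 1 - Y with padding. A standalone sketch of the same arithmetic with hypothetical values:

    #include <cstdio>

    int main()
    {
        constexpr unsigned HI = 34, Y = 3;          // 34x34 input, 3x3 filter
        constexpr unsigned HO = HI + 1 - Y;         // 32

        constexpr unsigned HPadLow = 1, HPadUp = 1; // hypothetical pads
        constexpr unsigned HOPadded = HI + HPadLow + HPadUp + 1 - Y; // 34

        std::printf("HO = %u, HO (padded) = %u\n", HO, HOPadded);
        return 0;
    }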
src/include/gridwise_direct_convolution_1.hip.hpp

@@ -8,16 +8,16 @@ template <class Float,
           class InGlobalDesc,
           class WeiGlobalDesc,
           class OutGlobalDesc,
           unsigned OutTileSizeH,
           unsigned OutTileSizeW,
           unsigned NPerBlock,
           unsigned KPerBlock,
           unsigned CPerBlock,
-          unsigned YPerBlock,
-          unsigned XPerBlock,
+          unsigned HoPerBlock,
+          unsigned WoPerBlock,
           unsigned NPerThread,
           unsigned KPerThread,
           unsigned CPerThread,
           unsigned HoPerThread,
           unsigned WoPerThread,
           unsigned BlockSize,
           unsigned GridSize>
 __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_in_global,

@@ -33,25 +33,22 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
     constexpr auto wei_global_desc = WeiGlobalDesc{};
     constexpr auto out_global_desc = OutGlobalDesc{};

-    constexpr unsigned S = wei_global_desc.GetLength(I2);
-    constexpr unsigned R = wei_global_desc.GetLength(I3);
+    constexpr unsigned Y = wei_global_desc.GetLength(I2);
+    constexpr unsigned X = wei_global_desc.GetLength(I3);

-    constexpr unsigned HoPerBlock = OutTileSizeH * YPerBlock;
-    constexpr unsigned WoPerBlock = OutTileSizeW * XPerBlock;
-
-    constexpr unsigned HiPerBlock = YPerBlock * OutTileSizeH + S - 1;
-    constexpr unsigned WiPerBlock = XPerBlock * OutTileSizeW + R - 1;
+    constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
+    constexpr unsigned WiPerBlock = WoPerBlock + X - 1;

     constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
     constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
-    constexpr unsigned YBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
-    constexpr unsigned XBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
+    constexpr unsigned HBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
+    constexpr unsigned WBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;

     constexpr auto in_block_global_desc = make_ConstantTensorDescriptor(
         Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, in_global_desc.GetStrides());
     constexpr auto wei_block_global_desc = make_ConstantTensorDescriptor(
-        Sequence<KPerBlock, CPerBlock, S, R>{}, wei_global_desc.GetStrides());
+        Sequence<KPerBlock, CPerBlock, Y, X>{}, wei_global_desc.GetStrides());
     constexpr auto out_block_global_desc = make_ConstantTensorDescriptor(
         Sequence<NPerBlock, KPerBlock, HoPerBlock, WoPerBlock>{}, out_global_desc.GetStrides());

@@ -73,52 +70,21 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
     const unsigned block_id = blockIdx.x;

     unsigned itmp = block_id;
-    unsigned n_block_work_id = itmp / (KBlockWork * YBlockWork * XBlockWork);
-    itmp -= n_block_work_id * (KBlockWork * YBlockWork * XBlockWork);
-    unsigned k_block_work_id = itmp / (YBlockWork * XBlockWork);
-    itmp -= k_block_work_id * (YBlockWork * XBlockWork);
-    unsigned y_block_work_id = itmp / XBlockWork;
-    unsigned x_block_work_id = itmp - y_block_work_id * XBlockWork;
-
-    unsigned n_block_work_begin = n_block_work_id * NPerBlock;
-    unsigned k_block_work_begin = k_block_work_id * KPerBlock;
-    unsigned y_block_work_begin = y_block_work_id * YPerBlock;
-    unsigned x_block_work_begin = x_block_work_id * XPerBlock;
-
-    unsigned ho_block_work_begin = y_block_work_begin * OutTileSizeH;
-    unsigned wo_block_work_begin = x_block_work_begin * OutTileSizeW;
+    unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
+    itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork);
+    unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork);
+    itmp -= k_block_work_id * (HBlockWork * WBlockWork);
+    unsigned h_block_work_id = itmp / WBlockWork;
+    unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork;
+
+    unsigned n_block_work_begin = n_block_work_id * NPerBlock;
+    unsigned k_block_work_begin = k_block_work_id * KPerBlock;
+    unsigned ho_block_work_begin = h_block_work_id * HoPerBlock;
+    unsigned wo_block_work_begin = w_block_work_id * WoPerBlock;

     unsigned hi_block_work_begin = ho_block_work_begin; // minus padding
     unsigned wi_block_work_begin = wo_block_work_begin; // minus padding

-#if 0
-    if(threadIdx.x == 0)
-    {
-        print_ConstantTensorDescriptor( in_global_desc, "gridwise_convolution: in_global_desc: ");
-        print_ConstantTensorDescriptor(wei_global_desc, "gridwise_convolution: wei_global_desc: ");
-        print_ConstantTensorDescriptor(out_global_desc, "gridwise_convolution: out_global_desc: ");
-
-        print_ConstantTensorDescriptor( in_block_global_desc, "gridwise_convolution: in_block_global_desc: ");
-        print_ConstantTensorDescriptor(wei_block_global_desc, "gridwise_convolution: wei_block_global_desc: ");
-        print_ConstantTensorDescriptor(out_block_global_desc, "gridwise_convolution: out_block_global_desc: ");
-
-        print_ConstantTensorDescriptor( in_block_desc, "gridwise_convolution: in_block_desc: ");
-        print_ConstantTensorDescriptor(wei_block_desc, "gridwise_convolution: wei_block_desc: ");
-        print_ConstantTensorDescriptor(out_block_desc, "gridwise_convolution: out_block_desc: ");
-
-        printf("NBlockWork %u, KBlockWork %u, YBlockWork %u, XBlockWork %u \t"
-               "block_id %u, n_block_work_id %u, k_block_work_id %u, y_block_work_id %u, "
-               "x_block_work_id %u\n",
-               NBlockWork, KBlockWork, YBlockWork, XBlockWork,
-               block_id, n_block_work_id, k_block_work_id, y_block_work_id, x_block_work_id);
-    }
-#endif

     constexpr auto blockwise_in_copy =
         Blockwise4dTensorCopy1<BlockSize,
                                Float,

@@ -166,11 +132,11 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
                                          decltype(in_block_desc),
                                          decltype(wei_block_desc),
                                          decltype(out_block_desc),
-                                         OutTileSizeH,
-                                         OutTileSizeW,
                                          NPerThread,
                                          KPerThread,
-                                         CPerThread>(
+                                         CPerThread,
+                                         HoPerThread,
+                                         WoPerThread>(
             in_block_desc,
             p_in_block,
             wei_block_desc,
             p_wei_block,
             out_block_desc,
             p_out_block);

         __syncthreads();
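The decomposition above unpacks a linear blockIdx.x into (n, k, h, w) work ids by repeated division, in the same order the grid was composed. A standalone sketch of the new H/WBlockWork code path with hypothetical work counts:

    #include <cstdio>

    int main()
    {
        constexpr unsigned KBlockWork = 4, HBlockWork = 8, WBlockWork = 1;

        unsigned itmp            = 37; // pretend blockIdx.x
        unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
        itmp                    -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork);
        unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork);
        itmp                    -= k_block_work_id * (HBlockWork * WBlockWork);
        unsigned h_block_work_id = itmp / WBlockWork;
        unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork;

        std::printf("n %u, k %u, h %u, w %u\n", // 37 -> n 1, k 0, h 5, w 0
                    n_block_work_id, k_block_work_id, h_block_work_id, w_block_work_id);
        return 0;
    }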
src/include/gridwise_direct_convolution_2.hip.hpp

@@ -10,16 +10,16 @@ template <class Float,
           class InGlobalDesc,
           class WeiGlobalDesc,
           class OutGlobalDesc,
           unsigned OutTileSizeH,
           unsigned OutTileSizeW,
           unsigned NPerBlock,
           unsigned KPerBlock,
           unsigned CPerBlock,
-          unsigned YPerBlock,
-          unsigned XPerBlock,
+          unsigned HoPerBlock,
+          unsigned WoPerBlock,
           unsigned NPerThread,
           unsigned KPerThread,
           unsigned CPerThread,
           unsigned HoPerThread,
           unsigned WoPerThread,
           unsigned BlockSize,
           unsigned GridSize>
 __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_in_global,

@@ -35,20 +35,17 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
     constexpr auto wei_global_desc = WeiGlobalDesc{};
     constexpr auto out_global_desc = OutGlobalDesc{};

-    constexpr unsigned S = wei_global_desc.GetLength(I2);
-    constexpr unsigned R = wei_global_desc.GetLength(I3);
+    constexpr unsigned Y = wei_global_desc.GetLength(I2);
+    constexpr unsigned X = wei_global_desc.GetLength(I3);

-    constexpr unsigned HoPerBlock = OutTileSizeH * YPerBlock;
-    constexpr unsigned WoPerBlock = OutTileSizeW * XPerBlock;
-
-    constexpr unsigned HiPerBlock = YPerBlock * OutTileSizeH + S - 1;
-    constexpr unsigned WiPerBlock = XPerBlock * OutTileSizeW + R - 1;
+    constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
+    constexpr unsigned WiPerBlock = WoPerBlock + X - 1;

     constexpr auto in_block_desc =
         make_ConstantTensorDescriptor(Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{});
     constexpr auto wei_block_desc =
-        make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, S, R>{});
+        make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, Y, X>{});

     // shared mem
     constexpr unsigned in_block_size = in_block_desc.GetElementSpace();

@@ -58,14 +55,14 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
     __shared__ Float p_wei_block[wei_block_size];

     // threadwise tensors
-    constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
-    constexpr unsigned InTileSizeW = OutTileSizeW + R - 1;
+    constexpr unsigned HiPerThread = HoPerThread + Y - 1;
+    constexpr unsigned WiPerThread = WoPerThread + X - 1;

     constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor(
-        Sequence<NPerThread, CPerThread, InTileSizeH, InTileSizeW>{}, in_block_desc.GetStrides());
+        Sequence<NPerThread, CPerThread, HiPerThread, WiPerThread>{}, in_block_desc.GetStrides());
     constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor(
-        Sequence<KPerThread, CPerThread, S, R>{}, wei_block_desc.GetStrides());
+        Sequence<KPerThread, CPerThread, Y, X>{}, wei_block_desc.GetStrides());

     constexpr auto out_thread_desc = get_convolution_output_default_4d_tensor_descriptor(
         in_thread_block_desc, wei_thread_block_desc);

@@ -76,26 +73,23 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
     // divide block work
     constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
     constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
-    constexpr unsigned YBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
-    constexpr unsigned XBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
+    constexpr unsigned HBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
+    constexpr unsigned WBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;

     const unsigned block_id = blockIdx.x;

     unsigned itmp = block_id;
-    const unsigned n_block_work_id = itmp / (KBlockWork * YBlockWork * XBlockWork);
-    itmp -= n_block_work_id * (KBlockWork * YBlockWork * XBlockWork);
-    const unsigned k_block_work_id = itmp / (YBlockWork * XBlockWork);
-    itmp -= k_block_work_id * (YBlockWork * XBlockWork);
-    const unsigned y_block_work_id = itmp / XBlockWork;
-    const unsigned x_block_work_id = itmp - y_block_work_id * XBlockWork;
-
-    const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
-    const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
-    const unsigned y_block_data_begin = y_block_work_id * YPerBlock;
-    const unsigned x_block_data_begin = x_block_work_id * XPerBlock;
-
-    const unsigned ho_block_data_begin = y_block_data_begin * OutTileSizeH;
-    const unsigned wo_block_data_begin = x_block_data_begin * OutTileSizeW;
+    const unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
+    itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork);
+    const unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork);
+    itmp -= k_block_work_id * (HBlockWork * WBlockWork);
+    const unsigned h_block_work_id = itmp / WBlockWork;
+    const unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork;
+
+    const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
+    const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
+    const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
+    const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;

     const unsigned hi_block_data_begin = ho_block_data_begin; // minus padding
     const unsigned wi_block_data_begin = wo_block_data_begin; // minus padding

@@ -103,45 +97,27 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
     // divide thread work
     constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread;
     constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread;
-    constexpr unsigned YThreadWork = YPerBlock;
-    constexpr unsigned XThreadWork = XPerBlock;
+    constexpr unsigned HThreadWork = (HoPerBlock + HoPerThread - 1) / HoPerThread;
+    constexpr unsigned WThreadWork = (WoPerBlock + WoPerThread - 1) / WoPerThread;

     const unsigned thread_id = threadIdx.x;

     itmp = thread_id;
-    const unsigned n_thread_work_id = itmp / (KThreadWork * YThreadWork * XThreadWork);
-    itmp -= n_thread_work_id * (KThreadWork * YThreadWork * XThreadWork);
-    const unsigned k_thread_work_id = itmp / (YThreadWork * XThreadWork);
-    itmp -= k_thread_work_id * (YThreadWork * XThreadWork);
-    const unsigned y_thread_work_id = itmp / XThreadWork;
-    const unsigned x_thread_work_id = itmp - y_thread_work_id * XThreadWork;
+    const unsigned n_thread_work_id = itmp / (KThreadWork * HThreadWork * WThreadWork);
+    itmp -= n_thread_work_id * (KThreadWork * HThreadWork * WThreadWork);
+    const unsigned k_thread_work_id = itmp / (HThreadWork * WThreadWork);
+    itmp -= k_thread_work_id * (HThreadWork * WThreadWork);
+    const unsigned h_thread_work_id = itmp / WThreadWork;
+    const unsigned w_thread_work_id = itmp - h_thread_work_id * WThreadWork;

     const unsigned n_thread_data_begin = n_thread_work_id * NPerThread;
     const unsigned k_thread_data_begin = k_thread_work_id * KPerThread;
-    const unsigned ho_thread_data_begin = y_thread_work_id * OutTileSizeH;
-    const unsigned wo_thread_data_begin = x_thread_work_id * OutTileSizeW;
+    const unsigned ho_thread_data_begin = h_thread_work_id * HoPerThread;
+    const unsigned wo_thread_data_begin = w_thread_work_id * WoPerThread;

     const unsigned hi_thread_data_begin = ho_thread_data_begin;
     const unsigned wi_thread_data_begin = wo_thread_data_begin;

-#if 0
-    if(threadIdx.x == 0)
-    {
-        print_ConstantTensorDescriptor(in_global_desc, "gridwise_convolution: in_global_desc: ");
-        print_ConstantTensorDescriptor(wei_global_desc, "gridwise_convolution: wei_global_desc: ");
-        print_ConstantTensorDescriptor(out_global_desc, "gridwise_convolution: out_global_desc: ");
-    }
-
-    printf("threadIdx.x %u \t"
-           "n_thread_data_begin %u, k_thread_data_begin %u, ho_thread_data_begin %u, "
-           "wo_thread_data_begin %u\n",
-           threadIdx.x,
-           n_thread_data_begin, k_thread_data_begin, ho_thread_data_begin, wo_thread_data_begin);
-#endif

     constexpr auto blockwise_in_copy =
         Blockwise4dTensorCopy1<BlockSize,
                                Float,
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.hip.hpp

@@ -62,11 +62,11 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
     constexpr unsigned Wo = out_khwn_global_desc.GetLength(I2);
     constexpr unsigned N  = out_khwn_global_desc.GetLength(I3);

-    constexpr unsigned S = wei_csrk_global_desc.GetLength(I1);
-    constexpr unsigned R = wei_csrk_global_desc.GetLength(I2);
+    constexpr unsigned Y = wei_csrk_global_desc.GetLength(I1);
+    constexpr unsigned X = wei_csrk_global_desc.GetLength(I2);

-    constexpr unsigned HiPerBlock = HoPerBlock + S - 1;
-    constexpr unsigned WiPerBlock = WoPerBlock + R - 1;
+    constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
+    constexpr unsigned WiPerBlock = WoPerBlock + X - 1;

     // divide block work: [K, Ho, Wo, N]
     constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock;

@@ -90,7 +90,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
     const unsigned wi_block_data_begin = wo_block_data_begin;

     // flattend (2d) tensor view of gridwise weight
-    constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * S * R, K>{});
+    constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * Y * X, K>{});

     // tensor view of blockwise input and weight in LDS
     //   be careful of alignment

@@ -98,10 +98,10 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
         Sequence<CPerBlock, HiPerBlock, WiPerBlock, NPerBlock>{}, Number<InBlockCopyDataPerRead>{});

     constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor_aligned(
-        Sequence<CPerBlock * S * R, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
+        Sequence<CPerBlock * Y * X, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
     constexpr auto wei_csrk_block_desc = make_ConstantTensorDescriptor_aligned(
-        Sequence<CPerBlock, S, R, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
+        Sequence<CPerBlock, Y, X, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});

     // tensor view of threadwise output in register
     constexpr auto out_khwn_thread_desc =

@@ -118,7 +118,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
         InBlockCopyDataPerRead>{};

     // blockwise wei copy
-    //   format is [CPerBlock*S*R,KPerBlock]
+    //   format is [CPerBlock*Y*X,KPerBlock]
     const auto blockwise_wei_copy = Blockwise2dTensorCopy3<BlockSize,
                                                            Float,
                                                            decltype(wei_ek_global_desc),

@@ -129,7 +129,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
     // a series of blockwise batched GEMM
     //   C_matrix += transpose(A_matrix) * B_matrix
    //   A_matrix and B_matrix saved in LDS, C_matrix saved in register
-    //   A_matrix[C,K] is a sub-matrix of wei_block[C,S,R,K]
+    //   A_matrix[C,K] is a sub-matrix of wei_block[C,Y,X,K]
     //   B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
     //   C_matrix[K,Wo*N] is a sub-matrix of out_block[K,Ho,Wo,N]
     constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(

@@ -204,9 +204,9 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
         __syncthreads();

         // a series of batched GEMM
-        for(unsigned s = 0; s < S; ++s)
+        for(unsigned s = 0; s < Y; ++s)
         {
-            for(unsigned r = 0; r < R; ++r)
+            for(unsigned r = 0; r < X; ++r)
             {
                 blockwise_batch_gemm.Run(
                     p_wei_block + wei_csrk_block_desc.Get1dIndex(0, s, r, 0),
                     p_in_block + in_chwn_block_desc.Get1dIndex(0, s, r, 0),
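The loop above runs one blockwise GEMM per filter tap (y, x), accumulating C[K][B] += transpose(A[C][K]) * B[C][B]. A standalone sketch of that accumulation structure with tiny hypothetical sizes; plain loops stand in for the blockwise GEMM, and A and B are reused across taps purely to keep the sketch short (the real kernel re-slices LDS per tap):

    #include <vector>

    int main()
    {
        constexpr unsigned C = 2, K = 3, B = 4; // hypothetical sub-matrix sizes
        constexpr unsigned Y = 3, X = 3;        // filter taps

        std::vector<float> a(C * K, 1.0f), b(C * B, 1.0f), c(K * B, 0.0f);

        for(unsigned y = 0; y < Y; ++y)
            for(unsigned x = 0; x < X; ++x)
                // one "batched GEMM" step: c[k][bb] += sum_cc a[cc][k] * b[cc][bb]
                for(unsigned cc = 0; cc < C; ++cc)
                    for(unsigned k = 0; k < K; ++k)
                        for(unsigned bb = 0; bb < B; ++bb)
                            c[k * B + bb] += a[cc * K + k] * b[cc * B + bb];
        return 0;
    }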
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.hip.hpp

@@ -55,8 +55,8 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
     constexpr unsigned Wo = out_khwn_global_desc.GetLength(I2);
     constexpr unsigned N  = out_khwn_global_desc.GetLength(I3);

-    constexpr unsigned S = wei_csrk_global_desc.GetLength(I1);
-    constexpr unsigned R = wei_csrk_global_desc.GetLength(I2);
+    constexpr unsigned Y = wei_csrk_global_desc.GetLength(I1);
+    constexpr unsigned X = wei_csrk_global_desc.GetLength(I2);

     constexpr unsigned HPadLow = LowerPads{}.Get(I0);
     constexpr unsigned WPadLow = LowerPads{}.Get(I1);

@@ -64,8 +64,8 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
     constexpr unsigned HPadUp = UpperPads{}.Get(I0);
     constexpr unsigned WPadUp = UpperPads{}.Get(I1);

-    constexpr unsigned HiPerBlock = HoPerBlock + S - 1;
-    constexpr unsigned WiPerBlock = WoPerBlock + R - 1;
+    constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
+    constexpr unsigned WiPerBlock = WoPerBlock + X - 1;

     // divide block work: [K, Ho, Wo, N]
     constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock;

@@ -86,18 +86,18 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
     const unsigned n_block_data_begin = n_block_work_id * NPerBlock;

     // flattened (2d) tensor view of wei in global mem
-    constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * S * R, K>{});
+    constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * Y * X, K>{});

     // tensor view of blockwise input and weight in LDS
     constexpr auto in_chwn_block_desc = make_ConstantTensorDescriptor(
         Sequence<CPerBlock, HiPerBlock, WiPerBlock, NPerBlock>{});
     constexpr auto wei_csrk_block_desc =
-        make_ConstantTensorDescriptor(Sequence<CPerBlock, S, R, KPerBlock>{});
+        make_ConstantTensorDescriptor(Sequence<CPerBlock, Y, X, KPerBlock>{});

     // flattened (2d) tensor view of wei in LDS
     constexpr auto wei_ek_block_desc =
-        make_ConstantTensorDescriptor(Sequence<CPerBlock * S * R, KPerBlock>{});
+        make_ConstantTensorDescriptor(Sequence<CPerBlock * Y * X, KPerBlock>{});

     // tensor view of threadwise output in register
     constexpr auto out_hkwn_thread_desc =

@@ -144,7 +144,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
         LowerPads>{};

 #if 0
-    // weight: format is [C,S,R,K]
+    // weight: format is [C,Y,X,K]
     constexpr auto blockwise_wei_copy =
         Blockwise4dTensorCopy1<BlockSize,
                                Float,

@@ -152,7 +152,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
                                decltype(wei_csrk_block_desc),
                                decltype(wei_csrk_block_desc.GetLengths())>{};
 #elif 0
-    // weight: format is [C*S*R,K]
+    // weight: format is [C*Y*X,K]
     constexpr auto blockwise_wei_copy =
         Blockwise2dTensorCopy1<BlockSize,
                                Float,

@@ -160,7 +160,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
                                decltype(wei_ek_block_desc),
                                decltype(wei_ek_block_desc.GetLengths())>{};
 #elif 1
-    // weight: format is [C*S*R,K]
+    // weight: format is [C*Y*X,K]
     const auto blockwise_wei_copy =
         Blockwise2dTensorCopy2<BlockSize,
                                Float,
                                decltype(wei_ek_global_desc),

@@ -173,7 +173,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
     // a series of blockwise batched GEMM
     //   C_matrix += transpose(A_matrix) * B_matrix
     //   A_matrix and B_matrix saved in LDS, C_matrix saved in register
-    //   A_matrix[C,K] is a sub-matrix of wei_block[C,S,R,K]
+    //   A_matrix[C,K] is a sub-matrix of wei_block[C,Y,X,K]
     //   B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
     //   C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
     constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(

@@ -245,9 +245,9 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
         __syncthreads();

         // a series of batched GEMM
-        for(unsigned s = 0; s < S; ++s)
+        for(unsigned s = 0; s < Y; ++s)
         {
-            for(unsigned r = 0; r < R; ++r)
+            for(unsigned r = 0; r < X; ++r)
             {
                 auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.hip.hpp deleted 100644 → 0
#pragma once
#include "common.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp"
#include "ConstantMatrixDescriptor.hip.hpp"
#include "blockwise_4d_tensor_op.hip.hpp"
#include "blockwise_2d_tensor_op.hip.hpp"
#include "threadwise_4d_tensor_op.hip.hpp"
#include "blockwise_gemm.hip.hpp"

template <unsigned GridSize,
          unsigned BlockSize,
          class Float,
          class InGlobalDesc,
          class WeiGlobalDesc,
          class OutGlobalDesc,
          class LowerPads,
          class UpperPads,
          unsigned NPerBlock,
          unsigned KPerBlock,
          unsigned CPerBlock,
          unsigned HoPerBlock,
          unsigned WoPerBlock,
          unsigned NPerThread,
          unsigned KPerThread,
          unsigned CPerThread,
          unsigned HoPerThread,
          unsigned WoPerThread,
          unsigned WeiBlockCopyThreadPerDim0,
          unsigned WeiBlockCopyThreadPerDim1>
__global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline(
    const Float* const __restrict__ p_in_global,
    const Float* const __restrict__ p_wei_global,
    Float* const __restrict__ p_out_global)
{
    // NPerThread == NPerBlock, because the format of input in LDS [C,Hi,Wi,N]
    // for GEMM trans([C,K]) * [C,Wo*N], we need a thread to do all the "N"
    // if we use [C,Hi,N,Wi,N] in LDS, then NPerThread can be different from NPerBlock
    static_assert(NPerBlock % NPerThread == 0, "wrong! NPerBlock % NPerThread !=0");
    static_assert((NPerThread < NPerBlock && WoPerThread == 1) || NPerThread == NPerBlock,
                  "wrong!");

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto in_chwn_global_desc  = InGlobalDesc{};
    constexpr auto wei_csrk_global_desc = WeiGlobalDesc{};
    constexpr auto out_khwn_global_desc = OutGlobalDesc{};

    constexpr unsigned C  = in_chwn_global_desc.GetLength(I0);
    constexpr unsigned K  = out_khwn_global_desc.GetLength(I0);
    constexpr unsigned Ho = out_khwn_global_desc.GetLength(I1);
    constexpr unsigned Wo = out_khwn_global_desc.GetLength(I2);
    constexpr unsigned N  = out_khwn_global_desc.GetLength(I3);

    constexpr unsigned S = wei_csrk_global_desc.GetLength(I1);
    constexpr unsigned R = wei_csrk_global_desc.GetLength(I2);

    constexpr unsigned HPadLow = LowerPads{}.Get(I0);
    constexpr unsigned WPadLow = LowerPads{}.Get(I1);

    constexpr unsigned HPadUp = UpperPads{}.Get(I0);
    constexpr unsigned WPadUp = UpperPads{}.Get(I1);

    constexpr unsigned HiPerBlock = HoPerBlock + S - 1;
    constexpr unsigned WiPerBlock = WoPerBlock + R - 1;

    // divide block work: [K, Ho, Wo, N]
    constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock;
    constexpr unsigned HBlockWork = (Ho + HoPerBlock - 1) / HoPerBlock;
    constexpr unsigned WBlockWork = (Wo + WoPerBlock - 1) / WoPerBlock;
    constexpr unsigned NBlockWork = (N + NPerBlock - 1) / NPerBlock;

    const unsigned k_block_work_id = get_block_1d_id() / (HBlockWork * WBlockWork * NBlockWork);
    unsigned itmp = get_block_1d_id() - k_block_work_id * (HBlockWork * WBlockWork * NBlockWork);
    const unsigned h_block_work_id = itmp / (WBlockWork * NBlockWork);
    itmp -= h_block_work_id * (WBlockWork * NBlockWork);
    const unsigned w_block_work_id = itmp / NBlockWork;
    const unsigned n_block_work_id = itmp - w_block_work_id * NBlockWork;

    const unsigned k_block_data_begin  = k_block_work_id * KPerBlock;
    const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
    const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;
    const unsigned n_block_data_begin  = n_block_work_id * NPerBlock;

    // flattened (2d) tensor view of wei in global mem
    constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * S * R, K>{});

    // tensor view of blockwise input and weight in LDS
    constexpr auto in_chwn_block_desc = make_ConstantTensorDescriptor(
        Sequence<CPerBlock, HiPerBlock, WiPerBlock, NPerBlock>{});

    constexpr auto wei_csrk_block_desc =
        make_ConstantTensorDescriptor(Sequence<CPerBlock, S, R, KPerBlock>{});

    // flattened (2d) tensor view of wei in LDS
    constexpr auto wei_ek_block_desc =
        make_ConstantTensorDescriptor(Sequence<CPerBlock * S * R, KPerBlock>{});

    // tensor view of threadwise output in register
    constexpr auto out_hkwn_thread_desc = make_ConstantTensorDescriptor(
        Sequence<HoPerThread, KPerThread, WoPerThread, NPerThread>{});

#if 0
    if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
    {
        print_ConstantTensorDescriptor(in_chwn_block_desc, "in_chwn_block_desc");
        print_ConstantTensorDescriptor(wei_csrk_block_desc, "wei_csrk_block_desc");
        print_ConstantTensorDescriptor(out_hkwn_thread_desc, "out_hkwn_thread_desc");
    }
#endif

    // blockwise copy
    // input: format is [C, Hi, Wi, N]
    const unsigned h_block_pad_low = h_block_work_id == 0 ? HPadLow : 0;
    const unsigned w_block_pad_low = w_block_work_id == 0 ? WPadLow : 0;

    const unsigned h_block_pad_up = h_block_work_id == HBlockWork - 1 ? HPadUp : 0;
    const unsigned w_block_pad_up = w_block_work_id == WBlockWork - 1 ? WPadUp : 0;

#if 0
    if(get_thread_local_1d_id() == 0)
    {
        printf(
            "%u %u, h_block_pad_low %u w_block_pad_low %u h_block_pad_up %u w_block_pad_up %u\n",
            get_block_1d_id(),
            get_thread_local_1d_id(),
            h_block_pad_low,
            w_block_pad_low,
            h_block_pad_up,
            w_block_pad_up);
    }
#endif

    constexpr auto blockwise_in_copy =
        BlockwiseChwnTensorCopyPadded<BlockSize,
                                      Float,
                                      decltype(in_chwn_global_desc),
                                      decltype(in_chwn_block_desc),
                                      decltype(in_chwn_block_desc.GetLengths()),
                                      LowerPads>{};

#if 0
    // weight: format is [C,S,R,K]
    constexpr auto blockwise_wei_copy =
        Blockwise4dTensorCopy1<BlockSize,
                               Float,
                               decltype(wei_csrk_global_desc),
                               decltype(wei_csrk_block_desc),
                               decltype(wei_csrk_block_desc.GetLengths())>{};
#elif 0
    // weight: format is [C*S*R,K]
    constexpr auto blockwise_wei_copy =
        Blockwise2dTensorCopy1<BlockSize,
                               Float,
                               decltype(wei_ek_global_desc),
                               decltype(wei_ek_block_desc),
                               decltype(wei_ek_block_desc.GetLengths())>{};
#elif 1
    // weight: format is [C*S*R,K]
    const auto blockwise_wei_copy =
        Blockwise2dTensorCopy2<BlockSize,
                               Float,
                               decltype(wei_ek_global_desc),
                               decltype(wei_ek_block_desc),
                               decltype(wei_ek_block_desc.GetLengths()),
                               WeiBlockCopyThreadPerDim0,
                               WeiBlockCopyThreadPerDim1>{};
#endif

    // a series of blockwise batched GEMM
    // C_matrix += transpose(A_matrix) * B_matrix
    // A_matrix and B_matrix saved in LDS, C_matrix saved in register
    // A_matrix[C,K] is a sub-matrix of wei_block[C,S,R,K]
    // B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
    // C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
    constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
        Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_csrk_block_desc.GetStride(I0)>{});

    constexpr auto b_cxwn_block_mtx_desc =
        make_ConstantMatrixDescriptor(Number<CPerBlock>{},
                                      Number<WoPerBlock * NPerBlock>{},
                                      Number<in_chwn_block_desc.GetStride(I0)>{});

    constexpr auto c_kxwn_thread_mtx_desc =
        make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<WoPerThread * NPerThread>{});

    const auto blockwise_batch_gemm =
        Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize,
                                                         decltype(a_cxk_block_mtx_desc),
                                                         decltype(b_cxwn_block_mtx_desc),
                                                         decltype(c_kxwn_thread_mtx_desc),
                                                         true,
                                                         false,
                                                         false,
                                                         0,
                                                         in_chwn_block_desc.GetStride(I1),
                                                         out_hkwn_thread_desc.GetStride(I0),
                                                         HoPerBlock,
                                                         HoPerThread,
                                                         CPerThread,
                                                         true>{};

    // LDS
    constexpr unsigned in_block_size  = in_chwn_block_desc.GetElementSpace();
    constexpr unsigned wei_block_size = wei_csrk_block_desc.GetElementSpace();

    // LDS double buffer
    __shared__ Float p_in_block_0[in_block_size];
    __shared__ Float p_wei_block_0[wei_block_size];
    __shared__ Float p_in_block_1[in_block_size];
    __shared__ Float p_wei_block_1[wei_block_size];

    // register
    Float p_out_thread[out_hkwn_thread_desc.GetElementSpace()];

    // set threadwise output tensor to 0
    threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread);

    const Float* p_wei_global_block_begin =
        p_wei_global + wei_ek_global_desc.Get1dIndex(0, k_block_data_begin);

    // prolog: load data
    // input: global mem to LDS,
    blockwise_in_copy.Run(p_in_global,
                          0,
                          ho_block_data_begin,
                          wo_block_data_begin,
                          n_block_data_begin,
                          p_in_block_0,
                          h_block_pad_low,
                          w_block_pad_low,
                          h_block_pad_up,
                          w_block_pad_up);

    // weight: global mem to LDS,
    blockwise_wei_copy.Run(p_wei_global_block_begin, p_wei_block_0);

    p_wei_global_block_begin += CPerBlock * wei_ek_global_desc.GetStride(I0);

    bool even_loop = true;

    for(unsigned c_block_data_begin = CPerBlock; c_block_data_begin < C;
        c_block_data_begin += CPerBlock,
        p_wei_global_block_begin += CPerBlock * wei_ek_global_desc.GetStride(I0),
        even_loop = !even_loop)
    {
        __syncthreads();

        Float* p_in_block_now  = even_loop ? p_in_block_0 : p_in_block_1;
        Float* p_wei_block_now = even_loop ? p_wei_block_0 : p_wei_block_1;

        Float* p_in_block_next  = even_loop ? p_in_block_1 : p_in_block_0;
        Float* p_wei_block_next = even_loop ? p_wei_block_1 : p_wei_block_0;

        // preload next data
#if 1
        // input: global mem to LDS,
        blockwise_in_copy.Run(p_in_global,
                              c_block_data_begin,
                              ho_block_data_begin,
                              wo_block_data_begin,
                              n_block_data_begin,
                              p_in_block_next,
                              h_block_pad_low,
                              w_block_pad_low,
                              h_block_pad_up,
                              w_block_pad_up);
#endif

#if 1
        // weight: global mem to LDS,
        blockwise_wei_copy.Run(p_wei_global_block_begin, p_wei_block_next);
#endif

        // a series of batched GEMM
        for(unsigned s = 0; s < S; ++s)
        {
            for(unsigned r = 0; r < R; ++r)
            {
                auto f_accum = [](auto& acc, const auto&& v) { acc += v; };

                blockwise_batch_gemm.Run(
                    p_wei_block_now + wei_csrk_block_desc.Get1dIndex(0, s, r, 0),
                    p_in_block_now + in_chwn_block_desc.Get1dIndex(0, s, r, 0),
                    p_out_thread,
                    f_accum);
            }
        }
    }

    // last computation
    {
        __syncthreads();

        Float* p_in_block_now  = even_loop ? p_in_block_0 : p_in_block_1;
        Float* p_wei_block_now = even_loop ? p_wei_block_0 : p_wei_block_1;

        // a series of batched GEMM
        for(unsigned s = 0; s < S; ++s)
        {
            for(unsigned r = 0; r < R; ++r)
            {
                auto f_accum = [](auto& acc, const auto&& v) { acc += v; };

                blockwise_batch_gemm.Run(
                    p_wei_block_now + wei_csrk_block_desc.Get1dIndex(0, s, r, 0),
                    p_in_block_now + in_chwn_block_desc.Get1dIndex(0, s, r, 0),
                    p_out_thread,
                    f_accum);
            }
        }
    }

    const auto matrix_c_index =
        blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());

    const unsigned ho_thread_data_begin = matrix_c_index.batch;
    const unsigned k_thread_data_begin  = matrix_c_index.row;
    const unsigned wo_thread_data_begin = matrix_c_index.col / NPerBlock;
    const unsigned n_thread_data_begin  = matrix_c_index.col - wo_thread_data_begin * NPerBlock;

#if 0
    printf("block %u %u, %u %u %u %u, %u %u %u %u, %f \n",
           get_block_1d_id(), get_thread_local_1d_id(),
           ho_block_data_begin, k_block_data_begin, wo_block_data_begin, n_block_data_begin,
           ho_thread_data_begin, k_thread_data_begin, wo_thread_data_begin, n_thread_data_begin,
           p_out_thread[0]);
#endif

    // output: register to global mem,
    // convert out_thread[Ho,K,Wo,N] to out_global[K,Ho,Wo,N]
    constexpr auto reorder_khwn_from_hkwn = Sequence<1, 0, 2, 3>{};

    threadwise_4d_tensor_copy_reorder_by_get_dst_from_src(
        out_hkwn_thread_desc,
        p_out_thread,
        out_khwn_global_desc,
        p_out_global +
            out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
                                            ho_block_data_begin + ho_thread_data_begin,
                                            wo_block_data_begin + wo_thread_data_begin,
                                            n_block_data_begin + n_thread_data_begin),
        out_hkwn_thread_desc.GetLengths(),
        reorder_khwn_from_hkwn);
}
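The `even_loop` ping-pong in the deleted pipeline above is the standard double-buffering pattern: while the batched GEMMs consume the `now` buffers, the blockwise copies fill the `next` pair, with a prolog load before the loop and an epilog compute after it. The same control flow reduced to a single-threaded C++ sketch (`load_slice`/`compute_slice` are hypothetical stand-ins for the copies and GEMMs; on the GPU the buffer swap must additionally be fenced by `__syncthreads()` as shown above):

#include <cstdio>

// Ping-pong pipeline over `nSlices` slices using two buffers:
// slice i+1 is loaded into the idle buffer while slice i is computed.
template <class LoadFn, class ComputeFn>
void pipelined(unsigned nSlices, LoadFn load_slice, ComputeFn compute_slice)
{
    float buf0[16], buf1[16];
    float* bufs[2] = {buf0, buf1};

    load_slice(0, bufs[0]); // prolog: fill the first buffer

    for(unsigned i = 0; i + 1 < nSlices; ++i)
    {
        load_slice(i + 1, bufs[(i + 1) % 2]); // prefetch the next slice
        compute_slice(i, bufs[i % 2]);        // consume the current slice
    }

    compute_slice(nSlices - 1, bufs[(nSlices - 1) % 2]); // epilog: last computation
}

int main()
{
    pipelined(4,
              [](unsigned i, float* b) { b[0] = float(i); std::printf("load %u\n", i); },
              [](unsigned i, float* b) { std::printf("compute %u -> %g\n", i, b[0]); });
    return 0;
}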
src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.hip.hpp deleted 100644 → 0
#pragma once
#include "common.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp"
#include "ConstantMatrixDescriptor.hip.hpp"
#include "blockwise_4d_tensor_op.hip.hpp"
#include "threadwise_4d_tensor_op.hip.hpp"
#include "blockwise_gemm.hip.hpp"

template <unsigned GridSize,
          unsigned BlockSize,
          class Float,
          class InGlobalDesc,
          class WeiGlobalDesc,
          class OutGlobalDesc,
          unsigned NPerBlock,
          unsigned KPerBlock,
          unsigned CPerBlock,
          unsigned HoPerBlock,
          unsigned WoPerBlock,
          unsigned KPerThread,
          unsigned CPerThread,
          unsigned HoPerThread,
          unsigned WoPerThread>
__global__ void gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(
    const Float* const __restrict__ p_in_global,
    const Float* const __restrict__ p_wei_global,
    Float* const __restrict__ p_out_global)
{
    // NPerThread == NPerBlock, because the format of input in LDS [C,Hi,Wi,N]
    // for GEMM trans([C,K]) * [C,Wo*N], we need a thread to do all the "N"
    // if we use [C,Hi,N,Wi,N] in LDS, then NPerThread can be different from NPerBlock
    constexpr unsigned NPerThread = NPerBlock;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto in_nchw_global_desc  = InGlobalDesc{};
    constexpr auto wei_kcsr_global_desc = WeiGlobalDesc{};
    constexpr auto out_nkhw_global_desc = OutGlobalDesc{};

    constexpr unsigned S = wei_kcsr_global_desc.GetLength(I2);
    constexpr unsigned R = wei_kcsr_global_desc.GetLength(I3);

    constexpr unsigned HiPerBlock = HoPerBlock + S - 1;
    constexpr unsigned WiPerBlock = WoPerBlock + R - 1;

    // divide block work: NCHW
    constexpr unsigned NBlockWork =
        (out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
    constexpr unsigned KBlockWork =
        (out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
    constexpr unsigned HBlockWork =
        (out_nkhw_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
    constexpr unsigned WBlockWork =
        (out_nkhw_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;

    // tensor view of un-reordered blockwise input and weight (imaginary)
    constexpr auto in_nchw_block_desc =
        make_ConstantTensorDescriptor(Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{});
    constexpr auto wei_kcsr_block_desc =
        make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, S, R>{});

    // tensor view of reordered blockwise input and weight in LDS
    constexpr auto reorder_srck_from_kcsr = Sequence<2, 3, 1, 0>{};
    constexpr auto wei_srck_block_desc    = make_ConstantTensorDescriptor(
        wei_kcsr_block_desc.GetLengths().ReorderByGetNewFromOld(reorder_srck_from_kcsr));

    constexpr auto reorder_chwn_from_nchw = Sequence<1, 2, 3, 0>{};
    constexpr auto in_chwn_block_desc     = make_ConstantTensorDescriptor(
        in_nchw_block_desc.GetLengths().ReorderByGetNewFromOld(reorder_chwn_from_nchw));

    // tensor view of threadwise output in register
    constexpr auto out_hkwn_thread_desc = make_ConstantTensorDescriptor(
        Sequence<HoPerThread, KPerThread, WoPerThread, NPerThread>{});

#if 0
    if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
    {
        print_ConstantTensorDescriptor(in_nchw_block_desc, "in_nchw_block_desc");
        print_ConstantTensorDescriptor(in_chwn_block_desc, "in_chwn_block_desc");
        print_ConstantTensorDescriptor(wei_kcsr_block_desc, "wei_kcsr_block_desc");
        print_ConstantTensorDescriptor(wei_srck_block_desc, "wei_srck_block_desc");
        print_ConstantTensorDescriptor(out_hkwn_thread_desc, "out_hkwn_thread_desc");
    }
#endif

    // my block work
    unsigned itmp                  = get_block_1d_id();
    const unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
    itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork);
    const unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork);
    itmp -= k_block_work_id * (HBlockWork * WBlockWork);
    const unsigned h_block_work_id = itmp / WBlockWork;
    const unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork;

    const unsigned n_block_data_begin  = n_block_work_id * NPerBlock;
    const unsigned k_block_data_begin  = k_block_work_id * KPerBlock;
    const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
    const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;

    const unsigned hi_block_data_begin = ho_block_data_begin;
    const unsigned wi_block_data_begin = wo_block_data_begin;

    // a series of blockwise batched GEMM
    // C_matrix += transpose(A_matrix) * B_matrix
    // A_matrix and B_matrix saved in LDS, C_matrix saved in register
    // A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K]
    // B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
    // C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
    constexpr auto a_cxk_block_mtx_desc =
        make_ConstantMatrixDescriptor(Number<CPerBlock>{}, Number<KPerBlock>{});
    constexpr auto b_cxwn_block_mtx_desc =
        make_ConstantMatrixDescriptor(Number<CPerBlock>{},
                                      Number<WoPerBlock * NPerBlock>{},
                                      Number<in_chwn_block_desc.GetStride(I0)>{});
    constexpr auto c_kxwn_thread_mtx_desc =
        make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<WoPerThread * NPerThread>{});

    const auto blockwise_batch_gemm =
        Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize,
                                                         decltype(a_cxk_block_mtx_desc),
                                                         decltype(b_cxwn_block_mtx_desc),
                                                         decltype(c_kxwn_thread_mtx_desc),
                                                         true,
                                                         false,
                                                         false,
                                                         0,
                                                         in_chwn_block_desc.GetStride(I1),
                                                         out_hkwn_thread_desc.GetStride(I0),
                                                         HoPerBlock,
                                                         HoPerThread,
                                                         CPerThread,
                                                         true>{};

    // LDS
    constexpr unsigned in_block_size  = in_chwn_block_desc.GetElementSpace();
    constexpr unsigned wei_block_size = wei_srck_block_desc.GetElementSpace();

    __shared__ Float p_in_block[in_block_size];
    __shared__ Float p_wei_block[wei_block_size];

    // register
    Float p_out_thread[out_hkwn_thread_desc.GetElementSpace()];

    // set threadwise output tensor to 0
    threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread);

    for(unsigned c_block_data_begin = 0; c_block_data_begin < in_nchw_global_desc.GetLength(I1);
        c_block_data_begin += CPerBlock, __syncthreads())
    {
#if 1
        // input: global mem to LDS,
        // convert [N,C,Hi,Wi] to [C,Hi,Wi,N]
        blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
            in_nchw_global_desc,
            p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin,
                                                         c_block_data_begin,
                                                         hi_block_data_begin,
                                                         wi_block_data_begin),
            in_chwn_block_desc,
            p_in_block,
            in_nchw_block_desc.GetLengths(),
            reorder_chwn_from_nchw);
#else
        // input: global mem to LDS,
        // no format conversion, this is wrong, for performance study only!
        Blockwise4dTensorCopy<BlockSize>(in_nchw_global_desc,
                                         p_in_global +
                                             in_nchw_global_desc.Get1dIndex(n_block_data_begin,
                                                                            c_block_data_begin,
                                                                            hi_block_data_begin,
                                                                            wi_block_data_begin),
                                         in_nchw_block_desc,
                                         p_in_block,
                                         in_nchw_block_desc.GetLengths());
#endif

#if 1
        // weight: global mem to LDS,
        // convert [K,C,S,R] to [S,R,C,K]
        blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
            wei_kcsr_global_desc,
            p_wei_global +
                wei_kcsr_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
            wei_srck_block_desc,
            p_wei_block,
            wei_kcsr_block_desc.GetLengths(),
            reorder_srck_from_kcsr);
#else
        // weight: global mem to LDS,
        // no format conversion, this is wrong, for performance study only!
        Blockwise4dTensorCopy<BlockSize>(
            wei_kcsr_global_desc,
            p_wei_global +
                wei_kcsr_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
            wei_kcsr_block_desc,
            p_wei_block,
            wei_kcsr_block_desc.GetLengths());
#endif

        __syncthreads();

#if 1
        // a series of batched GEMM
        for(unsigned s = 0; s < S; ++s)
        {
            for(unsigned r = 0; r < R; ++r)
            {
                auto f_accum = [](auto& c, const auto&& ab) { c += ab; };

                blockwise_batch_gemm.Run(p_wei_block + wei_srck_block_desc.Get1dIndex(s, r, 0, 0),
                                         p_in_block + in_chwn_block_desc.Get1dIndex(0, s, r, 0),
                                         p_out_thread,
                                         f_accum);
            }
        }
#endif
    }

    const auto matrix_c_index =
        blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());

#if 0
    printf("%u %u, %u %u %u\n", get_block_1d_id(), get_thread_local_1d_id(), matrix_c_index.batch, matrix_c_index.row, matrix_c_index.col);
#endif

    const unsigned ho_thread_data_begin = matrix_c_index.batch;
    const unsigned k_thread_data_begin  = matrix_c_index.row;
    const unsigned wo_thread_data_begin = matrix_c_index.col / NPerThread;

#if 1
    // output: register to global mem,
    // convert out_thread[Ho,K,Wo,N] to out_global[N,K,Ho,Wo]
    constexpr auto reorder_nkhw_from_hkwn = Sequence<3, 1, 0, 2>{};

    threadwise_4d_tensor_copy_reorder_by_get_dst_from_src(
        out_hkwn_thread_desc,
        p_out_thread,
        out_nkhw_global_desc,
        p_out_global +
            out_nkhw_global_desc.Get1dIndex(n_block_data_begin,
                                            k_block_data_begin + k_thread_data_begin,
                                            ho_block_data_begin + ho_thread_data_begin,
                                            wo_block_data_begin + wo_thread_data_begin),
        out_hkwn_thread_desc.GetLengths(),
        reorder_nkhw_from_hkwn);
#else
    // output: register to global mem,
    // no format conversion, assume register is in [N,K,Ho,Wo], this is wrong, for performance
    // study only!
    constexpr auto out_nkhw_thread_desc = make_ConstantTensorDescriptor(
        Sequence<NPerThread, KPerThread, HoPerThread, WoPerThread>{});

    threadwise_4d_tensor_copy(
        out_nkhw_thread_desc,
        p_out_thread,
        out_nkhw_global_desc,
        p_out_global +
            out_nkhw_global_desc.Get1dIndex(n_block_data_begin,
                                            k_block_data_begin + k_thread_data_begin,
                                            ho_block_data_begin + ho_thread_data_begin,
                                            wo_block_data_begin + wo_thread_data_begin),
        out_nkhw_thread_desc.GetLengths());
#endif
}
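`ReorderByGetNewFromOld(Sequence<2,3,1,0>{})` above turns the [K,C,S,R] lengths into [S,R,C,K]: position i of the new sequence takes position map[i] of the old one. A four-element sketch of that "new from old" convention in plain C++ (the std::array stand-in is ours, not the library's type):

#include <array>
#include <cassert>

// new_lengths[i] = old_lengths[map[i]]  ("get new from old")
std::array<unsigned, 4> reorder_get_new_from_old(const std::array<unsigned, 4>& old_lengths,
                                                 const std::array<unsigned, 4>& map)
{
    std::array<unsigned, 4> new_lengths{};
    for(unsigned i = 0; i < 4; ++i)
        new_lengths[i] = old_lengths[map[i]];
    return new_lengths;
}

int main()
{
    // [K,C,S,R] -> [S,R,C,K] via map {2,3,1,0}, e.g. K=128, C=8, S=3, R=3
    auto srck = reorder_get_new_from_old({128, 8, 3, 3}, {2, 3, 1, 0});
    assert(srck == (std::array<unsigned, 4>{3, 3, 8, 128}));
    return 0;
}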
src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.hip.hpp deleted 100644 → 0
#pragma once
#include "common.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp"
#include "ConstantMatrixDescriptor.hip.hpp"
#include "blockwise_4d_tensor_op.hip.hpp"
#include "threadwise_4d_tensor_op.hip.hpp"
#include "blockwise_gemm.hip.hpp"

template <unsigned GridSize,
          unsigned BlockSize,
          class Float,
          class InGlobalDesc,
          class WeiGlobalDesc,
          class OutGlobalDesc,
          unsigned NPerBlock,
          unsigned KPerBlock,
          unsigned CPerBlock,
          unsigned HoPerBlock,
          unsigned WoPerBlock,
          unsigned NPerThread,
          unsigned KPerThread,
          unsigned CPerThread,
          unsigned HoPerThread,
          unsigned WoPerThread>
__global__ void gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(
    const Float* const __restrict__ p_in_global,
    const Float* const __restrict__ p_wei_global,
    Float* const __restrict__ p_out_global)
{
    // NPerThread == NPerBlock, because the format of input in LDS [C,Hi,Wi,N]
    // for GEMM trans([C,K]) * [C,Wo*N], we need a thread to do all the "N"
    // if we use [C,Hi,N,Wi,N] in LDS, then NPerThread can be different from NPerBlock
    static_assert(NPerBlock % NPerThread == 0, "wrong! NPerBlock % NPerThread !=0");
    static_assert((NPerThread < NPerBlock && WoPerThread == 1) || NPerThread == NPerBlock,
                  "wrong!");

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto in_nchw_global_desc  = InGlobalDesc{};
    constexpr auto wei_srck_global_desc = WeiGlobalDesc{};
    constexpr auto out_nkhw_global_desc = OutGlobalDesc{};

    constexpr unsigned S = wei_srck_global_desc.GetLength(I0);
    constexpr unsigned R = wei_srck_global_desc.GetLength(I1);

    constexpr unsigned HiPerBlock = HoPerBlock + S - 1;
    constexpr unsigned WiPerBlock = WoPerBlock + R - 1;

    // divide block work: NCHW
    constexpr unsigned NBlockWork =
        (out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
    constexpr unsigned KBlockWork =
        (out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
    constexpr unsigned HBlockWork =
        (out_nkhw_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
    constexpr unsigned WBlockWork =
        (out_nkhw_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;

    unsigned itmp                  = get_block_1d_id();
    const unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
    itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork);
    const unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork);
    itmp -= k_block_work_id * (HBlockWork * WBlockWork);
    const unsigned h_block_work_id = itmp / WBlockWork;
    const unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork;

    const unsigned n_block_data_begin  = n_block_work_id * NPerBlock;
    const unsigned k_block_data_begin  = k_block_work_id * KPerBlock;
    const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
    const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;

    const unsigned hi_block_data_begin = ho_block_data_begin;
    const unsigned wi_block_data_begin = wo_block_data_begin;

    // tensor view of un-reordered blockwise input and weight (imaginary)
    constexpr auto in_nchw_block_desc =
        make_ConstantTensorDescriptor(Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{});
    constexpr auto wei_srck_block_desc =
        make_ConstantTensorDescriptor(Sequence<S, R, CPerBlock, KPerBlock>{});

    // tensor view of reordered blockwise input and weight in LDS
    constexpr auto reorder_chwn_from_nchw = Sequence<1, 2, 3, 0>{};
    constexpr auto in_chwn_block_desc     = make_ConstantTensorDescriptor(
        in_nchw_block_desc.GetLengths().ReorderByGetNewFromOld(reorder_chwn_from_nchw));

    // tensor view of threadwise output in register
    constexpr auto out_hkwn_thread_desc = make_ConstantTensorDescriptor(
        Sequence<HoPerThread, KPerThread, WoPerThread, NPerThread>{});

#if 0
    if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
    {
        print_ConstantTensorDescriptor(in_nchw_block_desc, "in_nchw_block_desc");
        print_ConstantTensorDescriptor(in_chwn_block_desc, "in_chwn_block_desc");
        print_ConstantTensorDescriptor(wei_srck_block_desc, "wei_srck_block_desc");
        print_ConstantTensorDescriptor(out_hkwn_thread_desc, "out_hkwn_thread_desc");
    }
#endif

    // blockwise copy
    // wei: format is [S,R,C,K], no conversion needed
    constexpr auto blockwise_wei_copy =
        Blockwise4dTensorCopy1<BlockSize,
                               Float,
                               decltype(wei_srck_global_desc),
                               decltype(wei_srck_block_desc),
                               decltype(wei_srck_block_desc.GetLengths())>{};

    // a series of blockwise batched GEMM
    // C_matrix += transpose(A_matrix) * B_matrix
    // A_matrix and B_matrix saved in LDS, C_matrix saved in register
    // A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K]
    // B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
    // C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
    constexpr auto a_cxk_block_mtx_desc =
        make_ConstantMatrixDescriptor(Number<CPerBlock>{}, Number<KPerBlock>{});
    constexpr auto b_cxwn_block_mtx_desc =
        make_ConstantMatrixDescriptor(Number<CPerBlock>{},
                                      Number<WoPerBlock * NPerBlock>{},
                                      Number<in_chwn_block_desc.GetStride(I0)>{});
    constexpr auto c_kxwn_thread_mtx_desc =
        make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<WoPerThread * NPerThread>{});

    const auto blockwise_batch_gemm =
        Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize,
                                                         decltype(a_cxk_block_mtx_desc),
                                                         decltype(b_cxwn_block_mtx_desc),
                                                         decltype(c_kxwn_thread_mtx_desc),
                                                         true,
                                                         false,
                                                         false,
                                                         0,
                                                         in_chwn_block_desc.GetStride(I1),
                                                         out_hkwn_thread_desc.GetStride(I0),
                                                         HoPerBlock,
                                                         HoPerThread,
                                                         CPerThread,
                                                         true>{};

    // LDS
    constexpr unsigned in_block_size  = in_chwn_block_desc.GetElementSpace();
    constexpr unsigned wei_block_size = wei_srck_block_desc.GetElementSpace();

    __shared__ Float p_in_block[in_block_size];
    __shared__ Float p_wei_block[wei_block_size];

    // register
    Float p_out_thread[out_hkwn_thread_desc.GetElementSpace()];

    // set threadwise output tensor to 0
    threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread);

    for(unsigned c_block_data_begin = 0; c_block_data_begin < in_nchw_global_desc.GetLength(I1);
        c_block_data_begin += CPerBlock, __syncthreads())
    {
#if 1
        // input: global mem to LDS,
        // convert [N,C,Hi,Wi] to [C,Hi,Wi,N]
        blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
            in_nchw_global_desc,
            p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin,
                                                         c_block_data_begin,
                                                         hi_block_data_begin,
                                                         wi_block_data_begin),
            in_chwn_block_desc,
            p_in_block,
            in_nchw_block_desc.GetLengths(),
            reorder_chwn_from_nchw);
#endif

#if 1
        // weight: global mem to LDS,
        // format is [S,R,C,K], no conversion needed
        blockwise_wei_copy.Run(
            p_wei_global +
                wei_srck_global_desc.Get1dIndex(0, 0, c_block_data_begin, k_block_data_begin),
            p_wei_block);
#endif

        __syncthreads();

        // a series of batched GEMM
        for(unsigned s = 0; s < S; ++s)
        {
            for(unsigned r = 0; r < R; ++r)
            {
                auto f_accum = [](auto& c, const auto&& ab) { c += ab; };

                blockwise_batch_gemm.Run(p_wei_block + wei_srck_block_desc.Get1dIndex(s, r, 0, 0),
                                         p_in_block + in_chwn_block_desc.Get1dIndex(0, s, r, 0),
                                         p_out_thread,
                                         f_accum);
            }
        }
    }

    const auto matrix_c_index =
        blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());

    const unsigned ho_thread_data_begin = matrix_c_index.batch;
    const unsigned k_thread_data_begin  = matrix_c_index.row;
    const unsigned wo_thread_data_begin = matrix_c_index.col / NPerBlock;
    const unsigned n_thread_data_begin  = matrix_c_index.col - wo_thread_data_begin * NPerBlock;

    // output: register to global mem,
    // convert out_thread[Ho,K,Wo,N] to out_global[N,K,Ho,Wo]
    constexpr auto reorder_nkhw_from_hkwn = Sequence<3, 1, 0, 2>{};

    threadwise_4d_tensor_copy_reorder_by_get_dst_from_src(
        out_hkwn_thread_desc,
        p_out_thread,
        out_nkhw_global_desc,
        p_out_global +
            out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
                                            k_block_data_begin + k_thread_data_begin,
                                            ho_block_data_begin + ho_thread_data_begin,
                                            wo_block_data_begin + wo_thread_data_begin),
        out_hkwn_thread_desc.GetLengths(),
        reorder_nkhw_from_hkwn);
}
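All three kernels above unpack `get_block_1d_id()` into per-dimension work ids with the same divide-and-subtract chain; it is a mixed-radix decomposition over [NBlockWork, KBlockWork, HBlockWork, WBlockWork]. A CPU sketch with the matching re-composition as a round-trip check (struct and function names are illustrative):

#include <cassert>

struct WorkId { unsigned n, k, h, w; };

// Decompose a linear block id over [NBlockWork, KBlockWork, HBlockWork, WBlockWork]
// using the divide/subtract chain from the kernels above.
WorkId decompose(unsigned id, unsigned KW, unsigned HW, unsigned WW)
{
    WorkId x{};
    x.n = id / (KW * HW * WW);
    id -= x.n * (KW * HW * WW);
    x.k = id / (HW * WW);
    id -= x.k * (HW * WW);
    x.h = id / WW;
    x.w = id - x.h * WW;
    return x;
}

int main()
{
    const unsigned KW = 4, HW = 3, WW = 5;
    for(unsigned id = 0; id < 2 * KW * HW * WW; ++id)
    {
        WorkId x = decompose(id, KW, HW, WW);
        assert(((x.n * KW + x.k) * HW + x.h) * WW + x.w == id); // round-trips
    }
    return 0;
}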
src/include/gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_buffer.hip.hpp
...
@@ -57,11 +57,11 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
    constexpr unsigned Ho = out_khwn_global_desc.GetLength(I1);
    constexpr unsigned Wo = out_khwn_global_desc.GetLength(I2);

-   constexpr unsigned S = wei_csrk_global_desc.GetLength(I1);
-   constexpr unsigned R = wei_csrk_global_desc.GetLength(I2);
+   constexpr unsigned Y = wei_csrk_global_desc.GetLength(I1);
+   constexpr unsigned X = wei_csrk_global_desc.GetLength(I2);

    constexpr unsigned B = N * Hi * Wi;

-   constexpr unsigned BGhostRead = (S - 1) * Wi + (R - 1);
+   constexpr unsigned BGhostRead = (Y - 1) * Wi + (X - 1);

    // divide block work by 2d: [K, B]
    constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock;
...
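With the input flattened to B = N * Hi * Wi, a block owning `BPerBlock` output points must read past its right edge: tap (s, r) of a point at flattened offset b lives at offset b + s * Wi + r, which peaks at `(Y - 1) * Wi + (X - 1)`, exactly the renamed `BGhostRead`. A quick check of that bound (stride 1 assumed; the helper name is ours):

#include <cassert>

// Largest flattened-offset tap a single output point needs beyond itself:
// tap (s, r) of point (h, w) lives at (h + s) * Wi + (w + r), i.e.
// s * Wi + r past the point, maximized at s = Y - 1, r = X - 1.
constexpr unsigned ghost_read(unsigned Wi, unsigned Y, unsigned X)
{
    return (Y - 1) * Wi + (X - 1);
}

int main()
{
    static_assert(ghost_read(34, 3, 3) == 70, "3x3 filter on Wi = 34 rows");
    // so a block covering BPerBlock points loads BPerBlock + ghost_read elements
    assert(ghost_read(34, 1, 1) == 0); // a 1x1 filter reads nothing extra
    return 0;
}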
@@ -75,7 +75,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
    // flattened (2d) tensor view of gridwise input
    constexpr auto in_cb_global_desc = make_ConstantTensorDescriptor(Sequence<C, B>{});

-   constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * S * R, K>{});
+   constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * Y * X, K>{});

    // tensor view of blockwise input and weight
    // be careful of alignment
...
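The "be careful of alignment" note exists because the vectorized copies read `DataPerRead` elements at a time, so the LDS buffer sizes (and the padded descriptors above) are rounded up to that multiple; the shared-memory declarations later in this commit use the `max_align * ((size + max_align - 1) / max_align)` form. The rounding in isolation, as plain C++:

#include <cassert>

// Round `size` up to the next multiple of `align` (align > 0),
// matching the max_align * ((size + max_align - 1) / max_align) pattern.
constexpr unsigned round_up(unsigned size, unsigned align)
{
    return align * ((size + align - 1) / align);
}

int main()
{
    static_assert(round_up(70, 4) == 72, "rounds up to the next multiple");
    static_assert(round_up(72, 4) == 72, "already aligned is unchanged");
    assert(round_up(1, 8) == 8);
    return 0;
}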
@@ -83,10 +83,10 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
        Sequence<CPerBlock, BPerBlock + BGhostRead>{}, Number<InBlockCopyDataPerRead>{});

    constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor_aligned(
-       Sequence<CPerBlock * S * R, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
+       Sequence<CPerBlock * Y * X, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});

    constexpr auto wei_csrk_block_desc = make_ConstantTensorDescriptor_aligned(
-       Sequence<CPerBlock, S, R, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
+       Sequence<CPerBlock, Y, X, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});

    // tensor view of threadwise output in register
    constexpr auto out_kb_thread_desc =
...
@@ -138,7 +138,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
#endif

    // blockwise wei copy
-   // format is [CPerBlock*S*R,KPerBlock]
+   // format is [CPerBlock*Y*X,KPerBlock]
#if 0
    const auto blockwise_wei_copy =
        Blockwise2dTensorCopy1<BlockSize,
...
@@ -166,7 +166,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
    // a series of blockwise GEMM
    // c_mtx += transpose(a_mtx) * b_mtx
    // a_mtx and b_mtx saved in LDS, c_mtx saved in register
-   // a_mtx[C,K] is a sub-matrix of wei_block[C,S,R,K]
+   // a_mtx[C,K] is a sub-matrix of wei_block[C,Y,X,K]
    // b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
    // c_mtx[K,B] is out_block[K,B]
    constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
...
@@ -275,9 +275,9 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
            // compute on current data
            // a series of GEMM
-           for(unsigned s = 0; s < S; ++s)
+           for(unsigned s = 0; s < Y; ++s)
            {
-               for(unsigned r = 0; r < R; ++r)
+               for(unsigned r = 0; r < X; ++r)
                {
                    auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
#if 1
...
@@ -305,9 +305,9 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
        __syncthreads();

-       for(unsigned s = 0; s < S; ++s)
+       for(unsigned s = 0; s < Y; ++s)
        {
-           for(unsigned r = 0; r < R; ++r)
+           for(unsigned r = 0; r < X; ++r)
            {
                auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
#if 0
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.hip.hpp deleted 100644 → 0
#pragma once
#include "common.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp"
#include "ConstantMatrixDescriptor.hip.hpp"
#include "blockwise_4d_tensor_op.hip.hpp"
#include "blockwise_2d_tensor_op.hip.hpp"
#include "threadwise_2d_tensor_op.hip.hpp"
#include "blockwise_gemm.hip.hpp"

// define B = flatten(N, Hi, Wi)
template <unsigned GridSize,
          unsigned BlockSize,
          class Float,
          class InGlobalDesc,
          class WeiGlobalDesc,
          class OutGlobalDesc,
          unsigned BPerBlock,
          unsigned KPerBlock,
          unsigned CPerBlock,
          unsigned BPerThread,
          unsigned KPerThread,
          unsigned GemmThreadPerColumnPerCluster,
          unsigned GemmThreadPerRowPerCluster,
          unsigned GemmMPerThreadSubC,
          unsigned GemmNPerThreadSubC,
          unsigned GemmMLevel0Cluster,
          unsigned GemmNLevel0Cluster,
          unsigned GemmMLevel1Cluster,
          unsigned GemmNLevel1Cluster,
          unsigned GemmKPerThreadLoop,
          unsigned InBlockCopyThreadPerDim0,
          unsigned InBlockCopyThreadPerDim1,
          unsigned WeiBlockCopyThreadPerDim0,
          unsigned WeiBlockCopyThreadPerDim1,
          unsigned InBlockCopyDataPerRead,
          unsigned WeiBlockCopyDataPerRead>
__global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(
    const Float* const __restrict__ p_in_global,
    const Float* const __restrict__ p_wei_global,
    Float* const __restrict__ p_out_global)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto in_cnhw_global_desc  = InGlobalDesc{};
    constexpr auto wei_csrk_global_desc = WeiGlobalDesc{};
    constexpr auto out_knhw_global_desc = OutGlobalDesc{};

    constexpr unsigned C  = in_cnhw_global_desc.GetLength(I0);
    constexpr unsigned N  = in_cnhw_global_desc.GetLength(I1);
    constexpr unsigned Hi = in_cnhw_global_desc.GetLength(I2);
    constexpr unsigned Wi = in_cnhw_global_desc.GetLength(I3);

    constexpr unsigned K  = out_knhw_global_desc.GetLength(I0);
    constexpr unsigned Ho = out_knhw_global_desc.GetLength(I2);
    constexpr unsigned Wo = out_knhw_global_desc.GetLength(I3);

    constexpr unsigned S = wei_csrk_global_desc.GetLength(I1);
    constexpr unsigned R = wei_csrk_global_desc.GetLength(I2);

    constexpr unsigned B = N * Hi * Wi;

    constexpr unsigned BGhostRead = (S - 1) * Wi + (R - 1);

    // divide block work by 2d: [K, B]
    constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock;
    constexpr unsigned BBlockWork = (B + BPerBlock - 1) / BPerBlock;

    const unsigned k_block_work_id = get_block_1d_id() / BBlockWork;
    const unsigned b_block_work_id = get_block_1d_id() - k_block_work_id * BBlockWork;

    const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
    const unsigned b_block_data_begin = b_block_work_id * BPerBlock;

    // flattened (2d) tensor view of gridwise input
    constexpr auto in_cb_global_desc  = make_ConstantTensorDescriptor(Sequence<C, B>{});
    constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * S * R, K>{});

    // tensor view of blockwise input and weight
    // be careful of alignment
    constexpr auto in_cb_block_desc = make_ConstantTensorDescriptor_aligned(
        Sequence<CPerBlock, BPerBlock + BGhostRead>{}, Number<InBlockCopyDataPerRead>{});
    constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor_aligned(
        Sequence<CPerBlock * S * R, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
    constexpr auto wei_csrk_block_desc = make_ConstantTensorDescriptor_aligned(
        Sequence<CPerBlock, S, R, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});

    // tensor view of threadwise output in register
    constexpr auto out_kb_thread_desc =
        make_ConstantTensorDescriptor(Sequence<KPerThread, BPerThread>{});

#if 0
    if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
    {
        print_ConstantTensorDescriptor(in_cnhw_global_desc, "in_cnhw_global_desc");
        print_ConstantTensorDescriptor(wei_csrk_global_desc, "wei_csrk_global_desc");
        print_ConstantTensorDescriptor(out_knhw_global_desc, "out_knhw_global_desc");
        print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc");
        print_ConstantTensorDescriptor(wei_ek_global_desc, "wei_ek_global_desc");
        print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc");
        print_ConstantTensorDescriptor(wei_csrk_block_desc, "wei_csrk_block_desc");
        print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc");
        print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc");
        printf("KPerBlock %u\n", KPerBlock);
    }
#endif

    // blockwise in copy
    // format is [CPerBlock,BPerBlock + BGhostRead]
#if 0
    const auto blockwise_in_copy =
        Blockwise2dTensorCopy1<BlockSize,
                               Float,
                               decltype(in_cb_global_desc),
                               decltype(in_cb_block_desc),
                               decltype(in_cb_block_desc.GetLengths())>{};
#elif 0
    const auto blockwise_in_copy =
        Blockwise2dTensorCopy2<BlockSize,
                               Float,
                               decltype(in_cb_global_desc),
                               decltype(in_cb_block_desc),
                               decltype(in_cb_block_desc.GetLengths()),
                               InBlockCopyThreadPerDim0,
                               InBlockCopyThreadPerDim1>{};
#elif 1
    const auto blockwise_in_copy =
        Blockwise2dTensorCopy3<BlockSize,
                               Float,
                               decltype(in_cb_global_desc),
                               decltype(in_cb_block_desc),
                               decltype(in_cb_block_desc.GetLengths()),
                               InBlockCopyDataPerRead>{};
#endif

    // blockwise wei copy
    // format is [CPerBlock*S*R,KPerBlock]
#if 0
    const auto blockwise_wei_copy =
        Blockwise2dTensorCopy1<BlockSize,
                               Float,
                               decltype(wei_ek_global_desc),
                               decltype(wei_ek_block_desc),
                               decltype(wei_ek_block_desc.GetLengths())>{};
#elif 0
    const auto blockwise_wei_copy =
        Blockwise2dTensorCopy2<BlockSize,
                               Float,
                               decltype(wei_ek_global_desc),
                               decltype(wei_ek_block_desc),
                               decltype(wei_ek_block_desc.GetLengths()),
                               WeiBlockCopyThreadPerDim0,
                               WeiBlockCopyThreadPerDim1>{};
#elif 1
    const auto blockwise_wei_copy =
        Blockwise2dTensorCopy3<BlockSize,
                               Float,
                               decltype(wei_ek_global_desc),
                               decltype(wei_ek_block_desc),
                               decltype(wei_ek_block_desc.GetLengths()),
                               WeiBlockCopyDataPerRead>{};
#endif

    // a series of blockwise GEMM
    // c_mtx += transpose(a_mtx) * b_mtx
    // a_mtx and b_mtx saved in LDS, c_mtx saved in register
    // a_mtx[C,K] is a sub-matrix of wei_block[C,S,R,K]
    // b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
    // c_mtx[K,B] is out_block[K,B]
    constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
        Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_csrk_block_desc.GetStride(I0)>{});
    constexpr auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
        Number<CPerBlock>{}, Number<BPerBlock>{}, Number<in_cb_block_desc.GetStride(I0)>{});
    constexpr auto c_kxb_thread_mtx_desc =
        make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<BPerThread>{});

#if 0
    const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC<BlockSize,
                                                                 decltype(a_cxk_block_mtx_desc),
                                                                 decltype(b_cxb_block_mtx_desc),
                                                                 decltype(c_kxb_thread_mtx_desc),
                                                                 true,
                                                                 false,
                                                                 false,
                                                                 GemmKPerThreadLoop,
                                                                 GemmThreadPerColumnPerCluster,
                                                                 GemmThreadPerRowPerCluster,
                                                                 true>{};
#else
    const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2<
        BlockSize,
        decltype(a_cxk_block_mtx_desc),
        decltype(b_cxb_block_mtx_desc),
        decltype(c_kxb_thread_mtx_desc),
        GemmMPerThreadSubC,
        GemmNPerThreadSubC,
        GemmMLevel0Cluster,
        GemmNLevel0Cluster,
        GemmMLevel1Cluster,
        GemmNLevel1Cluster,
        GemmKPerThreadLoop>{};
#endif

    // LDS: be careful of alignment
    constexpr unsigned in_block_size =
        in_cb_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
    constexpr unsigned wei_block_size =
        wei_csrk_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});

    constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
                                       ? InBlockCopyDataPerRead
                                       : WeiBlockCopyDataPerRead;

    __shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)];
    __shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];

    // register
    Float p_out_thread[out_kb_thread_desc.GetElementSpace()];

    // set threadwise output tensor to 0
    threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread);

    const Float* p_in_global_block_offset =
        p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
    const Float* p_wei_global_block_offset =
        p_wei_global + wei_csrk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);

    for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
        c_block_data_begin += CPerBlock,
        p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0),
        p_wei_global_block_offset += CPerBlock * wei_csrk_global_desc.GetStride(I0),
        __syncthreads())
    {
        // input: global mem to LDS,
        blockwise_in_copy.Run(p_in_global_block_offset, p_in_block);

        // weight: global mem to LDS,
        blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block);

        __syncthreads();

        // a series of GEMM
        for(unsigned s = 0; s < S; ++s)
        {
            for(unsigned r = 0; r < R; ++r)
            {
                auto f_accum = [](auto& acc, const auto&& v) { acc += v; };

#if 1
                blockwise_gemm.Run
#elif 0
                blockwise_gemm.Run_v2
#elif 0
                blockwise_gemm.Run_RegisterDoubleBuffer
#endif
                    (p_wei_block + wei_csrk_block_desc.Get1dIndex(0, s, r, 0),
                     p_in_block + s * Wi + r,
                     p_out_thread,
                     f_accum);
            }
        }
    }

    // output: register to global mem,
    const auto c_thread_mtx_begin =
        blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());

    const unsigned k_thread_data_begin = k_block_data_begin + c_thread_mtx_begin.row;
    const unsigned b_thread_data_begin = b_block_data_begin + c_thread_mtx_begin.col;

#if 0
    if(get_block_1d_id() == 0)
    {
        printf("%u %u, row %u col %u, k_data_begin %u b_data_begin %u, %f %f %f %f\n",
               get_block_1d_id(),
               get_thread_local_1d_id(),
               c_thread_mtx_begin.row,
               c_thread_mtx_begin.col,
               k_thread_data_begin,
               b_thread_data_begin,
               p_out_thread[0], p_out_thread[1], p_out_thread[2], p_out_thread[3]);
    }
#endif

    for(unsigned k = 0; k < out_kb_thread_desc.GetLength(I0); ++k)
    {
        for(unsigned b = 0; b < out_kb_thread_desc.GetLength(I1); ++b)
        {
            const auto c_thread_mtx_distance =
                blockwise_gemm.GetDistanceFromBeginOfThreadMatrixC(k, b);

            unsigned k_data = k_thread_data_begin + c_thread_mtx_distance.row;
            unsigned b_data = b_thread_data_begin + c_thread_mtx_distance.col;

            unsigned n_data = b_data / (Hi * Wi);
            unsigned itmp   = b_data - n_data * (Hi * Wi);
            unsigned h_data = itmp / Wi;
            unsigned w_data = itmp - h_data * Wi;

#if 0
            if(get_block_1d_id() == 0)
            {
                printf("%u %u, k %u b %u, k_data %u n_data %u h_data %u w_data %u %f\n",
                       get_block_1d_id(),
                       get_thread_local_1d_id(),
                       k,
                       b,
                       k_data,
                       n_data,
                       h_data,
                       w_data,
                       p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)]);
            }
#endif

            if(n_data < N && h_data < Ho && w_data < Wo)
            {
                p_out_global[out_knhw_global_desc.Get1dIndex(k_data, n_data, h_data, w_data)] =
                    p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)];
            }
        }
    }
}
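The write-back loop above inverts the flattening B = N * Hi * Wi and drops any point whose coordinates fall outside the N x Ho x Wo output, which is how this kernel retires the BGhostRead overhang at store time. The same unflatten-and-clip step as a CPU sketch (struct and function names are ours):

#include <cassert>

struct Coord { unsigned n, h, w; bool valid; };

// Invert b = (n * Hi + h) * Wi + w and clip to the Ho x Wo output window,
// mirroring the write-back guard in the kernel above.
Coord unflatten_b(unsigned b, unsigned N, unsigned Hi, unsigned Wi,
                  unsigned Ho, unsigned Wo)
{
    Coord c{};
    c.n = b / (Hi * Wi);
    unsigned itmp = b - c.n * (Hi * Wi);
    c.h = itmp / Wi;
    c.w = itmp - c.h * Wi;
    c.valid = (c.n < N) && (c.h < Ho) && (c.w < Wo);
    return c;
}

int main()
{
    // Hi = Wi = 34 input, Ho = Wo = 32 output: b = 33 is (n=0, h=0, w=33) -> clipped
    Coord c = unflatten_b(33, 2, 34, 34, 32, 32);
    assert(c.n == 0 && c.h == 0 && c.w == 33 && !c.valid);
    assert(unflatten_b(35, 2, 34, 34, 32, 32).valid); // (0, 1, 1) is in range
    return 0;
}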