Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
ce0182ce
Commit
ce0182ce
authored
Mar 09, 2019
by
Chao Liu
Browse files
Merge branch 'master' into implicit_gemm_fp16
parents
8edbc659
f54cad7d
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
182 additions
and
182 deletions
+182
-182
driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp
driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp
+19
-19
driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp
...ice_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp
+19
-19
driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp
driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp
+18
-18
driver/driver.hip.cpp
driver/driver.hip.cpp
+60
-60
src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp
...idwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp
+13
-13
src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp
...implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp
+15
-15
src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
...mm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
+20
-20
src/include/tensor.hpp
src/include/tensor.hpp
+3
-3
src/include/threadwise_direct_convolution.hip.hpp
src/include/threadwise_direct_convolution.hip.hpp
+15
-15
No files found.
driver/device_implicit_gemm_convolution_1_chwn_c
sr
k_khwn.hpp
→
driver/device_implicit_gemm_convolution_1_chwn_c
yx
k_khwn.hpp
View file @
ce0182ce
#pragma once
#pragma once
#include <unistd.h>
#include <unistd.h>
#include "device.hpp"
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_1_chwn_c
sr
k_khwn.hip.hpp"
#include "gridwise_implicit_gemm_convolution_1_chwn_c
yx
k_khwn.hip.hpp"
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_implicit_gemm_convolution_1_chwn_c
sr
k_khwn
(
InDesc
,
void
device_implicit_gemm_convolution_1_chwn_c
yx
k_khwn
(
InDesc
,
const
Tensor
<
T
>&
in_nchw
,
const
Tensor
<
T
>&
in_nchw
,
WeiDesc
,
WeiDesc
,
const
Tensor
<
T
>&
wei_kc
sr
,
const
Tensor
<
T
>&
wei_kc
yx
,
OutDesc
,
OutDesc
,
Tensor
<
T
>&
out_nkhw
,
Tensor
<
T
>&
out_nkhw
,
unsigned
nrepeat
)
unsigned
nrepeat
)
...
@@ -18,7 +18,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
...
@@ -18,7 +18,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
in_nchw_desc
=
InDesc
{};
constexpr
auto
in_nchw_desc
=
InDesc
{};
constexpr
auto
wei_kc
sr
_desc
=
WeiDesc
{};
constexpr
auto
wei_kc
yx
_desc
=
WeiDesc
{};
constexpr
auto
out_nkhw_desc
=
OutDesc
{};
constexpr
auto
out_nkhw_desc
=
OutDesc
{};
constexpr
unsigned
Hi
=
in_nchw_desc
.
GetLength
(
I2
);
constexpr
unsigned
Hi
=
in_nchw_desc
.
GetLength
(
I2
);
...
@@ -28,22 +28,22 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
...
@@ -28,22 +28,22 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
constexpr
unsigned
Ho
=
out_nkhw_desc
.
GetLength
(
I2
);
constexpr
unsigned
Ho
=
out_nkhw_desc
.
GetLength
(
I2
);
constexpr
unsigned
Wo
=
out_nkhw_desc
.
GetLength
(
I3
);
constexpr
unsigned
Wo
=
out_nkhw_desc
.
GetLength
(
I3
);
constexpr
unsigned
K
=
wei_kc
sr
_desc
.
GetLength
(
I0
);
constexpr
unsigned
K
=
wei_kc
yx
_desc
.
GetLength
(
I0
);
constexpr
unsigned
C
=
wei_kc
sr
_desc
.
GetLength
(
I1
);
constexpr
unsigned
C
=
wei_kc
yx
_desc
.
GetLength
(
I1
);
constexpr
unsigned
Y
=
wei_kc
sr
_desc
.
GetLength
(
I2
);
constexpr
unsigned
Y
=
wei_kc
yx
_desc
.
GetLength
(
I2
);
constexpr
unsigned
X
=
wei_kc
sr
_desc
.
GetLength
(
I3
);
constexpr
unsigned
X
=
wei_kc
yx
_desc
.
GetLength
(
I3
);
// reorder weight
// reorder weight
auto
wei_c
sr
k_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
auto
wei_c
yx
k_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
ostream_ConstantTensorDescriptor
(
wei_c
sr
k_desc
,
std
::
cout
<<
"wei_c
sr
k_desc: "
);
ostream_ConstantTensorDescriptor
(
wei_c
yx
k_desc
,
std
::
cout
<<
"wei_c
yx
k_desc: "
);
Tensor
<
T
>
wei_c
sr
k
(
make_TensorDescriptor
(
wei_c
sr
k_desc
));
Tensor
<
T
>
wei_c
yx
k
(
make_TensorDescriptor
(
wei_c
yx
k_desc
));
auto
f_reorder_kc
sr2csr
k
=
[
&
](
auto
k
,
auto
c
,
auto
s
,
auto
r
)
{
auto
f_reorder_kc
yx2cyx
k
=
[
&
](
auto
k
,
auto
c
,
auto
y
,
auto
x
)
{
wei_c
sr
k
(
c
,
s
,
r
,
k
)
=
wei_kc
sr
(
k
,
c
,
s
,
r
);
wei_c
yx
k
(
c
,
y
,
x
,
k
)
=
wei_kc
yx
(
k
,
c
,
y
,
x
);
};
};
make_ParallelTensorFunctor
(
f_reorder_kc
sr2csr
k
,
K
,
C
,
Y
,
X
)(
make_ParallelTensorFunctor
(
f_reorder_kc
yx2cyx
k
,
K
,
C
,
Y
,
X
)(
std
::
thread
::
hardware_concurrency
());
std
::
thread
::
hardware_concurrency
());
// reorder input
// reorder input
...
@@ -67,11 +67,11 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
...
@@ -67,11 +67,11 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
std
::
size_t
data_sz
=
sizeof
(
T
);
std
::
size_t
data_sz
=
sizeof
(
T
);
DeviceMem
in_chwn_device_buf
(
data_sz
*
in_chwn
.
mDesc
.
GetElementSpace
());
DeviceMem
in_chwn_device_buf
(
data_sz
*
in_chwn
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_c
sr
k_device_buf
(
data_sz
*
wei_c
sr
k
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_c
yx
k_device_buf
(
data_sz
*
wei_c
yx
k
.
mDesc
.
GetElementSpace
());
DeviceMem
out_khwn_device_buf
(
data_sz
*
out_khwn
.
mDesc
.
GetElementSpace
());
DeviceMem
out_khwn_device_buf
(
data_sz
*
out_khwn
.
mDesc
.
GetElementSpace
());
in_chwn_device_buf
.
ToDevice
(
in_chwn
.
mData
.
data
());
in_chwn_device_buf
.
ToDevice
(
in_chwn
.
mData
.
data
());
wei_c
sr
k_device_buf
.
ToDevice
(
wei_c
sr
k
.
mData
.
data
());
wei_c
yx
k_device_buf
.
ToDevice
(
wei_c
yx
k
.
mData
.
data
());
out_khwn_device_buf
.
ToDevice
(
out_khwn
.
mData
.
data
());
out_khwn_device_buf
.
ToDevice
(
out_khwn
.
mData
.
data
());
#if 1
#if 1
...
@@ -257,11 +257,11 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
...
@@ -257,11 +257,11 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
for
(
unsigned
i
=
0
;
i
<
nrepeat
;
++
i
)
for
(
unsigned
i
=
0
;
i
<
nrepeat
;
++
i
)
{
{
float
time
=
launch_kernel
(
float
time
=
launch_kernel
(
gridwise_implicit_gemm_convolution_1_chwn_c
sr
k_khwn
<
GridSize
,
gridwise_implicit_gemm_convolution_1_chwn_c
yx
k_khwn
<
GridSize
,
BlockSize
,
BlockSize
,
T
,
T
,
decltype
(
in_chwn_desc
),
decltype
(
in_chwn_desc
),
decltype
(
wei_c
sr
k_desc
),
decltype
(
wei_c
yx
k_desc
),
decltype
(
out_khwn_desc
),
decltype
(
out_khwn_desc
),
NPerBlock
,
NPerBlock
,
KPerBlock
,
KPerBlock
,
...
@@ -289,7 +289,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
...
@@ -289,7 +289,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
dim3
(
GridSize
),
dim3
(
GridSize
),
dim3
(
BlockSize
),
dim3
(
BlockSize
),
static_cast
<
T
*>
(
in_chwn_device_buf
.
GetDeviceBuffer
()),
static_cast
<
T
*>
(
in_chwn_device_buf
.
GetDeviceBuffer
()),
static_cast
<
T
*>
(
wei_c
sr
k_device_buf
.
GetDeviceBuffer
()),
static_cast
<
T
*>
(
wei_c
yx
k_device_buf
.
GetDeviceBuffer
()),
static_cast
<
T
*>
(
out_khwn_device_buf
.
GetDeviceBuffer
()));
static_cast
<
T
*>
(
out_khwn_device_buf
.
GetDeviceBuffer
()));
printf
(
"Elapsed time : %f ms
\n
"
,
time
);
printf
(
"Elapsed time : %f ms
\n
"
,
time
);
...
...
driver/device_implicit_gemm_convolution_1_chwn_c
sr
k_khwn_padded.hpp
→
driver/device_implicit_gemm_convolution_1_chwn_c
yx
k_khwn_padded.hpp
View file @
ce0182ce
#pragma once
#pragma once
#include <unistd.h>
#include <unistd.h>
#include "device.hpp"
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_1_chwn_c
sr
k_khwn_padded.hip.hpp"
#include "gridwise_implicit_gemm_convolution_1_chwn_c
yx
k_khwn_padded.hip.hpp"
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
,
class
LowerPads
,
class
UpperPads
>
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
,
class
LowerPads
,
class
UpperPads
>
void
device_implicit_gemm_convolution_1_chwn_c
sr
k_khwn_padded
(
InDesc
,
void
device_implicit_gemm_convolution_1_chwn_c
yx
k_khwn_padded
(
InDesc
,
const
Tensor
<
T
>&
in_nchw
,
const
Tensor
<
T
>&
in_nchw
,
WeiDesc
,
WeiDesc
,
const
Tensor
<
T
>&
wei_kc
sr
,
const
Tensor
<
T
>&
wei_kc
yx
,
OutDesc
,
OutDesc
,
Tensor
<
T
>&
out_nkhw
,
Tensor
<
T
>&
out_nkhw
,
LowerPads
,
LowerPads
,
...
@@ -20,7 +20,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
...
@@ -20,7 +20,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
in_nchw_desc
=
InDesc
{};
constexpr
auto
in_nchw_desc
=
InDesc
{};
constexpr
auto
wei_kc
sr
_desc
=
WeiDesc
{};
constexpr
auto
wei_kc
yx
_desc
=
WeiDesc
{};
constexpr
auto
out_nkhw_desc
=
OutDesc
{};
constexpr
auto
out_nkhw_desc
=
OutDesc
{};
constexpr
unsigned
Hi
=
in_nchw_desc
.
GetLength
(
I2
);
constexpr
unsigned
Hi
=
in_nchw_desc
.
GetLength
(
I2
);
...
@@ -30,22 +30,22 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
...
@@ -30,22 +30,22 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
constexpr
unsigned
Ho
=
out_nkhw_desc
.
GetLength
(
I2
);
constexpr
unsigned
Ho
=
out_nkhw_desc
.
GetLength
(
I2
);
constexpr
unsigned
Wo
=
out_nkhw_desc
.
GetLength
(
I3
);
constexpr
unsigned
Wo
=
out_nkhw_desc
.
GetLength
(
I3
);
constexpr
unsigned
K
=
wei_kc
sr
_desc
.
GetLength
(
I0
);
constexpr
unsigned
K
=
wei_kc
yx
_desc
.
GetLength
(
I0
);
constexpr
unsigned
C
=
wei_kc
sr
_desc
.
GetLength
(
I1
);
constexpr
unsigned
C
=
wei_kc
yx
_desc
.
GetLength
(
I1
);
constexpr
unsigned
Y
=
wei_kc
sr
_desc
.
GetLength
(
I2
);
constexpr
unsigned
Y
=
wei_kc
yx
_desc
.
GetLength
(
I2
);
constexpr
unsigned
X
=
wei_kc
sr
_desc
.
GetLength
(
I3
);
constexpr
unsigned
X
=
wei_kc
yx
_desc
.
GetLength
(
I3
);
// reorder weight
// reorder weight
auto
wei_c
sr
k_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
auto
wei_c
yx
k_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
ostream_ConstantTensorDescriptor
(
wei_c
sr
k_desc
,
std
::
cout
<<
"wei_c
sr
k_desc: "
);
ostream_ConstantTensorDescriptor
(
wei_c
yx
k_desc
,
std
::
cout
<<
"wei_c
yx
k_desc: "
);
Tensor
<
T
>
wei_c
sr
k
(
make_TensorDescriptor
(
wei_c
sr
k_desc
));
Tensor
<
T
>
wei_c
yx
k
(
make_TensorDescriptor
(
wei_c
yx
k_desc
));
auto
f_reorder_kc
sr2csr
k
=
[
&
](
auto
k
,
auto
c
,
auto
s
,
auto
r
)
{
auto
f_reorder_kc
yx2cyx
k
=
[
&
](
auto
k
,
auto
c
,
auto
y
,
auto
x
)
{
wei_c
sr
k
(
c
,
s
,
r
,
k
)
=
wei_kc
sr
(
k
,
c
,
s
,
r
);
wei_c
yx
k
(
c
,
y
,
x
,
k
)
=
wei_kc
yx
(
k
,
c
,
y
,
x
);
};
};
make_ParallelTensorFunctor
(
f_reorder_kc
sr2csr
k
,
K
,
C
,
Y
,
X
)(
make_ParallelTensorFunctor
(
f_reorder_kc
yx2cyx
k
,
K
,
C
,
Y
,
X
)(
std
::
thread
::
hardware_concurrency
());
std
::
thread
::
hardware_concurrency
());
// reorder input
// reorder input
...
@@ -69,11 +69,11 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
...
@@ -69,11 +69,11 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
std
::
size_t
data_sz
=
sizeof
(
T
);
std
::
size_t
data_sz
=
sizeof
(
T
);
DeviceMem
in_chwn_device_buf
(
data_sz
*
in_chwn
.
mDesc
.
GetElementSpace
());
DeviceMem
in_chwn_device_buf
(
data_sz
*
in_chwn
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_c
sr
k_device_buf
(
data_sz
*
wei_c
sr
k
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_c
yx
k_device_buf
(
data_sz
*
wei_c
yx
k
.
mDesc
.
GetElementSpace
());
DeviceMem
out_khwn_device_buf
(
data_sz
*
out_khwn
.
mDesc
.
GetElementSpace
());
DeviceMem
out_khwn_device_buf
(
data_sz
*
out_khwn
.
mDesc
.
GetElementSpace
());
in_chwn_device_buf
.
ToDevice
(
in_chwn
.
mData
.
data
());
in_chwn_device_buf
.
ToDevice
(
in_chwn
.
mData
.
data
());
wei_c
sr
k_device_buf
.
ToDevice
(
wei_c
sr
k
.
mData
.
data
());
wei_c
yx
k_device_buf
.
ToDevice
(
wei_c
yx
k
.
mData
.
data
());
out_khwn_device_buf
.
ToDevice
(
out_khwn
.
mData
.
data
());
out_khwn_device_buf
.
ToDevice
(
out_khwn
.
mData
.
data
());
#if 0
#if 0
...
@@ -250,11 +250,11 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
...
@@ -250,11 +250,11 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
for
(
unsigned
i
=
0
;
i
<
nrepeat
;
++
i
)
for
(
unsigned
i
=
0
;
i
<
nrepeat
;
++
i
)
{
{
float
time
=
launch_kernel
(
float
time
=
launch_kernel
(
gridwise_implicit_gemm_convolution_1_chwn_c
sr
k_khwn_padded
<
GridSize
,
gridwise_implicit_gemm_convolution_1_chwn_c
yx
k_khwn_padded
<
GridSize
,
BlockSize
,
BlockSize
,
T
,
T
,
decltype
(
in_chwn_desc
),
decltype
(
in_chwn_desc
),
decltype
(
wei_c
sr
k_desc
),
decltype
(
wei_c
yx
k_desc
),
decltype
(
out_khwn_desc
),
decltype
(
out_khwn_desc
),
LowerPads
,
LowerPads
,
UpperPads
,
UpperPads
,
...
@@ -274,7 +274,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
...
@@ -274,7 +274,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
dim3
(
BlockSize
),
dim3
(
BlockSize
),
static_cast
<
T
*>
(
in_chwn_device_buf
.
GetDeviceBuffer
()),
static_cast
<
T
*>
(
in_chwn_device_buf
.
GetDeviceBuffer
()),
static_cast
<
T
*>
(
wei_c
sr
k_device_buf
.
GetDeviceBuffer
()),
static_cast
<
T
*>
(
wei_c
yx
k_device_buf
.
GetDeviceBuffer
()),
static_cast
<
T
*>
(
out_khwn_device_buf
.
GetDeviceBuffer
()));
static_cast
<
T
*>
(
out_khwn_device_buf
.
GetDeviceBuffer
()));
printf
(
"Elapsed time : %f ms
\n
"
,
time
);
printf
(
"Elapsed time : %f ms
\n
"
,
time
);
...
...
driver/device_implicit_gemm_convolution_2_chwn_c
sr
k_khwn.hpp
→
driver/device_implicit_gemm_convolution_2_chwn_c
yx
k_khwn.hpp
View file @
ce0182ce
#pragma once
#pragma once
#include <unistd.h>
#include <unistd.h>
#include "device.hpp"
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_2_chwn_c
sr
k_khwn_lds_double_buffer.hip.hpp"
#include "gridwise_implicit_gemm_convolution_2_chwn_c
yx
k_khwn_lds_double_buffer.hip.hpp"
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_implicit_gemm_convolution_2_chwn_c
sr
k_khwn
(
InDesc
,
void
device_implicit_gemm_convolution_2_chwn_c
yx
k_khwn
(
InDesc
,
const
Tensor
<
T
>&
in_nchw
,
const
Tensor
<
T
>&
in_nchw
,
WeiDesc
,
WeiDesc
,
const
Tensor
<
T
>&
wei_kc
sr
,
const
Tensor
<
T
>&
wei_kc
yx
,
OutDesc
,
OutDesc
,
Tensor
<
T
>&
out_nkhw
,
Tensor
<
T
>&
out_nkhw
,
unsigned
nrepeat
)
unsigned
nrepeat
)
...
@@ -18,7 +18,7 @@ void device_implicit_gemm_convolution_2_chwn_csrk_khwn(InDesc,
...
@@ -18,7 +18,7 @@ void device_implicit_gemm_convolution_2_chwn_csrk_khwn(InDesc,
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
in_nchw_desc
=
InDesc
{};
constexpr
auto
in_nchw_desc
=
InDesc
{};
constexpr
auto
wei_kc
sr
_desc
=
WeiDesc
{};
constexpr
auto
wei_kc
yx
_desc
=
WeiDesc
{};
constexpr
auto
out_nkhw_desc
=
OutDesc
{};
constexpr
auto
out_nkhw_desc
=
OutDesc
{};
constexpr
unsigned
N
=
in_nchw_desc
.
GetLength
(
I0
);
constexpr
unsigned
N
=
in_nchw_desc
.
GetLength
(
I0
);
...
@@ -28,10 +28,10 @@ void device_implicit_gemm_convolution_2_chwn_csrk_khwn(InDesc,
...
@@ -28,10 +28,10 @@ void device_implicit_gemm_convolution_2_chwn_csrk_khwn(InDesc,
constexpr
unsigned
Ho
=
out_nkhw_desc
.
GetLength
(
I2
);
constexpr
unsigned
Ho
=
out_nkhw_desc
.
GetLength
(
I2
);
constexpr
unsigned
Wo
=
out_nkhw_desc
.
GetLength
(
I3
);
constexpr
unsigned
Wo
=
out_nkhw_desc
.
GetLength
(
I3
);
constexpr
unsigned
K
=
wei_kc
sr
_desc
.
GetLength
(
I0
);
constexpr
unsigned
K
=
wei_kc
yx
_desc
.
GetLength
(
I0
);
constexpr
unsigned
C
=
wei_kc
sr
_desc
.
GetLength
(
I1
);
constexpr
unsigned
C
=
wei_kc
yx
_desc
.
GetLength
(
I1
);
constexpr
unsigned
Y
=
wei_kc
sr
_desc
.
GetLength
(
I2
);
constexpr
unsigned
Y
=
wei_kc
yx
_desc
.
GetLength
(
I2
);
constexpr
unsigned
X
=
wei_kc
sr
_desc
.
GetLength
(
I3
);
constexpr
unsigned
X
=
wei_kc
yx
_desc
.
GetLength
(
I3
);
constexpr
unsigned
BGhostRead
=
(
Y
-
1
)
*
Wi
+
(
X
-
1
);
constexpr
unsigned
BGhostRead
=
(
Y
-
1
)
*
Wi
+
(
X
-
1
);
...
@@ -48,14 +48,14 @@ void device_implicit_gemm_convolution_2_chwn_csrk_khwn(InDesc,
...
@@ -48,14 +48,14 @@ void device_implicit_gemm_convolution_2_chwn_csrk_khwn(InDesc,
Hi
,
Hi
,
Wi
)(
std
::
thread
::
hardware_concurrency
());
Wi
)(
std
::
thread
::
hardware_concurrency
());
// convert wei_kc
sr
to wei_c
sr
k
// convert wei_kc
yx
to wei_c
yx
k
auto
wei_c
sr
k_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
auto
wei_c
yx
k_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
ostream_ConstantTensorDescriptor
(
wei_c
sr
k_desc
,
std
::
cout
<<
"wei_c
sr
k_desc: "
);
ostream_ConstantTensorDescriptor
(
wei_c
yx
k_desc
,
std
::
cout
<<
"wei_c
yx
k_desc: "
);
Tensor
<
T
>
wei_c
sr
k
(
make_TensorDescriptor
(
wei_c
sr
k_desc
));
Tensor
<
T
>
wei_c
yx
k
(
make_TensorDescriptor
(
wei_c
yx
k_desc
));
make_ParallelTensorFunctor
(
make_ParallelTensorFunctor
(
[
&
](
auto
k
,
auto
c
,
auto
s
,
auto
r
)
{
wei_c
sr
k
(
c
,
s
,
r
,
k
)
=
wei_kc
sr
(
k
,
c
,
s
,
r
);
},
[
&
](
auto
k
,
auto
c
,
auto
y
,
auto
x
)
{
wei_c
yx
k
(
c
,
y
,
x
,
k
)
=
wei_kc
yx
(
k
,
c
,
y
,
x
);
},
K
,
K
,
C
,
C
,
Y
,
Y
,
...
@@ -200,22 +200,22 @@ void device_implicit_gemm_convolution_2_chwn_csrk_khwn(InDesc,
...
@@ -200,22 +200,22 @@ void device_implicit_gemm_convolution_2_chwn_csrk_khwn(InDesc,
std
::
size_t
data_sz
=
sizeof
(
T
);
std
::
size_t
data_sz
=
sizeof
(
T
);
DeviceMem
in_chwn_device_buf
(
data_sz
*
(
in_chwn
.
mDesc
.
GetElementSpace
()
+
BGhostRead
+
DeviceMem
in_chwn_device_buf
(
data_sz
*
(
in_chwn
.
mDesc
.
GetElementSpace
()
+
BGhostRead
+
BPerBlock
));
// reserve extra space for BGhostRead
BPerBlock
));
// reserve extra space for BGhostRead
DeviceMem
wei_c
sr
k_device_buf
(
data_sz
*
wei_c
sr
k
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_c
yx
k_device_buf
(
data_sz
*
wei_c
yx
k
.
mDesc
.
GetElementSpace
());
DeviceMem
out_khwn_device_buf
(
data_sz
*
out_khwn
.
mDesc
.
GetElementSpace
());
DeviceMem
out_khwn_device_buf
(
data_sz
*
out_khwn
.
mDesc
.
GetElementSpace
());
in_chwn_device_buf
.
ToDevice
(
in_chwn
.
mData
.
data
());
in_chwn_device_buf
.
ToDevice
(
in_chwn
.
mData
.
data
());
wei_c
sr
k_device_buf
.
ToDevice
(
wei_c
sr
k
.
mData
.
data
());
wei_c
yx
k_device_buf
.
ToDevice
(
wei_c
yx
k
.
mData
.
data
());
out_khwn_device_buf
.
ToDevice
(
out_khwn
.
mData
.
data
());
out_khwn_device_buf
.
ToDevice
(
out_khwn
.
mData
.
data
());
for
(
unsigned
i
=
0
;
i
<
nrepeat
;
++
i
)
for
(
unsigned
i
=
0
;
i
<
nrepeat
;
++
i
)
{
{
float
time
=
float
time
=
launch_kernel
(
gridwise_implicit_gemm_convolution_2_chwn_c
sr
k_khwn_lds_double_buffer
<
launch_kernel
(
gridwise_implicit_gemm_convolution_2_chwn_c
yx
k_khwn_lds_double_buffer
<
GridSize
,
GridSize
,
BlockSize
,
BlockSize
,
T
,
T
,
decltype
(
in_chwn_desc
),
decltype
(
in_chwn_desc
),
decltype
(
wei_c
sr
k_desc
),
decltype
(
wei_c
yx
k_desc
),
decltype
(
out_khwn_desc
),
decltype
(
out_khwn_desc
),
BPerBlock
,
BPerBlock
,
KPerBlock
,
KPerBlock
,
...
@@ -240,7 +240,7 @@ void device_implicit_gemm_convolution_2_chwn_csrk_khwn(InDesc,
...
@@ -240,7 +240,7 @@ void device_implicit_gemm_convolution_2_chwn_csrk_khwn(InDesc,
dim3
(
GridSize
),
dim3
(
GridSize
),
dim3
(
BlockSize
),
dim3
(
BlockSize
),
static_cast
<
T
*>
(
in_chwn_device_buf
.
GetDeviceBuffer
()),
static_cast
<
T
*>
(
in_chwn_device_buf
.
GetDeviceBuffer
()),
static_cast
<
T
*>
(
wei_c
sr
k_device_buf
.
GetDeviceBuffer
()),
static_cast
<
T
*>
(
wei_c
yx
k_device_buf
.
GetDeviceBuffer
()),
static_cast
<
T
*>
(
out_khwn_device_buf
.
GetDeviceBuffer
()));
static_cast
<
T
*>
(
out_khwn_device_buf
.
GetDeviceBuffer
()));
printf
(
"Elapsed time : %f ms
\n
"
,
time
);
printf
(
"Elapsed time : %f ms
\n
"
,
time
);
...
...
driver/driver.hip.cpp
View file @
ce0182ce
...
@@ -9,9 +9,9 @@
...
@@ -9,9 +9,9 @@
#include "conv_common.hip.hpp"
#include "conv_common.hip.hpp"
#include "device_direct_convolution_1.hpp"
#include "device_direct_convolution_1.hpp"
#include "device_direct_convolution_2.hpp"
#include "device_direct_convolution_2.hpp"
#include "device_implicit_gemm_convolution_1_chwn_c
sr
k_khwn.hpp"
#include "device_implicit_gemm_convolution_1_chwn_c
yx
k_khwn.hpp"
#include "device_implicit_gemm_convolution_1_chwn_c
sr
k_khwn_padded.hpp"
#include "device_implicit_gemm_convolution_1_chwn_c
yx
k_khwn_padded.hpp"
#include "device_implicit_gemm_convolution_2_chwn_c
sr
k_khwn.hpp"
#include "device_implicit_gemm_convolution_2_chwn_c
yx
k_khwn.hpp"
struct
GeneratorTensor_1
struct
GeneratorTensor_1
{
{
...
@@ -108,7 +108,7 @@ auto make_TensorDescriptor(TConstTensorDesc)
...
@@ -108,7 +108,7 @@ auto make_TensorDescriptor(TConstTensorDesc)
template
<
class
T
,
class
LowerPads
,
class
UpperPads
>
template
<
class
T
,
class
LowerPads
,
class
UpperPads
>
void
host_direct_convolution
(
void
host_direct_convolution
(
const
Tensor
<
T
>&
in_nchw
,
const
Tensor
<
T
>&
wei_kc
sr
,
Tensor
<
T
>&
out
,
LowerPads
,
UpperPads
)
const
Tensor
<
T
>&
in_nchw
,
const
Tensor
<
T
>&
wei_kc
yx
,
Tensor
<
T
>&
out
,
LowerPads
,
UpperPads
)
{
{
unsigned
h_pad_low
=
LowerPads
{}.
Get
(
Number
<
0
>
{});
unsigned
h_pad_low
=
LowerPads
{}.
Get
(
Number
<
0
>
{});
unsigned
w_pad_low
=
LowerPads
{}.
Get
(
Number
<
1
>
{});
unsigned
w_pad_low
=
LowerPads
{}.
Get
(
Number
<
1
>
{});
...
@@ -118,18 +118,18 @@ void host_direct_convolution(
...
@@ -118,18 +118,18 @@ void host_direct_convolution(
auto
f
=
[
&
](
auto
n
,
auto
k
,
auto
ho
,
auto
wo
)
{
auto
f
=
[
&
](
auto
n
,
auto
k
,
auto
ho
,
auto
wo
)
{
double
v
=
0
;
double
v
=
0
;
for
(
int
c
=
0
;
c
<
wei_kc
sr
.
mDesc
.
GetLengths
()[
1
];
++
c
)
for
(
int
c
=
0
;
c
<
wei_kc
yx
.
mDesc
.
GetLengths
()[
1
];
++
c
)
{
{
for
(
int
y
=
0
;
y
<
wei_kc
sr
.
mDesc
.
GetLengths
()[
2
];
++
y
)
for
(
int
y
=
0
;
y
<
wei_kc
yx
.
mDesc
.
GetLengths
()[
2
];
++
y
)
{
{
int
hi
=
ho
+
y
-
h_pad_low
;
int
hi
=
ho
+
y
-
h_pad_low
;
for
(
int
x
=
0
;
x
<
wei_kc
sr
.
mDesc
.
GetLengths
()[
3
];
++
x
)
for
(
int
x
=
0
;
x
<
wei_kc
yx
.
mDesc
.
GetLengths
()[
3
];
++
x
)
{
{
int
wi
=
wo
+
x
-
w_pad_low
;
int
wi
=
wo
+
x
-
w_pad_low
;
if
(
hi
>=
0
&&
hi
<
in_nchw
.
mDesc
.
GetLengths
()[
2
]
&&
wi
>=
0
&&
if
(
hi
>=
0
&&
hi
<
in_nchw
.
mDesc
.
GetLengths
()[
2
]
&&
wi
>=
0
&&
wi
<
in_nchw
.
mDesc
.
GetLengths
()[
3
])
wi
<
in_nchw
.
mDesc
.
GetLengths
()[
3
])
{
{
v
+=
in_nchw
(
n
,
c
,
hi
,
wi
)
*
wei_kc
sr
(
k
,
c
,
y
,
x
);
v
+=
in_nchw
(
n
,
c
,
hi
,
wi
)
*
wei_kc
yx
(
k
,
c
,
y
,
x
);
}
}
}
}
}
}
...
@@ -148,7 +148,7 @@ void host_direct_convolution(
...
@@ -148,7 +148,7 @@ void host_direct_convolution(
template
<
class
T
,
class
LowerPads
,
class
UpperPads
>
template
<
class
T
,
class
LowerPads
,
class
UpperPads
>
void
host_winograd_3x3_convolution
(
void
host_winograd_3x3_convolution
(
const
Tensor
<
T
>&
in_nchw
,
const
Tensor
<
T
>&
wei_kc
sr
,
Tensor
<
T
>&
out
,
LowerPads
,
UpperPads
)
const
Tensor
<
T
>&
in_nchw
,
const
Tensor
<
T
>&
wei_kc
yx
,
Tensor
<
T
>&
out
,
LowerPads
,
UpperPads
)
{
{
constexpr
std
::
size_t
HoPerTile
=
2
;
constexpr
std
::
size_t
HoPerTile
=
2
;
constexpr
std
::
size_t
WoPerTile
=
2
;
constexpr
std
::
size_t
WoPerTile
=
2
;
...
@@ -158,9 +158,9 @@ void host_winograd_3x3_convolution(
...
@@ -158,9 +158,9 @@ void host_winograd_3x3_convolution(
std
::
size_t
HI
=
in_nchw
.
mDesc
.
GetLengths
()[
2
];
std
::
size_t
HI
=
in_nchw
.
mDesc
.
GetLengths
()[
2
];
std
::
size_t
WI
=
in_nchw
.
mDesc
.
GetLengths
()[
3
];
std
::
size_t
WI
=
in_nchw
.
mDesc
.
GetLengths
()[
3
];
std
::
size_t
K
=
wei_kc
sr
.
mDesc
.
GetLengths
()[
0
];
std
::
size_t
K
=
wei_kc
yx
.
mDesc
.
GetLengths
()[
0
];
std
::
size_t
Y
=
wei_kc
sr
.
mDesc
.
GetLengths
()[
2
];
std
::
size_t
Y
=
wei_kc
yx
.
mDesc
.
GetLengths
()[
2
];
std
::
size_t
X
=
wei_kc
sr
.
mDesc
.
GetLengths
()[
3
];
std
::
size_t
X
=
wei_kc
yx
.
mDesc
.
GetLengths
()[
3
];
std
::
size_t
HO
=
out
.
mDesc
.
GetLengths
()[
2
];
std
::
size_t
HO
=
out
.
mDesc
.
GetLengths
()[
2
];
std
::
size_t
WO
=
out
.
mDesc
.
GetLengths
()[
3
];
std
::
size_t
WO
=
out
.
mDesc
.
GetLengths
()[
3
];
...
@@ -259,49 +259,49 @@ void host_winograd_3x3_convolution(
...
@@ -259,49 +259,49 @@ void host_winograd_3x3_convolution(
};
};
auto
f_wei_transform
=
[
&
](
auto
k
,
auto
c
)
{
auto
f_wei_transform
=
[
&
](
auto
k
,
auto
c
)
{
wei_transform
(
k
,
c
,
0
,
0
)
=
wei_kc
sr
(
k
,
c
,
0
,
0
);
wei_transform
(
k
,
c
,
0
,
0
)
=
wei_kc
yx
(
k
,
c
,
0
,
0
);
wei_transform
(
k
,
c
,
0
,
1
)
=
wei_transform
(
k
,
c
,
0
,
1
)
=
0.5
*
wei_kc
sr
(
k
,
c
,
0
,
0
)
+
0.5
*
wei_kc
sr
(
k
,
c
,
0
,
1
)
+
0.5
*
wei_kc
sr
(
k
,
c
,
0
,
2
);
0.5
*
wei_kc
yx
(
k
,
c
,
0
,
0
)
+
0.5
*
wei_kc
yx
(
k
,
c
,
0
,
1
)
+
0.5
*
wei_kc
yx
(
k
,
c
,
0
,
2
);
wei_transform
(
k
,
c
,
0
,
2
)
=
wei_transform
(
k
,
c
,
0
,
2
)
=
0.5
*
wei_kc
sr
(
k
,
c
,
0
,
0
)
-
0.5
*
wei_kc
sr
(
k
,
c
,
0
,
1
)
+
0.5
*
wei_kc
sr
(
k
,
c
,
0
,
2
);
0.5
*
wei_kc
yx
(
k
,
c
,
0
,
0
)
-
0.5
*
wei_kc
yx
(
k
,
c
,
0
,
1
)
+
0.5
*
wei_kc
yx
(
k
,
c
,
0
,
2
);
wei_transform
(
k
,
c
,
0
,
3
)
=
wei_kc
sr
(
k
,
c
,
0
,
2
);
wei_transform
(
k
,
c
,
0
,
3
)
=
wei_kc
yx
(
k
,
c
,
0
,
2
);
wei_transform
(
k
,
c
,
1
,
0
)
=
wei_transform
(
k
,
c
,
1
,
0
)
=
0.5
*
wei_kc
sr
(
k
,
c
,
0
,
0
)
+
0.5
*
wei_kc
sr
(
k
,
c
,
1
,
0
)
+
0.5
*
wei_kc
sr
(
k
,
c
,
2
,
0
);
0.5
*
wei_kc
yx
(
k
,
c
,
0
,
0
)
+
0.5
*
wei_kc
yx
(
k
,
c
,
1
,
0
)
+
0.5
*
wei_kc
yx
(
k
,
c
,
2
,
0
);
wei_transform
(
k
,
c
,
1
,
1
)
=
0.25
*
wei_kc
sr
(
k
,
c
,
0
,
0
)
+
0.25
*
wei_kc
sr
(
k
,
c
,
0
,
1
)
+
wei_transform
(
k
,
c
,
1
,
1
)
=
0.25
*
wei_kc
yx
(
k
,
c
,
0
,
0
)
+
0.25
*
wei_kc
yx
(
k
,
c
,
0
,
1
)
+
0.25
*
wei_kc
sr
(
k
,
c
,
0
,
2
)
+
0.25
*
wei_kc
sr
(
k
,
c
,
1
,
0
)
+
0.25
*
wei_kc
yx
(
k
,
c
,
0
,
2
)
+
0.25
*
wei_kc
yx
(
k
,
c
,
1
,
0
)
+
0.25
*
wei_kc
sr
(
k
,
c
,
1
,
1
)
+
0.25
*
wei_kc
sr
(
k
,
c
,
1
,
2
)
+
0.25
*
wei_kc
yx
(
k
,
c
,
1
,
1
)
+
0.25
*
wei_kc
yx
(
k
,
c
,
1
,
2
)
+
0.25
*
wei_kc
sr
(
k
,
c
,
2
,
0
)
+
0.25
*
wei_kc
sr
(
k
,
c
,
2
,
1
)
+
0.25
*
wei_kc
yx
(
k
,
c
,
2
,
0
)
+
0.25
*
wei_kc
yx
(
k
,
c
,
2
,
1
)
+
0.25
*
wei_kc
sr
(
k
,
c
,
2
,
2
);
0.25
*
wei_kc
yx
(
k
,
c
,
2
,
2
);
wei_transform
(
k
,
c
,
1
,
2
)
=
0.25
*
wei_kc
sr
(
k
,
c
,
0
,
0
)
-
0.25
*
wei_kc
sr
(
k
,
c
,
0
,
1
)
+
wei_transform
(
k
,
c
,
1
,
2
)
=
0.25
*
wei_kc
yx
(
k
,
c
,
0
,
0
)
-
0.25
*
wei_kc
yx
(
k
,
c
,
0
,
1
)
+
0.25
*
wei_kc
sr
(
k
,
c
,
0
,
2
)
+
0.25
*
wei_kc
sr
(
k
,
c
,
1
,
0
)
-
0.25
*
wei_kc
yx
(
k
,
c
,
0
,
2
)
+
0.25
*
wei_kc
yx
(
k
,
c
,
1
,
0
)
-
0.25
*
wei_kc
sr
(
k
,
c
,
1
,
1
)
+
0.25
*
wei_kc
sr
(
k
,
c
,
1
,
2
)
+
0.25
*
wei_kc
yx
(
k
,
c
,
1
,
1
)
+
0.25
*
wei_kc
yx
(
k
,
c
,
1
,
2
)
+
0.25
*
wei_kc
sr
(
k
,
c
,
2
,
0
)
-
0.25
*
wei_kc
sr
(
k
,
c
,
2
,
1
)
+
0.25
*
wei_kc
yx
(
k
,
c
,
2
,
0
)
-
0.25
*
wei_kc
yx
(
k
,
c
,
2
,
1
)
+
0.25
*
wei_kc
sr
(
k
,
c
,
2
,
2
);
0.25
*
wei_kc
yx
(
k
,
c
,
2
,
2
);
wei_transform
(
k
,
c
,
1
,
3
)
=
wei_transform
(
k
,
c
,
1
,
3
)
=
0.5
*
wei_kc
sr
(
k
,
c
,
0
,
2
)
+
0.5
*
wei_kc
sr
(
k
,
c
,
1
,
2
)
+
0.5
*
wei_kc
sr
(
k
,
c
,
2
,
2
);
0.5
*
wei_kc
yx
(
k
,
c
,
0
,
2
)
+
0.5
*
wei_kc
yx
(
k
,
c
,
1
,
2
)
+
0.5
*
wei_kc
yx
(
k
,
c
,
2
,
2
);
wei_transform
(
k
,
c
,
2
,
0
)
=
wei_transform
(
k
,
c
,
2
,
0
)
=
0.5
*
wei_kc
sr
(
k
,
c
,
0
,
0
)
-
0.5
*
wei_kc
sr
(
k
,
c
,
1
,
0
)
+
0.5
*
wei_kc
sr
(
k
,
c
,
2
,
0
);
0.5
*
wei_kc
yx
(
k
,
c
,
0
,
0
)
-
0.5
*
wei_kc
yx
(
k
,
c
,
1
,
0
)
+
0.5
*
wei_kc
yx
(
k
,
c
,
2
,
0
);
wei_transform
(
k
,
c
,
2
,
1
)
=
0.25
*
wei_kc
sr
(
k
,
c
,
0
,
0
)
+
0.25
*
wei_kc
sr
(
k
,
c
,
0
,
1
)
+
wei_transform
(
k
,
c
,
2
,
1
)
=
0.25
*
wei_kc
yx
(
k
,
c
,
0
,
0
)
+
0.25
*
wei_kc
yx
(
k
,
c
,
0
,
1
)
+
0.25
*
wei_kc
sr
(
k
,
c
,
0
,
2
)
-
0.25
*
wei_kc
sr
(
k
,
c
,
1
,
0
)
-
0.25
*
wei_kc
yx
(
k
,
c
,
0
,
2
)
-
0.25
*
wei_kc
yx
(
k
,
c
,
1
,
0
)
-
0.25
*
wei_kc
sr
(
k
,
c
,
1
,
1
)
-
0.25
*
wei_kc
sr
(
k
,
c
,
1
,
2
)
+
0.25
*
wei_kc
yx
(
k
,
c
,
1
,
1
)
-
0.25
*
wei_kc
yx
(
k
,
c
,
1
,
2
)
+
0.25
*
wei_kc
sr
(
k
,
c
,
2
,
0
)
+
0.25
*
wei_kc
sr
(
k
,
c
,
2
,
1
)
+
0.25
*
wei_kc
yx
(
k
,
c
,
2
,
0
)
+
0.25
*
wei_kc
yx
(
k
,
c
,
2
,
1
)
+
0.25
*
wei_kc
sr
(
k
,
c
,
2
,
2
);
0.25
*
wei_kc
yx
(
k
,
c
,
2
,
2
);
wei_transform
(
k
,
c
,
2
,
2
)
=
0.25
*
wei_kc
sr
(
k
,
c
,
0
,
0
)
-
0.25
*
wei_kc
sr
(
k
,
c
,
0
,
1
)
+
wei_transform
(
k
,
c
,
2
,
2
)
=
0.25
*
wei_kc
yx
(
k
,
c
,
0
,
0
)
-
0.25
*
wei_kc
yx
(
k
,
c
,
0
,
1
)
+
0.25
*
wei_kc
sr
(
k
,
c
,
0
,
2
)
-
0.25
*
wei_kc
sr
(
k
,
c
,
1
,
0
)
+
0.25
*
wei_kc
yx
(
k
,
c
,
0
,
2
)
-
0.25
*
wei_kc
yx
(
k
,
c
,
1
,
0
)
+
0.25
*
wei_kc
sr
(
k
,
c
,
1
,
1
)
-
0.25
*
wei_kc
sr
(
k
,
c
,
1
,
2
)
+
0.25
*
wei_kc
yx
(
k
,
c
,
1
,
1
)
-
0.25
*
wei_kc
yx
(
k
,
c
,
1
,
2
)
+
0.25
*
wei_kc
sr
(
k
,
c
,
2
,
0
)
-
0.25
*
wei_kc
sr
(
k
,
c
,
2
,
1
)
+
0.25
*
wei_kc
yx
(
k
,
c
,
2
,
0
)
-
0.25
*
wei_kc
yx
(
k
,
c
,
2
,
1
)
+
0.25
*
wei_kc
sr
(
k
,
c
,
2
,
2
);
0.25
*
wei_kc
yx
(
k
,
c
,
2
,
2
);
wei_transform
(
k
,
c
,
2
,
3
)
=
wei_transform
(
k
,
c
,
2
,
3
)
=
0.5
*
wei_kc
sr
(
k
,
c
,
0
,
2
)
-
0.5
*
wei_kc
sr
(
k
,
c
,
1
,
2
)
+
0.5
*
wei_kc
sr
(
k
,
c
,
2
,
2
);
0.5
*
wei_kc
yx
(
k
,
c
,
0
,
2
)
-
0.5
*
wei_kc
yx
(
k
,
c
,
1
,
2
)
+
0.5
*
wei_kc
yx
(
k
,
c
,
2
,
2
);
wei_transform
(
k
,
c
,
3
,
0
)
=
wei_kc
sr
(
k
,
c
,
2
,
0
);
wei_transform
(
k
,
c
,
3
,
0
)
=
wei_kc
yx
(
k
,
c
,
2
,
0
);
wei_transform
(
k
,
c
,
3
,
1
)
=
wei_transform
(
k
,
c
,
3
,
1
)
=
0.5
*
wei_kc
sr
(
k
,
c
,
2
,
0
)
+
0.5
*
wei_kc
sr
(
k
,
c
,
2
,
1
)
+
0.5
*
wei_kc
sr
(
k
,
c
,
2
,
2
);
0.5
*
wei_kc
yx
(
k
,
c
,
2
,
0
)
+
0.5
*
wei_kc
yx
(
k
,
c
,
2
,
1
)
+
0.5
*
wei_kc
yx
(
k
,
c
,
2
,
2
);
wei_transform
(
k
,
c
,
3
,
2
)
=
wei_transform
(
k
,
c
,
3
,
2
)
=
0.5
*
wei_kc
sr
(
k
,
c
,
2
,
0
)
-
0.5
*
wei_kc
sr
(
k
,
c
,
2
,
1
)
+
0.5
*
wei_kc
sr
(
k
,
c
,
2
,
2
);
0.5
*
wei_kc
yx
(
k
,
c
,
2
,
0
)
-
0.5
*
wei_kc
yx
(
k
,
c
,
2
,
1
)
+
0.5
*
wei_kc
yx
(
k
,
c
,
2
,
2
);
wei_transform
(
k
,
c
,
3
,
3
)
=
wei_kc
sr
(
k
,
c
,
2
,
2
);
wei_transform
(
k
,
c
,
3
,
3
)
=
wei_kc
yx
(
k
,
c
,
2
,
2
);
};
};
auto
f_out_transform
=
[
&
](
auto
n
,
auto
k
,
auto
htile
,
auto
wtile
)
{
auto
f_out_transform
=
[
&
](
auto
n
,
auto
k
,
auto
htile
,
auto
wtile
)
{
...
@@ -569,16 +569,16 @@ int main(int argc, char* argv[])
...
@@ -569,16 +569,16 @@ int main(int argc, char* argv[])
auto
upper_pads
=
Sequence
<
HPad
,
WPad
>
{};
auto
upper_pads
=
Sequence
<
HPad
,
WPad
>
{};
auto
in_nchw_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
auto
in_nchw_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
auto
wei_kc
sr
_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
K
,
C
,
Y
,
X
>
{});
auto
wei_kc
yx
_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
K
,
C
,
Y
,
X
>
{});
auto
out_nkhw_desc
=
get_convolution_with_padding_output_default_4d_tensor_descriptor
(
auto
out_nkhw_desc
=
get_convolution_with_padding_output_default_4d_tensor_descriptor
(
in_nchw_desc
,
wei_kc
sr
_desc
,
lower_pads
,
upper_pads
);
in_nchw_desc
,
wei_kc
yx
_desc
,
lower_pads
,
upper_pads
);
ostream_ConstantTensorDescriptor
(
in_nchw_desc
,
std
::
cout
<<
"in_nchw_desc: "
);
ostream_ConstantTensorDescriptor
(
in_nchw_desc
,
std
::
cout
<<
"in_nchw_desc: "
);
ostream_ConstantTensorDescriptor
(
wei_kc
sr
_desc
,
std
::
cout
<<
"wei_kc
sr
_desc: "
);
ostream_ConstantTensorDescriptor
(
wei_kc
yx
_desc
,
std
::
cout
<<
"wei_kc
yx
_desc: "
);
ostream_ConstantTensorDescriptor
(
out_nkhw_desc
,
std
::
cout
<<
"out_nkhw_desc: "
);
ostream_ConstantTensorDescriptor
(
out_nkhw_desc
,
std
::
cout
<<
"out_nkhw_desc: "
);
Tensor
<
half
>
in_nchw
(
make_TensorDescriptor
(
in_nchw_desc
));
Tensor
<
half
>
in_nchw
(
make_TensorDescriptor
(
in_nchw_desc
));
Tensor
<
half
>
wei_kc
sr
(
make_TensorDescriptor
(
wei_kc
sr
_desc
));
Tensor
<
half
>
wei_kc
yx
(
make_TensorDescriptor
(
wei_kc
yx
_desc
));
Tensor
<
half
>
out_nkhw_host
(
make_TensorDescriptor
(
out_nkhw_desc
));
Tensor
<
half
>
out_nkhw_host
(
make_TensorDescriptor
(
out_nkhw_desc
));
Tensor
<
half
>
out_nkhw_device
(
make_TensorDescriptor
(
out_nkhw_desc
));
Tensor
<
half
>
out_nkhw_device
(
make_TensorDescriptor
(
out_nkhw_desc
));
...
@@ -597,13 +597,13 @@ int main(int argc, char* argv[])
...
@@ -597,13 +597,13 @@ int main(int argc, char* argv[])
{
{
#if 0
#if 0
in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei_kc
sr
.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei_kc
yx
.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#elif
1
#elif
1
in_nchw
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
in_nchw
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
wei_kc
sr
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
wei_kc
yx
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
#elif 1
#elif 1
in_nchw
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
2
,
2
},
num_thread
);
in_nchw
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
2
,
2
},
num_thread
);
wei_kc
sr
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
wei_kc
yx
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
#endif
#endif
}
}
...
@@ -613,17 +613,17 @@ int main(int argc, char* argv[])
...
@@ -613,17 +613,17 @@ int main(int argc, char* argv[])
#elif
0
#elif
0
device_direct_convolution_2
device_direct_convolution_2
#elif 1
#elif 1
device_implicit_gemm_convolution_1_chwn_c
sr
k_khwn
device_implicit_gemm_convolution_1_chwn_c
yx
k_khwn
#elif 0
#elif 0
device_implicit_gemm_convolution_2_chwn_c
sr
k_khwn
device_implicit_gemm_convolution_2_chwn_c
yx
k_khwn
#endif
#endif
(
in_nchw_desc
,
in_nchw
,
wei_kc
sr
_desc
,
wei_kc
sr
,
out_nkhw_desc
,
out_nkhw_device
,
nrepeat
);
(
in_nchw_desc
,
in_nchw
,
wei_kc
yx
_desc
,
wei_kc
yx
,
out_nkhw_desc
,
out_nkhw_device
,
nrepeat
);
#elif 1
#elif 1
device_implicit_gemm_convolution_1_chwn_c
sr
k_khwn_padded
(
in_nchw_desc
,
device_implicit_gemm_convolution_1_chwn_c
yx
k_khwn_padded
(
in_nchw_desc
,
in_nchw
,
in_nchw
,
wei_kc
sr
_desc
,
wei_kc
yx
_desc
,
wei_kc
sr
,
wei_kc
yx
,
out_nkhw_desc
,
out_nkhw_desc
,
out_nkhw_device
,
out_nkhw_device
,
lower_pads
,
lower_pads
,
...
@@ -636,18 +636,18 @@ int main(int argc, char* argv[])
...
@@ -636,18 +636,18 @@ int main(int argc, char* argv[])
#if 0
#if 0
if(Y == 3 && X == 3)
if(Y == 3 && X == 3)
{
{
host_winograd_3x3_convolution(in_nchw, wei_kc
sr
, out_nkhw_host, lower_pads, upper_pads);
host_winograd_3x3_convolution(in_nchw, wei_kc
yx
, out_nkhw_host, lower_pads, upper_pads);
}
}
else
else
{
{
host_direct_convolution(in_nchw, wei_kc
sr
, out_nkhw_host, lower_pads, upper_pads);
host_direct_convolution(in_nchw, wei_kc
yx
, out_nkhw_host, lower_pads, upper_pads);
}
}
check_error(out_nkhw_host, out_nkhw_device);
check_error(out_nkhw_host, out_nkhw_device);
#endif
#endif
#if 0
#if 0
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
LogRange(std::cout << "wei_kc
sr
: ", wei_kc
sr
.mData, ",") << std::endl;
LogRange(std::cout << "wei_kc
yx
: ", wei_kc
yx
.mData, ",") << std::endl;
LogRange(std::cout << "out_nkhw_host : ", out_nkhw_host.mData, ",") << std::endl;
LogRange(std::cout << "out_nkhw_host : ", out_nkhw_host.mData, ",") << std::endl;
LogRange(std::cout << "out_nkhw_device: ", out_nkhw_device.mData, ",") << std::endl;
LogRange(std::cout << "out_nkhw_device: ", out_nkhw_device.mData, ",") << std::endl;
#endif
#endif
...
...
src/include/gridwise_implicit_gemm_convolution_1_chwn_c
sr
k_khwn.hip.hpp
→
src/include/gridwise_implicit_gemm_convolution_1_chwn_c
yx
k_khwn.hip.hpp
View file @
ce0182ce
...
@@ -35,7 +35,7 @@ template <unsigned GridSize,
...
@@ -35,7 +35,7 @@ template <unsigned GridSize,
unsigned
GemmKPerThreadLoop
,
unsigned
GemmKPerThreadLoop
,
unsigned
OutThreadCopyDataPerWrite
>
unsigned
OutThreadCopyDataPerWrite
>
__global__
void
__global__
void
gridwise_implicit_gemm_convolution_1_chwn_c
sr
k_khwn
(
const
Float
*
const
__restrict__
p_in_global
,
gridwise_implicit_gemm_convolution_1_chwn_c
yx
k_khwn
(
const
Float
*
const
__restrict__
p_in_global
,
const
Float
*
const
__restrict__
p_wei_global
,
const
Float
*
const
__restrict__
p_wei_global
,
Float
*
const
__restrict__
p_out_global
)
Float
*
const
__restrict__
p_out_global
)
{
{
...
@@ -52,7 +52,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
...
@@ -52,7 +52,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
in_chwn_global_desc
=
InGlobalDesc
{};
constexpr
auto
in_chwn_global_desc
=
InGlobalDesc
{};
constexpr
auto
wei_c
sr
k_global_desc
=
WeiGlobalDesc
{};
constexpr
auto
wei_c
yx
k_global_desc
=
WeiGlobalDesc
{};
constexpr
auto
out_khwn_global_desc
=
OutGlobalDesc
{};
constexpr
auto
out_khwn_global_desc
=
OutGlobalDesc
{};
constexpr
unsigned
C
=
in_chwn_global_desc
.
GetLength
(
I0
);
constexpr
unsigned
C
=
in_chwn_global_desc
.
GetLength
(
I0
);
...
@@ -62,8 +62,8 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
...
@@ -62,8 +62,8 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
constexpr
unsigned
Wo
=
out_khwn_global_desc
.
GetLength
(
I2
);
constexpr
unsigned
Wo
=
out_khwn_global_desc
.
GetLength
(
I2
);
constexpr
unsigned
N
=
out_khwn_global_desc
.
GetLength
(
I3
);
constexpr
unsigned
N
=
out_khwn_global_desc
.
GetLength
(
I3
);
constexpr
unsigned
Y
=
wei_c
sr
k_global_desc
.
GetLength
(
I1
);
constexpr
unsigned
Y
=
wei_c
yx
k_global_desc
.
GetLength
(
I1
);
constexpr
unsigned
X
=
wei_c
sr
k_global_desc
.
GetLength
(
I2
);
constexpr
unsigned
X
=
wei_c
yx
k_global_desc
.
GetLength
(
I2
);
constexpr
unsigned
HiPerBlock
=
HoPerBlock
+
Y
-
1
;
constexpr
unsigned
HiPerBlock
=
HoPerBlock
+
Y
-
1
;
constexpr
unsigned
WiPerBlock
=
WoPerBlock
+
X
-
1
;
constexpr
unsigned
WiPerBlock
=
WoPerBlock
+
X
-
1
;
...
@@ -100,7 +100,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
...
@@ -100,7 +100,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
constexpr
auto
wei_ek_block_desc
=
make_ConstantTensorDescriptor_aligned
(
constexpr
auto
wei_ek_block_desc
=
make_ConstantTensorDescriptor_aligned
(
Sequence
<
CPerBlock
*
Y
*
X
,
KPerBlock
>
{},
Number
<
WeiBlockCopyDataPerRead
>
{});
Sequence
<
CPerBlock
*
Y
*
X
,
KPerBlock
>
{},
Number
<
WeiBlockCopyDataPerRead
>
{});
constexpr
auto
wei_c
sr
k_block_desc
=
make_ConstantTensorDescriptor_aligned
(
constexpr
auto
wei_c
yx
k_block_desc
=
make_ConstantTensorDescriptor_aligned
(
Sequence
<
CPerBlock
,
Y
,
X
,
KPerBlock
>
{},
Number
<
WeiBlockCopyDataPerRead
>
{});
Sequence
<
CPerBlock
,
Y
,
X
,
KPerBlock
>
{},
Number
<
WeiBlockCopyDataPerRead
>
{});
// tensor view of threadwise output in register
// tensor view of threadwise output in register
...
@@ -133,7 +133,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
...
@@ -133,7 +133,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[K,Ho,Wo,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[K,Ho,Wo,N]
constexpr
auto
a_cxk_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
constexpr
auto
a_cxk_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{},
Number
<
wei_c
sr
k_block_desc
.
GetStride
(
I0
)
>
{});
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{},
Number
<
wei_c
yx
k_block_desc
.
GetStride
(
I0
)
>
{});
constexpr
auto
b_cxwn_block_mtx_desc
=
constexpr
auto
b_cxwn_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
...
@@ -168,7 +168,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
...
@@ -168,7 +168,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
in_chwn_block_desc
.
GetElementSpace
(
Number
<
InBlockCopyDataPerRead
>
{});
in_chwn_block_desc
.
GetElementSpace
(
Number
<
InBlockCopyDataPerRead
>
{});
constexpr
unsigned
wei_block_size
=
constexpr
unsigned
wei_block_size
=
wei_c
sr
k_block_desc
.
GetElementSpace
(
Number
<
WeiBlockCopyDataPerRead
>
{});
wei_c
yx
k_block_desc
.
GetElementSpace
(
Number
<
WeiBlockCopyDataPerRead
>
{});
constexpr
unsigned
max_align
=
InBlockCopyDataPerRead
>
WeiBlockCopyDataPerRead
constexpr
unsigned
max_align
=
InBlockCopyDataPerRead
>
WeiBlockCopyDataPerRead
?
InBlockCopyDataPerRead
?
InBlockCopyDataPerRead
...
@@ -188,11 +188,11 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
...
@@ -188,11 +188,11 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
0
,
hi_block_data_begin
,
wi_block_data_begin
,
n_block_data_begin
);
0
,
hi_block_data_begin
,
wi_block_data_begin
,
n_block_data_begin
);
const
Float
*
p_wei_global_block_begin
=
const
Float
*
p_wei_global_block_begin
=
p_wei_global
+
wei_c
sr
k_global_desc
.
Get1dIndex
(
0
,
0
,
0
,
k_block_data_begin
);
p_wei_global
+
wei_c
yx
k_global_desc
.
Get1dIndex
(
0
,
0
,
0
,
k_block_data_begin
);
for
(
unsigned
c_block_data_begin
=
0
;
c_block_data_begin
<
C
;
c_block_data_begin
+=
CPerBlock
,
for
(
unsigned
c_block_data_begin
=
0
;
c_block_data_begin
<
C
;
c_block_data_begin
+=
CPerBlock
,
p_in_global_block_begin
+=
CPerBlock
*
in_chwn_global_desc
.
GetStride
(
I0
),
p_in_global_block_begin
+=
CPerBlock
*
in_chwn_global_desc
.
GetStride
(
I0
),
p_wei_global_block_begin
+=
CPerBlock
*
wei_c
sr
k_global_desc
.
GetStride
(
I0
),
p_wei_global_block_begin
+=
CPerBlock
*
wei_c
yx
k_global_desc
.
GetStride
(
I0
),
__syncthreads
())
__syncthreads
())
{
{
// input: global mem to LDS
// input: global mem to LDS
...
@@ -204,12 +204,12 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
...
@@ -204,12 +204,12 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
__syncthreads
();
__syncthreads
();
// a series of batched GEMM
// a series of batched GEMM
for
(
unsigned
s
=
0
;
s
<
Y
;
++
s
)
for
(
unsigned
y
=
0
;
y
<
Y
;
++
y
)
{
{
for
(
unsigned
r
=
0
;
r
<
X
;
++
r
)
for
(
unsigned
x
=
0
;
x
<
X
;
++
x
)
{
{
blockwise_batch_gemm
.
Run
(
p_wei_block
+
wei_c
sr
k_block_desc
.
Get1dIndex
(
0
,
s
,
r
,
0
),
blockwise_batch_gemm
.
Run
(
p_wei_block
+
wei_c
yx
k_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
p_in_block
+
in_chwn_block_desc
.
Get1dIndex
(
0
,
s
,
r
,
0
),
p_in_block
+
in_chwn_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
p_out_thread
,
p_out_thread
,
[](
auto
&
acc
,
const
auto
&&
v
)
{
acc
+=
v
;
});
[](
auto
&
acc
,
const
auto
&&
v
)
{
acc
+=
v
;
});
}
}
...
...
src/include/gridwise_implicit_gemm_convolution_1_chwn_c
sr
k_khwn_padded.hip.hpp
→
src/include/gridwise_implicit_gemm_convolution_1_chwn_c
yx
k_khwn_padded.hip.hpp
View file @
ce0182ce
...
@@ -27,7 +27,7 @@ template <unsigned GridSize,
...
@@ -27,7 +27,7 @@ template <unsigned GridSize,
unsigned
WoPerThread
,
unsigned
WoPerThread
,
unsigned
WeiBlockCopyThreadPerDim0
,
unsigned
WeiBlockCopyThreadPerDim0
,
unsigned
WeiBlockCopyThreadPerDim1
>
unsigned
WeiBlockCopyThreadPerDim1
>
__global__
void
gridwise_implicit_gemm_convolution_1_chwn_c
sr
k_khwn_padded
(
__global__
void
gridwise_implicit_gemm_convolution_1_chwn_c
yx
k_khwn_padded
(
const
Float
*
const
__restrict__
p_in_global
,
const
Float
*
const
__restrict__
p_in_global
,
const
Float
*
const
__restrict__
p_wei_global
,
const
Float
*
const
__restrict__
p_wei_global
,
Float
*
const
__restrict__
p_out_global
)
Float
*
const
__restrict__
p_out_global
)
...
@@ -45,7 +45,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
...
@@ -45,7 +45,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
in_chwn_global_desc
=
InGlobalDesc
{};
constexpr
auto
in_chwn_global_desc
=
InGlobalDesc
{};
constexpr
auto
wei_c
sr
k_global_desc
=
WeiGlobalDesc
{};
constexpr
auto
wei_c
yx
k_global_desc
=
WeiGlobalDesc
{};
constexpr
auto
out_khwn_global_desc
=
OutGlobalDesc
{};
constexpr
auto
out_khwn_global_desc
=
OutGlobalDesc
{};
constexpr
unsigned
C
=
in_chwn_global_desc
.
GetLength
(
I0
);
constexpr
unsigned
C
=
in_chwn_global_desc
.
GetLength
(
I0
);
...
@@ -55,8 +55,8 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
...
@@ -55,8 +55,8 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
constexpr
unsigned
Wo
=
out_khwn_global_desc
.
GetLength
(
I2
);
constexpr
unsigned
Wo
=
out_khwn_global_desc
.
GetLength
(
I2
);
constexpr
unsigned
N
=
out_khwn_global_desc
.
GetLength
(
I3
);
constexpr
unsigned
N
=
out_khwn_global_desc
.
GetLength
(
I3
);
constexpr
unsigned
Y
=
wei_c
sr
k_global_desc
.
GetLength
(
I1
);
constexpr
unsigned
Y
=
wei_c
yx
k_global_desc
.
GetLength
(
I1
);
constexpr
unsigned
X
=
wei_c
sr
k_global_desc
.
GetLength
(
I2
);
constexpr
unsigned
X
=
wei_c
yx
k_global_desc
.
GetLength
(
I2
);
constexpr
unsigned
HPadLow
=
LowerPads
{}.
Get
(
I0
);
constexpr
unsigned
HPadLow
=
LowerPads
{}.
Get
(
I0
);
constexpr
unsigned
WPadLow
=
LowerPads
{}.
Get
(
I1
);
constexpr
unsigned
WPadLow
=
LowerPads
{}.
Get
(
I1
);
...
@@ -92,7 +92,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
...
@@ -92,7 +92,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
constexpr
auto
in_chwn_block_desc
=
constexpr
auto
in_chwn_block_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
CPerBlock
,
HiPerBlock
,
WiPerBlock
,
NPerBlock
>
{});
make_ConstantTensorDescriptor
(
Sequence
<
CPerBlock
,
HiPerBlock
,
WiPerBlock
,
NPerBlock
>
{});
constexpr
auto
wei_c
sr
k_block_desc
=
constexpr
auto
wei_c
yx
k_block_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
CPerBlock
,
Y
,
X
,
KPerBlock
>
{});
make_ConstantTensorDescriptor
(
Sequence
<
CPerBlock
,
Y
,
X
,
KPerBlock
>
{});
// flattened (2d) tensor view of wei in LDS
// flattened (2d) tensor view of wei in LDS
...
@@ -107,7 +107,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
...
@@ -107,7 +107,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
{
{
print_ConstantTensorDescriptor(in_chwn_block_desc, "in_chwn_block_desc");
print_ConstantTensorDescriptor(in_chwn_block_desc, "in_chwn_block_desc");
print_ConstantTensorDescriptor(wei_c
sr
k_block_desc, "wei_c
sr
k_block_desc");
print_ConstantTensorDescriptor(wei_c
yx
k_block_desc, "wei_c
yx
k_block_desc");
print_ConstantTensorDescriptor(out_hkwn_thread_desc, "out_hkwn_thread_desc");
print_ConstantTensorDescriptor(out_hkwn_thread_desc, "out_hkwn_thread_desc");
}
}
#endif
#endif
...
@@ -148,9 +148,9 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
...
@@ -148,9 +148,9 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
constexpr auto blockwise_wei_copy =
constexpr auto blockwise_wei_copy =
Blockwise4dTensorCopy1<BlockSize,
Blockwise4dTensorCopy1<BlockSize,
Float,
Float,
decltype(wei_c
sr
k_global_desc),
decltype(wei_c
yx
k_global_desc),
decltype(wei_c
sr
k_block_desc),
decltype(wei_c
yx
k_block_desc),
decltype(wei_c
sr
k_block_desc.GetLengths())>{};
decltype(wei_c
yx
k_block_desc.GetLengths())>{};
#elif
0
#elif
0
// weight: format is [C*Y*X,K]
// weight: format is [C*Y*X,K]
constexpr
auto
blockwise_wei_copy
=
constexpr
auto
blockwise_wei_copy
=
...
@@ -177,7 +177,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
...
@@ -177,7 +177,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
constexpr
auto
a_cxk_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
constexpr
auto
a_cxk_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{},
Number
<
wei_c
sr
k_block_desc
.
GetStride
(
I0
)
>
{});
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{},
Number
<
wei_c
yx
k_block_desc
.
GetStride
(
I0
)
>
{});
constexpr
auto
b_cxwn_block_mtx_desc
=
constexpr
auto
b_cxwn_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
...
@@ -205,7 +205,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
...
@@ -205,7 +205,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
// LDS
// LDS
constexpr
unsigned
in_block_size
=
in_chwn_block_desc
.
GetElementSpace
();
constexpr
unsigned
in_block_size
=
in_chwn_block_desc
.
GetElementSpace
();
constexpr
unsigned
wei_block_size
=
wei_c
sr
k_block_desc
.
GetElementSpace
();
constexpr
unsigned
wei_block_size
=
wei_c
yx
k_block_desc
.
GetElementSpace
();
__shared__
Float
p_in_block
[
in_block_size
];
__shared__
Float
p_in_block
[
in_block_size
];
__shared__
Float
p_wei_block
[
wei_block_size
];
__shared__
Float
p_wei_block
[
wei_block_size
];
...
@@ -245,14 +245,14 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
...
@@ -245,14 +245,14 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
__syncthreads
();
__syncthreads
();
// a series of batched GEMM
// a series of batched GEMM
for
(
unsigned
s
=
0
;
s
<
Y
;
++
s
)
for
(
unsigned
y
=
0
;
y
<
Y
;
++
y
)
{
{
for
(
unsigned
r
=
0
;
r
<
X
;
++
r
)
for
(
unsigned
x
=
0
;
x
<
X
;
++
x
)
{
{
auto
f_accum
=
[](
auto
&
acc
,
const
auto
&&
v
)
{
acc
+=
v
;
};
auto
f_accum
=
[](
auto
&
acc
,
const
auto
&&
v
)
{
acc
+=
v
;
};
blockwise_batch_gemm
.
Run
(
p_wei_block
+
wei_c
sr
k_block_desc
.
Get1dIndex
(
0
,
s
,
r
,
0
),
blockwise_batch_gemm
.
Run
(
p_wei_block
+
wei_c
yx
k_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
p_in_block
+
in_chwn_block_desc
.
Get1dIndex
(
0
,
s
,
r
,
0
),
p_in_block
+
in_chwn_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
p_out_thread
,
p_out_thread
,
f_accum
);
f_accum
);
}
}
...
...
src/include/gridwise_implicit_gemm_convolution_2_chwn_c
sr
k_khwn_lds_double_buffer.hip.hpp
→
src/include/gridwise_implicit_gemm_convolution_2_chwn_c
yx
k_khwn_lds_double_buffer.hip.hpp
View file @
ce0182ce
...
@@ -34,7 +34,7 @@ template <unsigned GridSize,
...
@@ -34,7 +34,7 @@ template <unsigned GridSize,
unsigned
WeiBlockCopyThreadPerDim1
,
unsigned
WeiBlockCopyThreadPerDim1
,
unsigned
InBlockCopyDataPerRead
,
unsigned
InBlockCopyDataPerRead
,
unsigned
WeiBlockCopyDataPerRead
>
unsigned
WeiBlockCopyDataPerRead
>
__global__
void
gridwise_implicit_gemm_convolution_2_chwn_c
sr
k_khwn_lds_double_buffer
(
__global__
void
gridwise_implicit_gemm_convolution_2_chwn_c
yx
k_khwn_lds_double_buffer
(
const
Float
*
const
__restrict__
p_in_global
,
const
Float
*
const
__restrict__
p_in_global
,
const
Float
*
const
__restrict__
p_wei_global
,
const
Float
*
const
__restrict__
p_wei_global
,
Float
*
const
__restrict__
p_out_global
)
Float
*
const
__restrict__
p_out_global
)
...
@@ -45,7 +45,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
...
@@ -45,7 +45,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
in_chwn_global_desc
=
InGlobalDesc
{};
constexpr
auto
in_chwn_global_desc
=
InGlobalDesc
{};
constexpr
auto
wei_c
sr
k_global_desc
=
WeiGlobalDesc
{};
constexpr
auto
wei_c
yx
k_global_desc
=
WeiGlobalDesc
{};
constexpr
auto
out_khwn_global_desc
=
OutGlobalDesc
{};
constexpr
auto
out_khwn_global_desc
=
OutGlobalDesc
{};
constexpr
unsigned
C
=
in_chwn_global_desc
.
GetLength
(
I0
);
constexpr
unsigned
C
=
in_chwn_global_desc
.
GetLength
(
I0
);
...
@@ -57,8 +57,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
...
@@ -57,8 +57,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
constexpr
unsigned
Ho
=
out_khwn_global_desc
.
GetLength
(
I1
);
constexpr
unsigned
Ho
=
out_khwn_global_desc
.
GetLength
(
I1
);
constexpr
unsigned
Wo
=
out_khwn_global_desc
.
GetLength
(
I2
);
constexpr
unsigned
Wo
=
out_khwn_global_desc
.
GetLength
(
I2
);
constexpr
unsigned
Y
=
wei_c
sr
k_global_desc
.
GetLength
(
I1
);
constexpr
unsigned
Y
=
wei_c
yx
k_global_desc
.
GetLength
(
I1
);
constexpr
unsigned
X
=
wei_c
sr
k_global_desc
.
GetLength
(
I2
);
constexpr
unsigned
X
=
wei_c
yx
k_global_desc
.
GetLength
(
I2
);
constexpr
unsigned
B
=
N
*
Hi
*
Wi
;
constexpr
unsigned
B
=
N
*
Hi
*
Wi
;
constexpr
unsigned
BGhostRead
=
(
Y
-
1
)
*
Wi
+
(
X
-
1
);
constexpr
unsigned
BGhostRead
=
(
Y
-
1
)
*
Wi
+
(
X
-
1
);
...
@@ -85,7 +85,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
...
@@ -85,7 +85,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
constexpr
auto
wei_ek_block_desc
=
make_ConstantTensorDescriptor_aligned
(
constexpr
auto
wei_ek_block_desc
=
make_ConstantTensorDescriptor_aligned
(
Sequence
<
CPerBlock
*
Y
*
X
,
KPerBlock
>
{},
Number
<
WeiBlockCopyDataPerRead
>
{});
Sequence
<
CPerBlock
*
Y
*
X
,
KPerBlock
>
{},
Number
<
WeiBlockCopyDataPerRead
>
{});
constexpr
auto
wei_c
sr
k_block_desc
=
make_ConstantTensorDescriptor_aligned
(
constexpr
auto
wei_c
yx
k_block_desc
=
make_ConstantTensorDescriptor_aligned
(
Sequence
<
CPerBlock
,
Y
,
X
,
KPerBlock
>
{},
Number
<
WeiBlockCopyDataPerRead
>
{});
Sequence
<
CPerBlock
,
Y
,
X
,
KPerBlock
>
{},
Number
<
WeiBlockCopyDataPerRead
>
{});
// tensor view of threadwise output in register
// tensor view of threadwise output in register
...
@@ -96,14 +96,14 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
...
@@ -96,14 +96,14 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
{
{
print_ConstantTensorDescriptor(in_chwn_global_desc, "in_chwn_global_desc");
print_ConstantTensorDescriptor(in_chwn_global_desc, "in_chwn_global_desc");
print_ConstantTensorDescriptor(wei_c
sr
k_global_desc, "wei_c
sr
k_global_desc");
print_ConstantTensorDescriptor(wei_c
yx
k_global_desc, "wei_c
yx
k_global_desc");
print_ConstantTensorDescriptor(out_khwn_global_desc, "out_khwn_global_desc");
print_ConstantTensorDescriptor(out_khwn_global_desc, "out_khwn_global_desc");
print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc");
print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc");
print_ConstantTensorDescriptor(wei_ek_global_desc, "wei_ek_global_desc");
print_ConstantTensorDescriptor(wei_ek_global_desc, "wei_ek_global_desc");
print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc");
print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc");
print_ConstantTensorDescriptor(wei_c
sr
k_block_desc, "wei_c
sr
k_block_desc");
print_ConstantTensorDescriptor(wei_c
yx
k_block_desc, "wei_c
yx
k_block_desc");
print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc");
print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc");
print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc");
print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc");
...
@@ -170,7 +170,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
...
@@ -170,7 +170,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
// b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
// b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
// c_mtx[K,B] is out_block[K,B]
// c_mtx[K,B] is out_block[K,B]
constexpr
auto
a_cxk_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
constexpr
auto
a_cxk_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{},
Number
<
wei_c
sr
k_block_desc
.
GetStride
(
I0
)
>
{});
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{},
Number
<
wei_c
yx
k_block_desc
.
GetStride
(
I0
)
>
{});
constexpr
auto
b_cxb_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
constexpr
auto
b_cxb_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
BPerBlock
>
{},
Number
<
in_cb_block_desc
.
GetStride
(
I0
)
>
{});
Number
<
CPerBlock
>
{},
Number
<
BPerBlock
>
{},
Number
<
in_cb_block_desc
.
GetStride
(
I0
)
>
{});
...
@@ -210,7 +210,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
...
@@ -210,7 +210,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
in_cb_block_desc
.
GetElementSpace
(
Number
<
InBlockCopyDataPerRead
>
{});
in_cb_block_desc
.
GetElementSpace
(
Number
<
InBlockCopyDataPerRead
>
{});
constexpr
unsigned
wei_block_size
=
constexpr
unsigned
wei_block_size
=
wei_c
sr
k_block_desc
.
GetElementSpace
(
Number
<
WeiBlockCopyDataPerRead
>
{});
wei_c
yx
k_block_desc
.
GetElementSpace
(
Number
<
WeiBlockCopyDataPerRead
>
{});
constexpr
unsigned
max_align
=
InBlockCopyDataPerRead
>
WeiBlockCopyDataPerRead
constexpr
unsigned
max_align
=
InBlockCopyDataPerRead
>
WeiBlockCopyDataPerRead
?
InBlockCopyDataPerRead
?
InBlockCopyDataPerRead
...
@@ -227,14 +227,14 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
...
@@ -227,14 +227,14 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
p_in_global
+
in_cb_global_desc
.
Get1dIndex
(
0
,
b_block_data_begin
);
p_in_global
+
in_cb_global_desc
.
Get1dIndex
(
0
,
b_block_data_begin
);
const
Float
*
p_wei_global_block_offset
=
const
Float
*
p_wei_global_block_offset
=
p_wei_global
+
wei_c
sr
k_global_desc
.
Get1dIndex
(
0
,
0
,
0
,
k_block_data_begin
);
p_wei_global
+
wei_c
yx
k_global_desc
.
Get1dIndex
(
0
,
0
,
0
,
k_block_data_begin
);
// preload data into LDS
// preload data into LDS
blockwise_in_copy
.
Run
(
p_in_global_block_offset
,
p_in_block_0
);
blockwise_in_copy
.
Run
(
p_in_global_block_offset
,
p_in_block_0
);
blockwise_wei_copy
.
Run
(
p_wei_global_block_offset
,
p_wei_block_0
);
blockwise_wei_copy
.
Run
(
p_wei_global_block_offset
,
p_wei_block_0
);
p_in_global_block_offset
+=
CPerBlock
*
in_cb_global_desc
.
GetStride
(
I0
);
p_in_global_block_offset
+=
CPerBlock
*
in_cb_global_desc
.
GetStride
(
I0
);
p_wei_global_block_offset
+=
CPerBlock
*
wei_c
sr
k_global_desc
.
GetStride
(
I0
);
p_wei_global_block_offset
+=
CPerBlock
*
wei_c
yx
k_global_desc
.
GetStride
(
I0
);
// register
// register
Float
p_out_thread
[
out_kb_thread_desc
.
GetElementSpace
()];
Float
p_out_thread
[
out_kb_thread_desc
.
GetElementSpace
()];
...
@@ -247,7 +247,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
...
@@ -247,7 +247,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
for
(
unsigned
c_block_data_begin
=
0
;
c_block_data_begin
+
CPerBlock
<
C
;
for
(
unsigned
c_block_data_begin
=
0
;
c_block_data_begin
+
CPerBlock
<
C
;
c_block_data_begin
+=
CPerBlock
,
c_block_data_begin
+=
CPerBlock
,
p_in_global_block_offset
+=
CPerBlock
*
in_cb_global_desc
.
GetStride
(
I0
),
p_in_global_block_offset
+=
CPerBlock
*
in_cb_global_desc
.
GetStride
(
I0
),
p_wei_global_block_offset
+=
CPerBlock
*
wei_c
sr
k_global_desc
.
GetStride
(
I0
),
p_wei_global_block_offset
+=
CPerBlock
*
wei_c
yx
k_global_desc
.
GetStride
(
I0
),
even_loop
=
!
even_loop
)
even_loop
=
!
even_loop
)
{
{
Float
*
p_in_block_now
=
even_loop
?
p_in_block_0
:
p_in_block_1
;
Float
*
p_in_block_now
=
even_loop
?
p_in_block_0
:
p_in_block_1
;
...
@@ -275,9 +275,9 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
...
@@ -275,9 +275,9 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
// compute on current data
// compute on current data
// a series of GEMM
// a series of GEMM
for
(
unsigned
s
=
0
;
s
<
Y
;
++
s
)
for
(
unsigned
y
=
0
;
y
<
Y
;
++
y
)
{
{
for
(
unsigned
r
=
0
;
r
<
X
;
++
r
)
for
(
unsigned
x
=
0
;
x
<
X
;
++
x
)
{
{
auto
f_accum
=
[](
auto
&
acc
,
const
auto
&&
v
)
{
acc
+=
v
;
};
auto
f_accum
=
[](
auto
&
acc
,
const
auto
&&
v
)
{
acc
+=
v
;
};
#if 1
#if 1
...
@@ -285,8 +285,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
...
@@ -285,8 +285,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
#else
#else
blockwise_gemm
.
Run_RegisterDoubleBuffer
blockwise_gemm
.
Run_RegisterDoubleBuffer
#endif
#endif
(
p_wei_block_now
+
wei_c
sr
k_block_desc
.
Get1dIndex
(
0
,
s
,
r
,
0
),
(
p_wei_block_now
+
wei_c
yx
k_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
p_in_block_now
+
s
*
Wi
+
r
,
p_in_block_now
+
y
*
Wi
+
x
,
p_out_thread
,
p_out_thread
,
f_accum
);
f_accum
);
}
}
...
@@ -305,9 +305,9 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
...
@@ -305,9 +305,9 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
__syncthreads
();
__syncthreads
();
for
(
unsigned
s
=
0
;
s
<
Y
;
++
s
)
for
(
unsigned
y
=
0
;
y
<
Y
;
++
y
)
{
{
for
(
unsigned
r
=
0
;
r
<
X
;
++
r
)
for
(
unsigned
x
=
0
;
x
<
X
;
++
x
)
{
{
auto
f_accum
=
[](
auto
&
acc
,
const
auto
&&
v
)
{
acc
+=
v
;
};
auto
f_accum
=
[](
auto
&
acc
,
const
auto
&&
v
)
{
acc
+=
v
;
};
#if 0
#if 0
...
@@ -315,8 +315,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
...
@@ -315,8 +315,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
#else
#else
blockwise_gemm
.
Run_RegisterDoubleBuffer
blockwise_gemm
.
Run_RegisterDoubleBuffer
#endif
#endif
(
p_wei_block_now
+
wei_c
sr
k_block_desc
.
Get1dIndex
(
0
,
s
,
r
,
0
),
(
p_wei_block_now
+
wei_c
yx
k_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
p_in_block_now
+
s
*
Wi
+
r
,
p_in_block_now
+
y
*
Wi
+
x
,
p_out_thread
,
p_out_thread
,
f_accum
);
f_accum
);
}
}
...
...
src/include/tensor.hpp
View file @
ce0182ce
...
@@ -8,16 +8,16 @@
...
@@ -8,16 +8,16 @@
#include <iostream>
#include <iostream>
template
<
class
Range
>
template
<
class
Range
>
std
::
ostream
&
LogRange
(
std
::
ostream
&
os
,
Range
&&
r
,
std
::
string
delim
)
std
::
ostream
&
LogRange
(
std
::
ostream
&
os
,
Range
&&
r
ange
,
std
::
string
delim
)
{
{
bool
first
=
true
;
bool
first
=
true
;
for
(
auto
&&
x
:
r
)
for
(
auto
&&
v
:
r
ange
)
{
{
if
(
first
)
if
(
first
)
first
=
false
;
first
=
false
;
else
else
os
<<
delim
;
os
<<
delim
;
os
<<
x
;
os
<<
v
;
}
}
return
os
;
return
os
;
}
}
...
...
src/include/threadwise_direct_convolution.hip.hpp
View file @
ce0182ce
...
@@ -38,16 +38,16 @@ __device__ void threadwise_direct_convolution_1(InDesc,
...
@@ -38,16 +38,16 @@ __device__ void threadwise_direct_convolution_1(InDesc,
{
{
for
(
unsigned
c
=
0
;
c
<
wei_desc
.
GetLength
(
I1
);
++
c
)
for
(
unsigned
c
=
0
;
c
<
wei_desc
.
GetLength
(
I1
);
++
c
)
{
{
for
(
unsigned
s
=
0
;
s
<
wei_desc
.
GetLength
(
I2
);
++
s
)
for
(
unsigned
y
=
0
;
y
<
wei_desc
.
GetLength
(
I2
);
++
y
)
{
{
for
(
unsigned
r
=
0
;
r
<
wei_desc
.
GetLength
(
I3
);
++
r
)
for
(
unsigned
x
=
0
;
x
<
wei_desc
.
GetLength
(
I3
);
++
x
)
{
{
const
unsigned
hi
=
ho
+
s
;
const
unsigned
hi
=
ho
+
y
;
const
unsigned
wi
=
wo
+
r
;
const
unsigned
wi
=
wo
+
x
;
const
unsigned
in_index
=
in_desc
.
Get1dIndex
(
n
,
c
,
hi
,
wi
);
const
unsigned
in_index
=
in_desc
.
Get1dIndex
(
n
,
c
,
hi
,
wi
);
const
unsigned
wei_index
=
wei_desc
.
Get1dIndex
(
k
,
c
,
s
,
r
);
const
unsigned
wei_index
=
wei_desc
.
Get1dIndex
(
k
,
c
,
y
,
x
);
const
unsigned
out_index
=
out_desc
.
Get1dIndex
(
n
,
k
,
ho
,
wo
);
const
unsigned
out_index
=
out_desc
.
Get1dIndex
(
n
,
k
,
ho
,
wo
);
...
@@ -153,18 +153,18 @@ __device__ void threadwise_direct_convolution_3(InDesc,
...
@@ -153,18 +153,18 @@ __device__ void threadwise_direct_convolution_3(InDesc,
#if 0
#if 0
// this verison reused old input data in register, and read new data from LDS
// this verison reused old input data in register, and read new data from LDS
// loop over vertical direction
// loop over vertical direction
for(unsigned
s
= 0;
s
< wei_desc.GetLength(I2); ++
s
)
for(unsigned
y
= 0;
y
< wei_desc.GetLength(I2); ++
y
)
{
{
// read first input
// read first input
threadwise_4d_tensor_copy(in_desc,
threadwise_4d_tensor_copy(in_desc,
p_in + in_desc.Get1dIndex(0, 0,
s
, 0),
p_in + in_desc.Get1dIndex(0, 0,
y
, 0),
in_reg_desc,
in_reg_desc,
p_in_reg,
p_in_reg,
in_reg_desc.GetLengths());
in_reg_desc.GetLengths());
// read first 1x1 weight
// read first 1x1 weight
threadwise_4d_tensor_copy(wei_desc,
threadwise_4d_tensor_copy(wei_desc,
p_wei + wei_desc.Get1dIndex(0, 0,
s
, 0),
p_wei + wei_desc.Get1dIndex(0, 0,
y
, 0),
wei_reg_desc,
wei_reg_desc,
p_wei_reg,
p_wei_reg,
wei_reg_desc.GetLengths());
wei_reg_desc.GetLengths());
...
@@ -174,11 +174,11 @@ __device__ void threadwise_direct_convolution_3(InDesc,
...
@@ -174,11 +174,11 @@ __device__ void threadwise_direct_convolution_3(InDesc,
in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
// loop over horizontal direction
// loop over horizontal direction
for(unsigned
r
= 1;
r
< wei_desc.GetLength(I3); ++
r
)
for(unsigned
x
= 1;
x
< wei_desc.GetLength(I3); ++
x
)
{
{
// read new weight
// read new weight
threadwise_4d_tensor_copy(wei_desc,
threadwise_4d_tensor_copy(wei_desc,
p_wei + wei_desc.Get1dIndex(0, 0,
s
,
r
),
p_wei + wei_desc.Get1dIndex(0, 0,
y
,
x
),
wei_reg_desc,
wei_reg_desc,
p_wei_reg,
p_wei_reg,
wei_reg_desc.GetLengths());
wei_reg_desc.GetLengths());
...
@@ -189,7 +189,7 @@ __device__ void threadwise_direct_convolution_3(InDesc,
...
@@ -189,7 +189,7 @@ __device__ void threadwise_direct_convolution_3(InDesc,
// read new input
// read new input
threadwise_4d_tensor_copy(
threadwise_4d_tensor_copy(
in_desc,
in_desc,
p_in + in_desc.Get1dIndex(0, 0,
s
,
r
+ in_reg_desc.GetLength(I3) - 1),
p_in + in_desc.Get1dIndex(0, 0,
y
,
x
+ in_reg_desc.GetLength(I3) - 1),
in_reg_desc,
in_reg_desc,
p_in_reg +
p_in_reg +
in_reg_desc.Get1dIndex(0, 0, 0, in_reg_desc.GetLength(I3) - in_w_new_read),
in_reg_desc.Get1dIndex(0, 0, 0, in_reg_desc.GetLength(I3) - in_w_new_read),
...
@@ -203,21 +203,21 @@ __device__ void threadwise_direct_convolution_3(InDesc,
...
@@ -203,21 +203,21 @@ __device__ void threadwise_direct_convolution_3(InDesc,
#elif
1
#elif
1
// this version read all input from LDS when filter moves
// this version read all input from LDS when filter moves
// loop over vertical direction
// loop over vertical direction
for
(
unsigned
s
=
0
;
s
<
wei_desc
.
GetLength
(
I2
);
++
s
)
for
(
unsigned
y
=
0
;
y
<
wei_desc
.
GetLength
(
I2
);
++
y
)
{
{
// loop over horizontal direction
// loop over horizontal direction
for
(
unsigned
r
=
0
;
r
<
wei_desc
.
GetLength
(
I3
);
++
r
)
for
(
unsigned
x
=
0
;
x
<
wei_desc
.
GetLength
(
I3
);
++
x
)
{
{
// read new weight
// read new weight
threadwise_4d_tensor_copy
(
wei_desc
,
threadwise_4d_tensor_copy
(
wei_desc
,
p_wei
+
wei_desc
.
Get1dIndex
(
0
,
0
,
s
,
r
),
p_wei
+
wei_desc
.
Get1dIndex
(
0
,
0
,
y
,
x
),
wei_reg_desc
,
wei_reg_desc
,
p_wei_reg
,
p_wei_reg
,
wei_reg_desc
.
GetLengths
());
wei_reg_desc
.
GetLengths
());
// read new input
// read new input
threadwise_4d_tensor_copy
(
in_desc
,
threadwise_4d_tensor_copy
(
in_desc
,
p_in
+
in_desc
.
Get1dIndex
(
0
,
0
,
s
,
r
),
p_in
+
in_desc
.
Get1dIndex
(
0
,
0
,
y
,
x
),
in_reg_desc
,
in_reg_desc
,
p_in_reg
,
p_in_reg
,
in_reg_desc
.
GetLengths
());
in_reg_desc
.
GetLengths
());
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment