yangql / composable_kernel-1 / Commits

Commit fdaaaa50, authored Mar 22, 2019 by Chao Liu

Merge branch 'direct_fp16'

Parents: 2c9b8c24, 18a81e35
Changes: 23
Showing 20 changed files with 1913 additions and 585 deletions (+1913 / -585)
driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp (+53, -29)
driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp (+211, -0)
driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp (+35, -2)
driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp (+38, -33)
driver/driver.hip.cpp (+134, -99)
src/include/Array.hip.hpp (+18, -0)
src/include/ConstantTensorDescriptor.hip.hpp (+49, -266)
src/include/Sequence.hip.hpp (+92, -0)
src/include/blockwise_2d_tensor_op.hip.hpp (+4, -3)
src/include/blockwise_4d_tensor_op.hip.hpp (+110, -14)
src/include/blockwise_gemm.hip.hpp (+104, -13)
src/include/common.hip.hpp (+5, -123)
src/include/config.h.in (+2, -1)
src/include/constant_integral.hip.hpp (+12, -0)
src/include/data_type.hip.hpp (+214, -0)
src/include/functional.hip.hpp (+57, -0)
src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp (+238, -0)
src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp (+254, -0)
src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp (+281, -0)
src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp (+2, -2)
driver/device_direct_convolution_2.hpp → driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp (renamed)

 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "gridwise_direct_convolution_2.hip.hpp"
+#include "gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp"

 template <class T, class InDesc, class WeiDesc, class OutDesc>
-void device_direct_convolution_2(InDesc,
+void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc,
                                  const Tensor<T>& in,
                                  WeiDesc,
                                  const Tensor<T>& wei,
                                  OutDesc,
                                  Tensor<T>& out,
                                  unsigned nrepeat)
 {
     std::size_t data_sz = sizeof(T);
     DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
...
@@ -46,6 +46,27 @@ void device_direct_convolution_2(InDesc,
     constexpr unsigned HoPerThread = 2;
     constexpr unsigned WoPerThread = 2;
+    constexpr unsigned InBlockCopyDataPerRead  = 2;
+    constexpr unsigned WeiBlockCopyDataPerRead = 4;
     constexpr unsigned BlockSize = 128;
+#elif 1
+    // 3x3, 34x34, 128 thread, fp16
+    constexpr unsigned NPerBlock  = 2;
+    constexpr unsigned KPerBlock  = 32;
+    constexpr unsigned CPerBlock  = 4;
+    constexpr unsigned HoPerBlock = 2;
+    constexpr unsigned WoPerBlock = 32;
+    constexpr unsigned NPerThread  = 2;
+    constexpr unsigned KPerThread  = 4;
+    constexpr unsigned CPerThread  = 2;
+    constexpr unsigned HoPerThread = 2;
+    constexpr unsigned WoPerThread = 2;
+    constexpr unsigned InBlockCopyDataPerRead  = 2;
+    constexpr unsigned WeiBlockCopyDataPerRead = 4;
+    constexpr unsigned BlockSize = 128;
 #endif
...
@@ -57,27 +78,30 @@ void device_direct_convolution_2(InDesc,
     for(unsigned i = 0; i < nrepeat; ++i)
     {
-        float time = launch_kernel(gridwise_direct_convolution_2<T, InDesc, WeiDesc, OutDesc,
-                                                                 NPerBlock, KPerBlock, CPerBlock,
-                                                                 HoPerBlock, WoPerBlock,
-                                                                 NPerThread, KPerThread, CPerThread,
-                                                                 HoPerThread, WoPerThread,
-                                                                 BlockSize, GridSize>,
+        float time = launch_kernel(gridwise_direct_convolution_2_nchw_kcyx_nkhw<T, InDesc, WeiDesc, OutDesc,
+                                                                                NPerBlock, KPerBlock, CPerBlock,
+                                                                                HoPerBlock, WoPerBlock,
+                                                                                NPerThread, KPerThread, CPerThread,
+                                                                                HoPerThread, WoPerThread,
+                                                                                InBlockCopyDataPerRead,
+                                                                                WeiBlockCopyDataPerRead,
+                                                                                BlockSize, GridSize>,
                                    dim3(GridSize),
                                    dim3(BlockSize),
                                    static_cast<T*>(in_device_buf.GetDeviceBuffer()),
                                    static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
                                    static_cast<T*>(out_device_buf.GetDeviceBuffer()));

         printf("Elapsed time : %f ms\n", time);
         usleep(std::min(time * 1000, float(10000)));
...
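The per-block tile sizes above determine how many workgroups these drivers launch: each block covers an NPerBlock x KPerBlock x HoPerBlock x WoPerBlock output tile, and GridSize is the product of the per-dimension tile counts (the same formula appears explicitly in the vectorized driver below). A minimal stand-alone sketch, with a hypothetical problem size chosen only for illustration:

    #include <cstdio>

    // Stand-alone sketch (not part of the commit): how the block-tile constants
    // translate into a launch grid. Problem sizes below are illustrative only.
    int main()
    {
        constexpr unsigned N = 64, K = 64, Ho = 32, Wo = 32; // assumed output sizes

        constexpr unsigned NPerBlock = 2, KPerBlock = 32, HoPerBlock = 2, WoPerBlock = 32;

        static_assert(N % NPerBlock == 0 && K % KPerBlock == 0 &&
                          Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0,
                      "tile sizes must divide the problem size");

        // Same formula the drivers use: one workgroup per output tile.
        constexpr unsigned GridSize =
            (N / NPerBlock) * (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock);

        std::printf("GridSize = %u\n", GridSize); // 32 * 2 * 16 * 1 = 1024
        return 0;
    }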
driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp (new file, mode 100644)

#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp"

template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
                                                           const Tensor<TInWei>& in_nchw,
                                                           WeiDesc,
                                                           const Tensor<TInWei>& wei_kcyx,
                                                           OutDesc,
                                                           Tensor<TOut>& out_nkhw,
                                                           unsigned nrepeat)
{
    // this suppose in / wei data type is int8x4
    constexpr unsigned NVector = 4;

    using accum_t      = int32_t;
    using vector_t     = vector_type<TInWei, NVector>;
    using vector_mem_t = typename vector_t::MemoryType;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto in_nchw_desc  = InDesc{};
    constexpr auto wei_kcyx_desc = WeiDesc{};
    constexpr auto out_nkhw_desc = OutDesc{};

    constexpr unsigned Hi = in_nchw_desc.GetLength(I2);
    constexpr unsigned Wi = in_nchw_desc.GetLength(I3);

    constexpr unsigned N  = out_nkhw_desc.GetLength(I0);
    constexpr unsigned Ho = out_nkhw_desc.GetLength(I2);
    constexpr unsigned Wo = out_nkhw_desc.GetLength(I3);

    constexpr unsigned K = wei_kcyx_desc.GetLength(I0);
    constexpr unsigned C = wei_kcyx_desc.GetLength(I1);
    constexpr unsigned Y = wei_kcyx_desc.GetLength(I2);
    constexpr unsigned X = wei_kcyx_desc.GetLength(I3);

    // vectorized input
    auto in_nchw_vec_desc = make_ConstantTensorDescriptor(Sequence<N, C / NVector, Hi, Wi>{});
    ostream_ConstantTensorDescriptor(in_nchw_vec_desc, std::cout << "in_nchw_vec_desc: ");
    Tensor<vector_mem_t> in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc));

    auto f_vectorized_nchw = [&](auto n, auto c, auto h, auto w) {
#if 0
        in_nchw_vec(n, c, h, w) = in_nchw(n, c, h, w);
#elif 0
        in_nchw_vec(n, c, h, w) =
            vector_t::Pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w));
#elif 1
        in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 4 * c, h, w),
                                                 in_nchw(n, 4 * c + 1, h, w),
                                                 in_nchw(n, 4 * c + 2, h, w),
                                                 in_nchw(n, 4 * c + 3, h, w));
#endif
    };

    make_ParallelTensorFunctor(f_vectorized_nchw, N, C / NVector, Hi, Wi)(
        std::thread::hardware_concurrency());

    // vectorize weight
    auto wei_kcyx_vec_desc = make_ConstantTensorDescriptor(Sequence<K, C / NVector, Y, X>{});
    ostream_ConstantTensorDescriptor(wei_kcyx_vec_desc, std::cout << "wei_kcyx_vec_desc: ");
    Tensor<vector_mem_t> wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc));

    auto f_vectorized_kcyx = [&](auto k, auto c, auto y, auto x) {
#if 0
        wei_kcyx_vec(k, c, y, x) = wei_kcyx(k, c, y, x);
#elif 0
        wei_kcyx_vec(k, c, y, x) =
            vector_t::Pack(wei_kcyx(k, 2 * c, y, x), wei_kcyx(k, 2 * c + 1, y, x));
#elif 1
        wei_kcyx_vec(k, c, y, x) = vector_t::Pack(wei_kcyx(k, 4 * c, y, x),
                                                  wei_kcyx(k, 4 * c + 1, y, x),
                                                  wei_kcyx(k, 4 * c + 2, y, x),
                                                  wei_kcyx(k, 4 * c + 3, y, x));
#endif
    };

    make_ParallelTensorFunctor(f_vectorized_kcyx, K, C / NVector, Y, X)(
        std::thread::hardware_concurrency());

    //
    DeviceMem in_nchw_vec_device_buf(sizeof(vector_mem_t) * in_nchw_vec.mDesc.GetElementSpace());
    DeviceMem wei_kcyx_vec_device_buf(sizeof(vector_mem_t) * wei_kcyx_vec.mDesc.GetElementSpace());
    DeviceMem out_nkhw_device_buf(sizeof(TOut) * out_nkhw.mDesc.GetElementSpace());

    in_nchw_vec_device_buf.ToDevice(in_nchw_vec.mData.data());
    wei_kcyx_vec_device_buf.ToDevice(wei_kcyx_vec.mData.data());
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

#if 0
    // 3x3, 34x34, 128 thread, fp32, vector = 1
    constexpr unsigned NPerBlock  = 2;
    constexpr unsigned KPerBlock  = 32;
    constexpr unsigned CPerBlock  = 4;
    constexpr unsigned HoPerBlock = 2;
    constexpr unsigned WoPerBlock = 32;
    constexpr unsigned NPerThread  = 2;
    constexpr unsigned KPerThread  = 4;
    constexpr unsigned CPerThread  = 2;
    constexpr unsigned HoPerThread = 2;
    constexpr unsigned WoPerThread = 2;
    constexpr unsigned InBlockCopyDataPerRead  = 2;
    constexpr unsigned WeiBlockCopyDataPerRead = 2;
    constexpr unsigned BlockSize = 128;
#elif 0
    // 3x3, 34x34, 128 thread, fp32, vector = 2
    constexpr unsigned NPerBlock  = 2;
    constexpr unsigned KPerBlock  = 32;
    constexpr unsigned CPerBlock  = 2;
    constexpr unsigned HoPerBlock = 2;
    constexpr unsigned WoPerBlock = 32;
    constexpr unsigned NPerThread  = 2;
    constexpr unsigned KPerThread  = 4;
    constexpr unsigned CPerThread  = 1;
    constexpr unsigned HoPerThread = 2;
    constexpr unsigned WoPerThread = 2;
    constexpr unsigned InBlockCopyDataPerRead  = 2;
    constexpr unsigned WeiBlockCopyDataPerRead = 2;
    constexpr unsigned BlockSize = 128;
#elif 0
    // 3x3, 34x34, 128 thread, int8, vector = 4
    constexpr unsigned NPerBlock  = 2;
    constexpr unsigned KPerBlock  = 32;
    constexpr unsigned CPerBlock  = 8;
    constexpr unsigned HoPerBlock = 4;
    constexpr unsigned WoPerBlock = 32;
    constexpr unsigned NPerThread  = 1;
    constexpr unsigned KPerThread  = 8;
    constexpr unsigned CPerThread  = 2;
    constexpr unsigned HoPerThread = 4;
    constexpr unsigned WoPerThread = 2;
    constexpr unsigned InBlockCopyDataPerRead  = 2;
    constexpr unsigned WeiBlockCopyDataPerRead = 2;
    constexpr unsigned BlockSize = 128;
#elif 1
    // 1x1, 32x32, 128 thread, int8, vector = 4
    constexpr unsigned NPerBlock  = 1;
    constexpr unsigned KPerBlock  = 64;
    constexpr unsigned CPerBlock  = 16;
    constexpr unsigned HoPerBlock = 4;
    constexpr unsigned WoPerBlock = 32;
    constexpr unsigned NPerThread  = 1;
    constexpr unsigned KPerThread  = 8;
    constexpr unsigned CPerThread  = 2;
    constexpr unsigned HoPerThread = 4;
    constexpr unsigned WoPerThread = 2;
    constexpr unsigned InBlockCopyDataPerRead  = 2;
    constexpr unsigned WeiBlockCopyDataPerRead = 2;
    constexpr unsigned BlockSize = 128;
#endif

    constexpr unsigned GridSize =
        (N / NPerBlock) * (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock);

    printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

    for(unsigned i = 0; i < nrepeat; ++i)
    {
        float time = launch_kernel(
            gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw<TInWei, TOut, accum_t,
                                                                    decltype(in_nchw_vec_desc),
                                                                    decltype(wei_kcyx_vec_desc),
                                                                    decltype(out_nkhw_desc),
                                                                    NVector,
                                                                    NPerBlock, KPerBlock, CPerBlock,
                                                                    HoPerBlock, WoPerBlock,
                                                                    NPerThread, KPerThread, CPerThread,
                                                                    HoPerThread, WoPerThread,
                                                                    InBlockCopyDataPerRead,
                                                                    WeiBlockCopyDataPerRead,
                                                                    BlockSize, GridSize>,
            dim3(GridSize),
            dim3(BlockSize),
            static_cast<TInWei*>(in_nchw_vec_device_buf.GetDeviceBuffer()),
            static_cast<TInWei*>(wei_kcyx_vec_device_buf.GetDeviceBuffer()),
            static_cast<TInWei*>(out_nkhw_device_buf.GetDeviceBuffer()));

        printf("Elapsed time : %f ms\n", time);
        usleep(std::min(time * 1000, float(10000)));
    }

    out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
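The new vectorized driver packs four consecutive input channels into one int8x4 element (NVector = 4) and accumulates in int32. The exact layout produced by vector_type<TInWei, 4>::Pack is defined elsewhere in the tree; the sketch below only illustrates the general idea of byte-lane packing and an int32 "dot4" accumulation, with a little-endian lane order assumed purely for illustration:

    #include <cstdint>
    #include <cstdio>

    // Pack four signed 8-bit values into one 32-bit word, least significant byte first.
    static uint32_t pack_int8x4(int8_t a, int8_t b, int8_t c, int8_t d)
    {
        return (uint32_t(uint8_t(a))      ) |
               (uint32_t(uint8_t(b)) <<  8) |
               (uint32_t(uint8_t(c)) << 16) |
               (uint32_t(uint8_t(d)) << 24);
    }

    // Reference "dot4" accumulate: multiply the four byte lanes of x and y and add
    // into acc, the per-element operation an int8x4 kernel performs in int32.
    static int32_t dot4_accumulate(uint32_t x, uint32_t y, int32_t acc)
    {
        for(int lane = 0; lane < 4; ++lane)
        {
            int8_t xv = int8_t((x >> (8 * lane)) & 0xFF);
            int8_t yv = int8_t((y >> (8 * lane)) & 0xFF);
            acc += int32_t(xv) * int32_t(yv);
        }
        return acc;
    }

    int main()
    {
        uint32_t x = pack_int8x4(1, -2, 3, -4);
        uint32_t y = pack_int8x4(5, 6, -7, 8);
        printf("dot4 = %d\n", dot4_accumulate(x, y, 0)); // 1*5 - 2*6 - 3*7 - 4*8 = -60
        return 0;
    }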
driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp

...
@@ -74,7 +74,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc,
     wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
     out_khwn_device_buf.ToDevice(out_khwn.mData.data());

-#if 1
+#if 0
     // for 3x3, 34x34
     constexpr unsigned NPerBlock = 16;
     constexpr unsigned KPerBlock = 64;
...
@@ -213,7 +213,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc,
     constexpr unsigned WoPerThread = 1;

     constexpr unsigned BlockSize = 128;
-#elif 1
+#elif 0
     // for 1x1, 28x28
     constexpr unsigned NPerBlock = 16;
     constexpr unsigned KPerBlock = 128;
...
@@ -245,6 +245,39 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc,
     constexpr unsigned OutThreadCopyDataPerWrite = 2;

     constexpr unsigned BlockSize = 128;
+#elif 1
+    // for 1x1, 14x14
+    constexpr unsigned NPerBlock  = 16;
+    constexpr unsigned KPerBlock  = 128;
+    constexpr unsigned CPerBlock  = 8;
+    constexpr unsigned HoPerBlock = 2;
+    constexpr unsigned WoPerBlock = 2;
+
+    constexpr unsigned NPerThread  = 4;
+    constexpr unsigned KPerThread  = 16;
+    constexpr unsigned CPerThread  = 1;
+    constexpr unsigned HoPerThread = 1;
+    constexpr unsigned WoPerThread = 1;
+
+    constexpr unsigned InBlockCopy_ThreadPerDimC = 8;
+    constexpr unsigned InBlockCopy_ThreadPerDimH = 2;
+    constexpr unsigned InBlockCopy_ThreadPerDimW = 2;
+    constexpr unsigned InBlockCopy_ThreadPerDimN = 4;
+    constexpr unsigned InBlockCopyDataPerRead    = 4;
+
+    constexpr unsigned WeiBlockCopyDataPerRead = 4;
+
+    constexpr unsigned GemmMPerThreadSubC = 4;
+    constexpr unsigned GemmNPerThreadSubC = 4;
+    constexpr unsigned GemmMLevel0Cluster = 4;
+    constexpr unsigned GemmNLevel0Cluster = 2;
+    constexpr unsigned GemmMLevel1Cluster = 2;
+    constexpr unsigned GemmNLevel1Cluster = 4;
+    constexpr unsigned GemmKPerThreadLoop = 1;
+
+    constexpr unsigned OutThreadCopyDataPerWrite = 2;
+
+    constexpr unsigned BlockSize = 128;
 #endif
...
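The new "1x1, 14x14" configuration adds quite a few knobs. Two products are worth sanity-checking against BlockSize: the in-block copy thread decomposition (ThreadPerDimC x H x W x N) and, assuming each thread handles an NPerThread x KPerThread x HoPerThread x WoPerThread sub-tile of the block tile, the compute decomposition. That per-thread-tile assumption is mine, not taken from the kernel itself; a hedged sketch of the check:

    #include <cstdio>

    // Sketch only: verifies two thread-count products for the "1x1, 14x14" config.
    // The assumption that BlockSize equals both products mirrors how the copy and
    // compute work is commonly partitioned; it is not quoted from the kernel.
    int main()
    {
        constexpr unsigned BlockSize = 128;

        // in-block copy decomposition: ThreadPerDimC * H * W * N
        constexpr unsigned copy_threads = 8 * 2 * 2 * 4;

        // block tile / per-thread tile: N, K, Ho, Wo
        constexpr unsigned compute_threads = (16 / 4) * (128 / 16) * (2 / 1) * (2 / 1);

        static_assert(copy_threads == BlockSize, "copy decomposition != BlockSize");
        static_assert(compute_threads == BlockSize, "compute decomposition != BlockSize");

        std::printf("both decompositions use %u threads\n", BlockSize);
        return 0;
    }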
driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp

 #pragma once
 #include <unistd.h>
 #include "device.hpp"
+#include "gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp"
 #include "gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp"

 template <class T, class InDesc, class WeiDesc, class OutDesc>
...
@@ -209,39 +210,43 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
     for(unsigned i = 0; i < nrepeat; ++i)
     {
-        float time =
-            launch_kernel(gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer<
-                              GridSize, BlockSize, T,
-                              decltype(in_chwn_desc), decltype(wei_cyxk_desc), decltype(out_khwn_desc),
-                              BPerBlock, KPerBlock, CPerBlock, BPerThread, KPerThread,
-                              GemmThreadPerColumnPerCluster, GemmThreadPerRowPerCluster,
-                              GemmMPerThreadSubC, GemmNPerThreadSubC,
-                              GemmMLevel0Cluster, GemmNLevel0Cluster,
-                              GemmMLevel1Cluster, GemmNLevel1Cluster, GemmKPerThreadLoop,
-                              InBlockCopyThreadPerDim0, InBlockCopyThreadPerDim1,
-                              WeiBlockCopyThreadPerDim0, WeiBlockCopyThreadPerDim1,
-                              InBlockCopyDataPerRead, WeiBlockCopyDataPerRead>,
-                          dim3(GridSize), dim3(BlockSize),
-                          static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
-                          static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
-                          static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
+        float time = launch_kernel(
+#if 0
+            gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn
+#else
+            gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer
+#endif
+            <GridSize, BlockSize, T,
+             decltype(in_chwn_desc), decltype(wei_cyxk_desc), decltype(out_khwn_desc),
+             BPerBlock, KPerBlock, CPerBlock, BPerThread, KPerThread,
+             GemmThreadPerColumnPerCluster, GemmThreadPerRowPerCluster,
+             GemmMPerThreadSubC, GemmNPerThreadSubC,
+             GemmMLevel0Cluster, GemmNLevel0Cluster,
+             GemmMLevel1Cluster, GemmNLevel1Cluster, GemmKPerThreadLoop,
+             InBlockCopyThreadPerDim0, InBlockCopyThreadPerDim1,
+             WeiBlockCopyThreadPerDim0, WeiBlockCopyThreadPerDim1,
+             InBlockCopyDataPerRead, WeiBlockCopyDataPerRead>,
+            dim3(GridSize), dim3(BlockSize),
+            static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
+            static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
+            static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));

         printf("Elapsed time : %f ms\n", time);
         usleep(std::min(time * 1000, float(10000)));
...
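The driver now switches between the plain kernel and the _lds_double_buffer variant at the launch site. The idea behind double buffering is to keep two LDS tiles alive so the loads for iteration i+1 can be in flight while iteration i is being consumed. A single-threaded toy analogue (plain C++, no LDS, hypothetical sizes) that shows the ping-pong structure:

    #include <cstdio>

    // Toy analogue of LDS double buffering: two staging buffers alternate roles,
    // so "load the next tile" and "compute on the current tile" are separate steps
    // in each iteration. Sizes are illustrative only.
    int main()
    {
        constexpr int KBlocks = 8, TileSize = 4;
        float global_a[KBlocks * TileSize];
        for(int i = 0; i < KBlocks * TileSize; ++i)
            global_a[i] = float(i);

        float lds[2][TileSize]; // the two "LDS" buffers
        float acc = 0.f;

        // prologue: fill buffer 0 with the first tile
        for(int t = 0; t < TileSize; ++t)
            lds[0][t] = global_a[t];

        for(int kb = 0; kb < KBlocks; ++kb)
        {
            const int cur = kb % 2, nxt = 1 - cur;

            // stage the next tile into the other buffer (if any tiles remain);
            // on a GPU these loads overlap with the compute below
            if(kb + 1 < KBlocks)
                for(int t = 0; t < TileSize; ++t)
                    lds[nxt][t] = global_a[(kb + 1) * TileSize + t];

            // compute on the current tile
            for(int t = 0; t < TileSize; ++t)
                acc += lds[cur][t];
        }

        std::printf("acc = %f\n", acc); // sum of 0..31 = 496
        return 0;
    }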
driver/driver.hip.cpp

...
@@ -7,10 +7,11 @@
 #include "tensor.hpp"
 #include "ConstantTensorDescriptor.hip.hpp"
 #include "conv_common.hip.hpp"
-#include "device_direct_convolution_1.hpp"
-#include "device_direct_convolution_2.hpp"
+//#include "device_direct_convolution_1.hpp"
+#include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp"
+//#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
 #include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp"
-#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
+// #include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
 #include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp"

 struct GeneratorTensor_1
...
@@ -34,25 +35,6 @@ struct GeneratorTensor_2
     }
 };

-struct GeneratorTensor_3
-{
-    template <class... Is>
-    double operator()(Is... is)
-    {
-#if 0
-        std::initializer_list<std::size_t> ls = {static_cast<std::size_t>(is)...};
-        return std::accumulate(ls.begin(), ls.end(), std::size_t(0));
-#elif 1
-        assert(sizeof...(Is) > 0);
-
-        std::initializer_list<std::size_t> ids = {static_cast<std::size_t>(is)...};
-        std::vector<std::size_t> lens(sizeof...(Is), 100);
-        std::vector<std::size_t> strides(sizeof...(Is), 1);
-
-        std::partial_sum(lens.rbegin(), lens.rbegin() + (sizeof...(Is) - 1), strides.rbegin() + 1);
-
-        return std::inner_product(ids.begin(), ids.end(), strides.begin(), std::size_t(0)) + 1;
-#endif
-    }
-};
-
 struct GeneratorTensor_Checkboard
 {
     template <class... Ts>
...
@@ -106,9 +88,12 @@ auto make_TensorDescriptor(TConstTensorDesc)
     return TensorDescriptor(lengths, strides);
 }

-template <class T, class LowerPads, class UpperPads>
-void host_direct_convolution(const Tensor<T>& in_nchw,
-                             const Tensor<T>& wei_kcyx,
-                             Tensor<T>& out,
-                             LowerPads,
-                             UpperPads)
+template <class TIn, class TWei, class TOut, class LowerPads, class UpperPads>
+void host_direct_convolution(const Tensor<TIn>& in_nchw,
+                             const Tensor<TWei>& wei_kcyx,
+                             Tensor<TOut>& out_nkhw,
+                             LowerPads,
+                             UpperPads)
 {
     unsigned h_pad_low = LowerPads{}.Get(Number<0>{});
     unsigned w_pad_low = LowerPads{}.Get(Number<1>{});
...
@@ -129,26 +114,29 @@ void host_direct_convolution(
                     if(hi >= 0 && hi < in_nchw.mDesc.GetLengths()[2] && wi >= 0 &&
                        wi < in_nchw.mDesc.GetLengths()[3])
                     {
-                        v += in_nchw(n, c, hi, wi) * wei_kcyx(k, c, y, x);
+                        v += double(in_nchw(n, c, hi, wi)) * double(wei_kcyx(k, c, y, x));
                     }
                 }
             }
         }
-        out(n, k, ho, wo) = v;
+        out_nkhw(n, k, ho, wo) = v;
     };

     auto f_par = make_ParallelTensorFunctor(f,
-                                            out.mDesc.GetLengths()[0],
-                                            out.mDesc.GetLengths()[1],
-                                            out.mDesc.GetLengths()[2],
-                                            out.mDesc.GetLengths()[3]);
+                                            out_nkhw.mDesc.GetLengths()[0],
+                                            out_nkhw.mDesc.GetLengths()[1],
+                                            out_nkhw.mDesc.GetLengths()[2],
+                                            out_nkhw.mDesc.GetLengths()[3]);

     f_par(std::thread::hardware_concurrency());
 }

-template <class T, class LowerPads, class UpperPads>
-void host_winograd_3x3_convolution(const Tensor<T>& in_nchw,
-                                   const Tensor<T>& wei_kcyx,
-                                   Tensor<T>& out,
-                                   LowerPads,
-                                   UpperPads)
+template <class TIn, class TWei, class TOut, class LowerPads, class UpperPads>
+void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
+                                   const Tensor<TWei>& wei_kcyx,
+                                   Tensor<TOut>& out_nkhw,
+                                   LowerPads,
+                                   UpperPads)
 {
     constexpr std::size_t HoPerTile = 2;
     constexpr std::size_t WoPerTile = 2;
...
@@ -162,8 +150,8 @@ void host_winograd_3x3_convolution(
     std::size_t Y = wei_kcyx.mDesc.GetLengths()[2];
     std::size_t X = wei_kcyx.mDesc.GetLengths()[3];

-    std::size_t HO = out.mDesc.GetLengths()[2];
-    std::size_t WO = out.mDesc.GetLengths()[3];
+    std::size_t HO = out_nkhw.mDesc.GetLengths()[2];
+    std::size_t WO = out_nkhw.mDesc.GetLengths()[3];

     unsigned h_pad_low = LowerPads{}.Get(Number<0>{});
     unsigned w_pad_low = LowerPads{}.Get(Number<1>{});
...
@@ -177,11 +165,11 @@ void host_winograd_3x3_convolution(
     std::size_t HTile = (HO + HoPerTile - 1) / HoPerTile;
     std::size_t WTile = (WO + WoPerTile - 1) / WoPerTile;

-    Tensor<T> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile});
-    Tensor<T> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile});
-    Tensor<T> wei_transform({K, C, HiPerTile, WiPerTile});
-    Tensor<T> out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile});
-    Tensor<T> out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile});
+    Tensor<double> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile});
+    Tensor<double> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile});
+    Tensor<double> wei_transform({K, C, HiPerTile, WiPerTile});
+    Tensor<double> out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile});
+    Tensor<double> out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile});

     auto f_in_hold = [&](auto n, auto c, auto htile, auto wtile) {
         for(int j = 0; j < HiPerTile; ++j)
...
@@ -198,7 +186,7 @@ void host_winograd_3x3_convolution(
                 }
                 else
                 {
-                    in_hold(n, c, htile, wtile, j, i) = T(0);
+                    in_hold(n, c, htile, wtile, j, i) = TIn(0);
                 }
             }
         }
...
@@ -259,49 +247,61 @@ void host_winograd_3x3_convolution(
     };

     auto f_wei_transform = [&](auto k, auto c) {
(in this hunk every wei_kcyx(...) read is wrapped in double(...); previously the raw T values were combined directly — the transform itself is unchanged:)
        wei_transform(k, c, 0, 0) = double(wei_kcyx(k, c, 0, 0));
        wei_transform(k, c, 0, 1) = 0.5 * double(wei_kcyx(k, c, 0, 0)) +
                                    0.5 * double(wei_kcyx(k, c, 0, 1)) +
                                    0.5 * double(wei_kcyx(k, c, 0, 2));
        wei_transform(k, c, 0, 2) = 0.5 * double(wei_kcyx(k, c, 0, 0)) -
                                    0.5 * double(wei_kcyx(k, c, 0, 1)) +
                                    0.5 * double(wei_kcyx(k, c, 0, 2));
        wei_transform(k, c, 0, 3) = double(wei_kcyx(k, c, 0, 2));

        wei_transform(k, c, 1, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) +
                                    0.5 * double(wei_kcyx(k, c, 1, 0)) +
                                    0.5 * double(wei_kcyx(k, c, 2, 0));
        wei_transform(k, c, 1, 1) = 0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) +
                                    0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) +
                                    0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) +
                                    0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) +
                                    0.25 * double(wei_kcyx(k, c, 2, 2));
        wei_transform(k, c, 1, 2) = 0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) +
                                    0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) -
                                    0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) +
                                    0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) +
                                    0.25 * double(wei_kcyx(k, c, 2, 2));
        wei_transform(k, c, 1, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) +
                                    0.5 * double(wei_kcyx(k, c, 1, 2)) +
                                    0.5 * double(wei_kcyx(k, c, 2, 2));

        wei_transform(k, c, 2, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) -
                                    0.5 * double(wei_kcyx(k, c, 1, 0)) +
                                    0.5 * double(wei_kcyx(k, c, 2, 0));
        wei_transform(k, c, 2, 1) = 0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) +
                                    0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) -
                                    0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) +
                                    0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) +
                                    0.25 * double(wei_kcyx(k, c, 2, 2));
        wei_transform(k, c, 2, 2) = 0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) +
                                    0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) +
                                    0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) +
                                    0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) +
                                    0.25 * double(wei_kcyx(k, c, 2, 2));
        wei_transform(k, c, 2, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) -
                                    0.5 * double(wei_kcyx(k, c, 1, 2)) +
                                    0.5 * double(wei_kcyx(k, c, 2, 2));

        wei_transform(k, c, 3, 0) = double(wei_kcyx(k, c, 2, 0));
        wei_transform(k, c, 3, 1) = 0.5 * double(wei_kcyx(k, c, 2, 0)) +
                                    0.5 * double(wei_kcyx(k, c, 2, 1)) +
                                    0.5 * double(wei_kcyx(k, c, 2, 2));
        wei_transform(k, c, 3, 2) = 0.5 * double(wei_kcyx(k, c, 2, 0)) -
                                    0.5 * double(wei_kcyx(k, c, 2, 1)) +
                                    0.5 * double(wei_kcyx(k, c, 2, 2));
        wei_transform(k, c, 3, 3) = double(wei_kcyx(k, c, 2, 2));
    };

    auto f_out_transform = [&](auto n, auto k, auto htile, auto wtile) {
...
@@ -354,7 +354,7 @@ void host_winograd_3x3_convolution(
             for(int i = 0; i < WoPerTile; ++i)
             {
                 std::size_t wo = WoPerTile * wtile + i;
-                out(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i);
+                out_nkhw(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i);
             }
         }
     };
...
@@ -372,20 +372,25 @@ void host_winograd_3x3_convolution(
 template <class T>
 void check_error(const Tensor<T>& ref, const Tensor<T>& result)
 {
+    // printf("\n");
     float error = 0;
     float max_diff = -1;
     float ref_value = 0, result_value = 0;
     for(int i = 0; i < ref.mData.size(); ++i)
     {
-        error += std::abs(ref.mData[i] - result.mData[i]);
-        float diff = std::abs(ref.mData[i] - result.mData[i]);
+        error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
+        float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
         if(max_diff < diff)
         {
             max_diff = diff;
             ref_value = ref.mData[i];
             result_value = result.mData[i];
         }
+        // printf("{%f, %f}", double(ref.mData[i]), double(result.mData[i]));
     }
+    // printf("\n");

     std::cout << "error: " << error << std::endl;
     std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
...
@@ -404,7 +409,7 @@ int main(int argc, char* argv[])
     constexpr unsigned HPad = 0;
     constexpr unsigned WPad = 0;
-#elif 1
+#elif 0
     // 3x3, 34x34
     constexpr unsigned N = 64;
     constexpr unsigned C = 256;
...
@@ -563,6 +568,30 @@ int main(int argc, char* argv[])
     constexpr unsigned HPad = 2;
     constexpr unsigned WPad = 2;
+#elif 0
+    // 1x1 filter, 32x32 image
+    constexpr unsigned N  = 64;
+    constexpr unsigned C  = 256;
+    constexpr unsigned HI = 32;
+    constexpr unsigned WI = 32;
+    constexpr unsigned K  = 512;
+    constexpr unsigned Y  = 1;
+    constexpr unsigned X  = 1;
+
+    constexpr unsigned HPad = 0;
+    constexpr unsigned WPad = 0;
+#elif 1
+    // 1x1 filter, 14x14 image
+    constexpr unsigned N  = 128;
+    constexpr unsigned C  = 2048;
+    constexpr unsigned HI = 14;
+    constexpr unsigned WI = 14;
+    constexpr unsigned K  = 512;
+    constexpr unsigned Y  = 1;
+    constexpr unsigned X  = 1;
+
+    constexpr unsigned HPad = 0;
+    constexpr unsigned WPad = 0;
 #endif

     auto lower_pads = Sequence<HPad, WPad>{};
...
@@ -577,10 +606,12 @@ int main(int argc, char* argv[])
     ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
     ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");

-    Tensor<float> in_nchw(make_TensorDescriptor(in_nchw_desc));
-    Tensor<float> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
-    Tensor<float> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
-    Tensor<float> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc));
+    using in_data_t  = float;
+    using out_data_t = float;
+
+    Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
+    Tensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
+    Tensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
+    Tensor<out_data_t> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc));

     std::size_t num_thread = std::thread::hardware_concurrency();
...
@@ -601,9 +632,13 @@ int main(int argc, char* argv[])
 #elif 1
     in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
     wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-#elif 1
-    in_nchw.GenerateTensorValue(GeneratorTensor_2{-2, 2}, num_thread);
-    wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+#elif 0
+    in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
+
+    auto gen_wei = [](auto... is) {
+        return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
+    };
+    wei_kcyx.GenerateTensorValue(gen_wei, num_thread);
 #endif
     }
...
@@ -611,7 +646,9 @@ int main(int argc, char* argv[])
 #if 0
     device_direct_convolution_1
 #elif 0
-    device_direct_convolution_2
+    device_direct_convolution_2_nchw_kcyx_nkhw
+#elif 0
+    device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
 #elif 1
     device_implicit_gemm_convolution_1_chwn_cyxk_khwn
 #elif 0
...
@@ -633,7 +670,6 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
-#if 1
         if(Y == 3 && X == 3)
         {
             host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads);
...
@@ -643,7 +679,6 @@ int main(int argc, char* argv[])
             host_direct_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads);
         }

         check_error(out_nkhw_host, out_nkhw_device);
-#endif

 #if 0
     LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
...
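host_winograd_3x3_convolution writes the F(2x2, 3x3) weight transform out long-hand; the sixteen expressions above are just U = G g Gᵀ with G = [[1,0,0],[0.5,0.5,0.5],[0.5,-0.5,0.5],[0,0,1]]. A small stand-alone check (not from the repository) that builds U with that matrix product and confirms, for example, that U[1][1] equals 0.25 times the sum of all nine weights, matching the expanded formula in the file:

    #include <cassert>
    #include <cmath>
    #include <cstdio>

    // Winograd F(2x2, 3x3) weight transform: U = G * g * G^T, g being the 3x3 kernel.
    static const double G[4][3] = {
        {1.0, 0.0, 0.0}, {0.5, 0.5, 0.5}, {0.5, -0.5, 0.5}, {0.0, 0.0, 1.0}};

    void weight_transform(const double g[3][3], double U[4][4])
    {
        double Gg[4][3] = {};
        for(int i = 0; i < 4; ++i)
            for(int j = 0; j < 3; ++j)
                for(int k = 0; k < 3; ++k)
                    Gg[i][j] += G[i][k] * g[k][j];

        for(int i = 0; i < 4; ++i)
            for(int j = 0; j < 4; ++j)
            {
                U[i][j] = 0.0;
                for(int k = 0; k < 3; ++k)
                    U[i][j] += Gg[i][k] * G[j][k]; // right-multiply by G^T
            }
    }

    int main()
    {
        double g[3][3] = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
        double U[4][4];
        weight_transform(g, U);

        double sum = 0.0;
        for(int i = 0; i < 3; ++i)
            for(int j = 0; j < 3; ++j)
                sum += g[i][j];

        // matches wei_transform(k, c, 1, 1) = 0.25 * (sum of all nine weights)
        assert(std::fabs(U[1][1] - 0.25 * sum) < 1e-12);
        std::printf("U[1][1] = %f, 0.25 * sum = %f\n", U[1][1], 0.25 * sum);
        return 0;
    }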
src/include/Array.hip.hpp (new file, mode 100644)

#pragma once

template <class TData, unsigned NSize>
struct Array
{
    using Type = Array<TData, NSize>;

    static constexpr unsigned nSize = NSize;

    unsigned mData[nSize];

    template <class... Xs>
    __host__ __device__ Array(Xs... xs) : mData{static_cast<TData>(xs)...}
    {
    }

    __host__ __device__ TData operator[](unsigned i) const { return mData[i]; }
};
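Array is a tiny fixed-size aggregate usable in both host and device code; the descriptor code further down uses it to turn a variadic multi-index into something indexable inside a loop. A host-only usage sketch (the __host__ __device__ qualifiers dropped, and the stride values invented for illustration):

    #include <cstdio>

    // Host-only analogue of the Array above, showing variadic construction and
    // read-only indexing, plus the stride-weighted fold that Get1dIndex performs.
    template <class TData, unsigned NSize>
    struct ArraySketch
    {
        unsigned mData[NSize];

        template <class... Xs>
        ArraySketch(Xs... xs) : mData{static_cast<TData>(xs)...} {}

        TData operator[](unsigned i) const { return mData[i]; }
    };

    int main()
    {
        const ArraySketch<unsigned, 4> multi_id(1, 2, 3, 4); // an NCHW multi-index
        const unsigned strides[4] = {600, 100, 10, 1};       // made-up packed strides

        unsigned offset = 0;
        for(unsigned d = 0; d < 4; ++d)
            offset += multi_id[d] * strides[d];

        std::printf("1d offset = %u\n", offset); // 600 + 200 + 30 + 4 = 834
        return 0;
    }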
src/include/ConstantTensorDescriptor.hip.hpp

...
@@ -65,8 +65,8 @@ __host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence<L0
 template <class Lengths, class Strides>
 struct ConstantTensorDescriptor
 {
+    using Type = ConstantTensorDescriptor<Lengths, Strides>;
+
     static constexpr unsigned nDim = Lengths::nDim;

-    using NDimConstant = Number<nDim>;
-
     __host__ __device__ constexpr ConstantTensorDescriptor()
     {
...
@@ -91,293 +91,70 @@ struct ConstantTensorDescriptor
         return Strides{}.Get(Number<I>{});
     }

(this hunk deletes the hand-unrolled "if (nDim == 2) ... else if (nDim == 8)" branches of GetElementSize() and GetElementSpace(), and the separate Get1dIndex(i0, ..., iN) overloads for 2d through 8d — each marked "this is ugly, only for Nd" — and replaces them with generic implementations built on static_const_reduce_n, static_loop_n and the new Array type:)

    // c++14 doesn't support constexpr lambdas, has to use this trick instead
    struct GetElementSize_f
    {
        template <class IDim>
        __host__ __device__ constexpr unsigned operator()(IDim idim) const
        {
            return Type{}.GetLength(idim);
        }
    };

    __host__ __device__ constexpr unsigned GetElementSize() const
    {
        // c++14 doesn't support constexpr lambdas, has to use this trick instead
        struct multiply
        {
            __host__ __device__ constexpr unsigned operator()(unsigned a, unsigned b) const
            {
                return a * b;
            }
        };

        return static_const_reduce_n<nDim>{}(GetElementSize_f{}, multiply{});
    }

    // c++14 doesn't support constexpr lambdas, has to use this trick instead
    struct GetElementSpace_f
    {
        template <class IDim>
        __host__ __device__ constexpr unsigned operator()(IDim idim) const
        {
            return (Type{}.GetLength(idim) - 1) * Type{}.GetStride(idim);
        }
    };

    template <class Align = Number<1>>
    __host__ __device__ constexpr unsigned GetElementSpace(Align align = Align{}) const
    {
        // c++14 doesn't support constexpr lambdas, has to use this trick instead
        struct add
        {
            __host__ __device__ constexpr unsigned operator()(unsigned a, unsigned b) const
            {
                return a + b;
            }
        };

        return static_const_reduce_n<nDim>{}(GetElementSpace_f{}, add{}) + align.Get();
    }

    template <class... Is>
    __host__ __device__ unsigned Get1dIndex(Is... is) const
    {
        static_assert(sizeof...(Is) == nDim, "number of multi-index is wrong");

        const auto multi_id = Array<unsigned, nDim>(is...);

        unsigned id = 0;

        static_loop_n<nDim>{}([&](auto IDim) {
            constexpr unsigned idim = IDim.Get();
            id += multi_id[idim] * GetStride(IDim);
        });

        return id;
    }

    __host__ __device__ constexpr auto Condense() const
...
@@ -385,6 +162,12 @@ struct ConstantTensorDescriptor
         constexpr auto default_strides = calculate_default_strides(Lengths{});
         return ConstantTensorDescriptor<Lengths, decltype(default_strides)>{};
     }
+
+    template <unsigned IDim, unsigned NVector>
+    __host__ __device__ constexpr auto Vectorize(Number<IDim>, Number<NVector>) const
+    {
+        assert(false); // not implemented
+    }
 };

 template <class Lengths>
...
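GetElementSize and GetElementSpace are now expressed as a compile-time reduction over the dimensions (static_const_reduce_n with a multiply or add functor) instead of one branch per rank. The repository's own helpers live in functional.hip.hpp; the sketch below only re-creates the idea with a plain constexpr recursion over hypothetical lengths and strides, so the two reductions can be seen side by side (the "+ 1" mirrors the default alignment term of Number<1>):

    #include <cstdio>

    // Sketch of reducing a per-dimension function over nDim dimensions at compile
    // time, in the spirit of static_const_reduce_n. Lengths/strides are made up.
    constexpr unsigned lengths[4] = {2, 3, 4, 5};
    constexpr unsigned strides[4] = {60, 20, 5, 1};

    // element size: product of lengths
    constexpr unsigned element_size(unsigned dim = 0)
    {
        return dim == 4 ? 1u : lengths[dim] * element_size(dim + 1);
    }

    // element space: sum of (length - 1) * stride, plus an alignment term of 1
    constexpr unsigned element_space(unsigned dim = 0)
    {
        return dim == 4 ? 1u : (lengths[dim] - 1) * strides[dim] + element_space(dim + 1);
    }

    static_assert(element_size() == 2 * 3 * 4 * 5, "product over dimensions");
    static_assert(element_space() == 60 + 40 + 15 + 4 + 1, "strided span");

    int main()
    {
        std::printf("size = %u, space = %u\n", element_size(), element_space());
        return 0;
    }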
src/include/Sequence.hip.hpp (new file, mode 100644)

#pragma once

#include "constant_integral.hip.hpp"
#include "functional.hip.hpp"

template <unsigned... Is>
struct Sequence
{
    using Type = Sequence<Is...>;

    static constexpr unsigned nDim = sizeof...(Is);

    const unsigned mData[nDim] = {Is...};

    template <unsigned I>
    __host__ __device__ constexpr unsigned Get(Number<I>) const
    {
        return mData[I];
    }

    // this is ugly, only for nDim = 4
    template <unsigned I0, unsigned I1, unsigned I2, unsigned I3>
    __host__ __device__ constexpr auto ReorderByGetNewFromOld(Sequence<I0, I1, I2, I3>) const
    {
        static_assert(nDim == 4, "nDim != 4");

        constexpr auto old_sequence = Type{};

        constexpr unsigned NR0 = old_sequence.mData[I0];
        constexpr unsigned NR1 = old_sequence.mData[I1];
        constexpr unsigned NR2 = old_sequence.mData[I2];
        constexpr unsigned NR3 = old_sequence.mData[I3];

        return Sequence<NR0, NR1, NR2, NR3>{};
    }

    template <unsigned I0, unsigned I1, unsigned I2, unsigned I3>
    __host__ __device__ constexpr auto ReorderByPutOldToNew(Sequence<I0, I1, I2, I3>) const
    {
        // don't know how to implement this
        printf("Sequence::ReorderByPutOldToNew not implemented");
        assert(false);
    }

    template <unsigned I>
    __host__ __device__ constexpr auto PushBack(Number<I>) const
    {
        return Sequence<Is..., I>{};
    }

    __host__ __device__ constexpr auto PopBack() const;

    template <class F>
    __host__ __device__ constexpr auto Transform(F f) const
    {
        return Sequence<f(Is)...>{};
    }
};

template <unsigned... Is, unsigned I>
__host__ __device__ constexpr auto sequence_pop_back(Sequence<Is..., I>)
{
    static_assert(sizeof...(Is) >= 1, "empty Sequence!");
    return Sequence<Is...>{};
}

template <class F, unsigned... Xs, unsigned... Ys>
__host__ __device__ constexpr auto sequence_sequence_op(Sequence<Xs...>, Sequence<Ys...>, F f)
{
    static_assert(Sequence<Xs...>::nDim == Sequence<Ys...>::nDim, "Dim not the same");

    return Sequence<f(Xs, Ys)...>{};
}

template <unsigned... Xs, unsigned... Ys>
__host__ __device__ constexpr auto sequence_sequence_add(Sequence<Xs...>, Sequence<Ys...>)
{
    struct add
    {
        __host__ __device__ constexpr unsigned operator()(unsigned x, unsigned y) const
        {
            return x + y;
        }
    };

    return sequence_sequence_op(Sequence<Xs...>{}, Sequence<Ys...>{}, add{});
}

template <unsigned... Is>
__host__ __device__ constexpr auto Sequence<Is...>::PopBack() const
{
    return sequence_pop_back(Type{});
}
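Sequence carries a pack of unsigned values purely in the type, so element-wise operations like sequence_sequence_add can be done entirely at compile time. A stand-alone sketch of the same pattern built on std::integer_sequence (hypothetical values, C++14):

    #include <cstdio>
    #include <type_traits>
    #include <utility>

    // Element-wise add of two compile-time unsigned packs, mirroring
    // sequence_sequence_add above but expressed with std::integer_sequence.
    template <unsigned... Xs, unsigned... Ys>
    constexpr auto seq_add(std::integer_sequence<unsigned, Xs...>,
                           std::integer_sequence<unsigned, Ys...>)
    {
        static_assert(sizeof...(Xs) == sizeof...(Ys), "Dim not the same");
        return std::integer_sequence<unsigned, (Xs + Ys)...>{};
    }

    int main()
    {
        auto sum = seq_add(std::integer_sequence<unsigned, 1, 2, 3, 4>{},
                           std::integer_sequence<unsigned, 10, 20, 30, 40>{});

        static_assert(std::is_same<decltype(sum),
                                   std::integer_sequence<unsigned, 11, 22, 33, 44>>::value,
                      "element-wise sum is computed entirely in the type");

        std::printf("nDim = %zu\n", decltype(sum)::size());
        return 0;
    }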
src/include/blockwise_2d_tensor_op.hip.hpp

...
@@ -373,7 +373,7 @@ template <unsigned BlockSize,
           unsigned DataPerRead>
 struct Blockwise2dTensorCopy3
 {
-    using vector_t = typename vector_type<Float, DataPerRead>::type;
+    using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;

     unsigned mSrcMyThreadOffset;
     unsigned mDstMyThreadOffset;
...
@@ -383,8 +383,9 @@ struct Blockwise2dTensorCopy3
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};

-    static_assert(SrcDesc{}.GetStride(I1) == 1 && DstDesc{}.GetStride(I1) == 1,
-                  "wrong! only support stride1 == 1!\n");
+    static_assert(DataPerRead == 1 ||
+                      (SrcDesc{}.GetStride(I1) == 1 && DstDesc{}.GetStride(I1) == 1),
+                  "wrong! only support stride1 == 1 if DataPerRead > 1!\n");

     static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
                   "wrong! only support DataPerRead == 1, 2 or 4!\n");
...
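The relaxed assertion means a unit stride in the fastest dimension is only demanded when reads are actually vectorized (DataPerRead > 1). The same rule written as a small constexpr predicate, purely for illustration:

    // Illustration only: the legality rule the relaxed static_assert now encodes.
    constexpr bool copy_params_ok(unsigned src_stride1, unsigned dst_stride1, unsigned data_per_read)
    {
        return (data_per_read == 1 || data_per_read == 2 || data_per_read == 4) &&
               (data_per_read == 1 || (src_stride1 == 1 && dst_stride1 == 1));
    }

    static_assert(copy_params_ok(7, 9, 1), "scalar copies allow any stride");
    static_assert(copy_params_ok(1, 1, 4), "vector copies need unit stride");
    static_assert(!copy_params_ok(2, 1, 2), "vectorized copy with non-unit stride is rejected");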
src/include/blockwise_4d_tensor_op.hip.hpp
View file @
fdaaaa50
...
@@ -131,11 +131,11 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
         did[3] = is / ref_desc.GetStride(I3);

-        const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
+        const unsigned src_index = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);

-        const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
+        const unsigned dst_index = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);

-        f(p_src[aindex], p_dst[bindex]);
+        f(p_src[src_index], p_dst[dst_index]);
     }

     constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);
...
@@ -162,11 +162,11 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
             did[3] = is / ref_desc.GetStride(I3);

-            const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
+            const unsigned src_index = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);

-            const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
+            const unsigned dst_index = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);

-            f(p_src[aindex], p_dst[bindex]);
+            f(p_src[src_index], p_dst[dst_index]);
         }
     }
 }
...
@@ -199,15 +199,110 @@ blockwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
         SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, DstFromSrcReorder{}, f_copy);
 }

-template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
+template <unsigned BlockSize,
+          class Float,
+          class SrcDesc,
+          class DstDesc,
+          class CopyLengths,
+          unsigned DataPerRead>
 struct Blockwise4dTensorCopy1
 {
+    using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
+
+    __device__ constexpr Blockwise4dTensorCopy1()
+    {
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+        constexpr auto I2 = Number<2>{};
+        constexpr auto I3 = Number<3>{};
+
+        static_assert(DataPerRead == 1 ||
+                          (SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1),
+                      "wrong! only support stride3 == 1 if DataPerRead > 1!\n");
+
+        static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
+                      "wrong! only support DataPerRead == 1, 2 or 4!\n");
+
+        static_assert(SrcDesc{}.GetStride(I2) % DataPerRead == 0 &&
+                          DstDesc{}.GetStride(I2) % DataPerRead == 0,
+                      "src and dst stride2 should be multiple of DataPerRead to keep alignment");
+
+        // we allow out-of-bound read from src in D3 dimension,
+        // but we need to make sure dst stride2 is big enough,
+        // so that the out-of-bound write won't contaminate next line in dst
+        constexpr unsigned L3          = CopyLengths{}.Get(I3);
+        constexpr unsigned read_per_d3 = integer_divide_ceil(L3, DataPerRead);
+
+        static_assert(read_per_d3 * DataPerRead <= DstDesc{}.GetStride(I2),
+                      "wrong! out-of-bound write will contaminate next line!\n");
+    }
+
     __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
     {
-        constexpr auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{};
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+        constexpr auto I2 = Number<2>{};
+        constexpr auto I3 = Number<3>{};
+
+        constexpr auto src_desc = SrcDesc{};
+        constexpr auto dst_desc = DstDesc{};
+
+        constexpr unsigned L0 = CopyLengths{}.Get(I0);
+        constexpr unsigned L1 = CopyLengths{}.Get(I1);
+        constexpr unsigned L2 = CopyLengths{}.Get(I2);
+        constexpr unsigned L3 = CopyLengths{}.Get(I3);
+
+        constexpr unsigned read_per_d3 = integer_divide_ceil(L3, DataPerRead);

-        blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
-            SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, dst_from_src_reorder);
+        constexpr auto ref_desc =
+            make_ConstantTensorDescriptor(Sequence<L0, L1, L2, read_per_d3>{});
+
+        constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;
+
+        auto f_copy = [&](unsigned is) {
+            unsigned did[4];
+
+            did[0] = is / ref_desc.GetStride(I0);
+            is -= did[0] * ref_desc.GetStride(I0);
+            did[1] = is / ref_desc.GetStride(I1);
+            is -= did[1] * ref_desc.GetStride(I1);
+            did[2] = is / ref_desc.GetStride(I2);
+            is -= did[2] * ref_desc.GetStride(I2);
+            did[3] = is / ref_desc.GetStride(I3);
+
+            const unsigned src_index =
+                src_desc.Get1dIndex(did[0], did[1], did[2], did[3] * DataPerRead);
+            const unsigned dst_index =
+                dst_desc.Get1dIndex(did[0], did[1], did[2], did[3] * DataPerRead);
+
+            *(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
+                *(reinterpret_cast<const vector_t*>(p_src + src_index));
+        };
+
+        for(unsigned iloop = 0; iloop < NLoop; ++iloop)
+        {
+            unsigned is = threadIdx.x + iloop * BlockSize;
+            f_copy(is);
+        }
+
+        constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);
+
+        if(has_tail)
+        {
+            unsigned is = threadIdx.x + NLoop * BlockSize;
+            if(is < ref_desc.GetElementSize())
+            {
+                f_copy(is);
+            }
+        }
     }
 };
...
@@ -350,7 +445,7 @@ template <unsigned BlockSize,
           unsigned DataPerRead>
 struct Blockwise4dTensorCopy3
 {
-    using vector_t = typename vector_type<Float, DataPerRead>::type;
+    using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;

     unsigned mSrcMyThreadOffset;
     unsigned mDstMyThreadOffset;
...
@@ -362,8 +457,9 @@ struct Blockwise4dTensorCopy3
     constexpr auto I2 = Number<2>{};
     constexpr auto I3 = Number<3>{};

-    static_assert(SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1,
-                  "wrong! only support stride3 == 1!\n");
+    static_assert(DataPerRead == 1 ||
+                      (SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1),
+                  "wrong! only support stride3 == 1 if DataPerRead > 1!\n");

     static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
                   "wrong! only support DataPerRead == 1, 2 or 4!\n");
...
@@ -371,7 +467,7 @@ struct Blockwise4dTensorCopy3
     static_assert(SrcDesc{}.GetStride(I2) % DataPerRead == 0 &&
                       DstDesc{}.GetStride(I2) % DataPerRead == 0,
-                  "wrong! src and dst stride should be multiple of DataPerRead to keep alignment");
+                  "wrong! src and dst stride2 should be multiple of DataPerRead to keep alignment");

     constexpr unsigned L0 = CopyLengths{}.Get(I0);
     constexpr unsigned L1 = CopyLengths{}.Get(I1);
...
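The new constructor's static_asserts encode a simple sizing rule: a row of L3 scalars is read as ceil(L3 / DataPerRead) vector loads, so the destination stride in dimension 2 must cover that rounded-up width. A small numeric sketch of the same arithmetic (not part of the commit; the helper name here is made up for illustration):

    constexpr unsigned divide_ceil_example(unsigned a, unsigned b) { return (a + b - 1) / b; }

    // L3 = 7 scalars per row copied with DataPerRead = 4 issues 2 vector reads,
    // touching 8 scalars, so dst stride2 must be >= 8 for the spill to stay harmless.
    static_assert(divide_ceil_example(7, 4) * 4 == 8, "rounded-up row width covered by stride2");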
src/include/blockwise_gemm.hip.hpp
View file @
fdaaaa50
...
@@ -435,11 +435,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
 #pragma unroll
     for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop)
     {
         // read first batch of A, B
         // copy A-sub to form A
-#pragma unroll
+        // #pragma unroll
         for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
         {
+#if 0
             threadwise_matrix_copy(
                 a_block_mtx,
                 p_a_block + a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
...
@@ -447,12 +448,25 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                 a_thread_mtx,
                 p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
                 a_thread_sub_mtx.GetLengths());
+#else
+            for(unsigned i = 0; i < a_thread_mtx.NRow(); ++i)
+            {
+                for(unsigned j = 0; j < a_thread_mtx.NCol(); ++j)
+                {
+                    p_a_thread[a_thread_mtx.Get1dIndex(i, m_repeat * MPerThreadSubC + j)] =
+                        p_a_block[a_block_mtx.Get1dIndex(k_begin + i,
+                                                         m_repeat * MPerLevel1Cluster + j) +
+                                  mMyThreadOffsetA];
+                }
+            }
+#endif
         }

         // copy B-sub to form B
-#pragma unroll
+        // #pragma unroll
         for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
         {
+#if 0
             threadwise_matrix_copy(
                 b_block_mtx,
                 p_b_block + b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
...
@@ -460,13 +474,26 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                 b_thread_mtx,
                 p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
                 b_thread_sub_mtx.GetLengths());
+#else
+            for(unsigned i = 0; i < b_thread_mtx.NRow(); ++i)
+            {
+                for(unsigned j = 0; j < b_thread_mtx.NCol(); ++j)
+                {
+                    p_b_thread[b_thread_mtx.Get1dIndex(i, n_repeat * NPerThreadSubC + j)] =
+                        p_b_block[b_block_mtx.Get1dIndex(k_begin + i,
+                                                         n_repeat * MPerLevel1Cluster + j) +
+                                  mMyThreadOffsetB];
+                }
+            }
+#endif
         }

         // loop over batch
-#pragma unroll
+        // #pragma unroll
         for(unsigned ib = 0; ib + 1 < BatchPerThread; ++ib)
         {
             // do current batch of gemm
+#if 0
             threadwise_gemm(a_thread_mtx,
                             True,
                             p_a_thread,
...
@@ -477,13 +504,32 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                             False,
                             p_c_thread + ib * ThreadMatrixStrideC,
                             f_accum);
+#else
+            for(unsigned k = 0; k < a_thread_mtx.NRow(); ++k)
+            {
+                for(unsigned i = 0; i < c_thread_mtx.NRow(); ++i)
+                {
+                    for(unsigned j = 0; j < c_thread_mtx.NCol(); ++j)
+                    {
+                        const unsigned aindex = a_thread_mtx.Get1dIndex(k, i); // A is transposed
+                        const unsigned bindex = b_thread_mtx.Get1dIndex(k, j);
+                        const unsigned cindex =
+                            c_thread_mtx.Get1dIndex(i, j) + ib * ThreadMatrixStrideC;
+
+                        f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]);
+                    }
+                }
+            }
+#endif

             // read next batch of a, b
             if(BlockMatrixStrideA != 0)
             {
-#pragma unroll
+                // #pragma unroll
                 for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
                 {
+#if 0
                     threadwise_matrix_copy(
                         a_block_mtx,
                         p_a_block +
...
@@ -492,14 +538,28 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                         a_thread_mtx,
                         p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
                         a_thread_sub_mtx.GetLengths());
+#else
+                    for(unsigned i = 0; i < a_thread_mtx.NRow(); ++i)
+                    {
+                        for(unsigned j = 0; j < a_thread_mtx.NCol(); ++j)
+                        {
+                            p_a_thread[a_thread_mtx.Get1dIndex(i, m_repeat * MPerThreadSubC + j)] =
+                                p_a_block[a_block_mtx.Get1dIndex(k_begin + i,
+                                                                 m_repeat * MPerLevel1Cluster + j) +
+                                          (ib + 1) * BlockMatrixStrideA + mMyThreadOffsetA];
+                        }
+                    }
+#endif
                 }
             }

             if(BlockMatrixStrideB != 0)
             {
-#pragma unroll
+                // #pragma unroll
                 for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
                 {
+#if 0
                     threadwise_matrix_copy(
                         b_block_mtx,
                         p_b_block +
...
@@ -508,11 +568,25 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                         b_thread_mtx,
                         p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
                         b_thread_sub_mtx.GetLengths());
+#else
+                    for(unsigned i = 0; i < b_thread_mtx.NRow(); ++i)
+                    {
+                        for(unsigned j = 0; j < b_thread_mtx.NCol(); ++j)
+                        {
+                            p_b_thread[b_thread_mtx.Get1dIndex(i, n_repeat * NPerThreadSubC + j)] =
+                                p_b_block[b_block_mtx.Get1dIndex(k_begin + i,
+                                                                 n_repeat * MPerLevel1Cluster + j) +
+                                          (ib + 1) * BlockMatrixStrideB + mMyThreadOffsetB];
+                        }
+                    }
+#endif
                 }
             }
         }

         // do last batch of gemm
+#if 0
         threadwise_gemm(a_thread_mtx,
                         True,
                         p_a_thread,
...
@@ -523,6 +597,23 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                         False,
                         p_c_thread + (BatchPerThread - 1) * ThreadMatrixStrideC,
                         f_accum);
+#else
+        for(unsigned k = 0; k < a_thread_mtx.NRow(); ++k)
+        {
+            for(unsigned i = 0; i < c_thread_mtx.NRow(); ++i)
+            {
+                for(unsigned j = 0; j < c_thread_mtx.NCol(); ++j)
+                {
+                    const unsigned aindex = a_thread_mtx.Get1dIndex(k, i); // A is transposed
+                    const unsigned bindex = b_thread_mtx.Get1dIndex(k, j);
+                    const unsigned cindex = c_thread_mtx.Get1dIndex(i, j) +
+                                            (BatchPerThread - 1) * ThreadMatrixStrideC;
+
+                    f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]);
+                }
+            }
+        }
+#endif
     }
 }
...
@@ -551,9 +642,9 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
         c_thread_mtx_begin.batch * BlockMatrixStrideC +
         c_block_mtx.Get1dIndex(c_thread_mtx_begin.row, c_thread_mtx_begin.col);

-    for(unsigned m_repeat = 0; m_repeat, MRepeat; ++m_repeat)
+    for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
     {
-        for(unsigned n_repeat = 0; n_repeat, NRepeat; ++n_repeat)
+        for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
         {
             threadwise_matrix_copy(
                 c_thread_sub_mtx,
...
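The #else branches above spell out what threadwise_matrix_copy and threadwise_gemm were doing as plain register loops. A standalone sketch of the accumulation pattern they implement (illustrative only; the function and template names here are assumptions, not code from the commit):

    // c[M x N] += transpose(a)[M x K] * b[K x N], with a stored K x M row-major,
    // matching the "A is transposed" indexing in the loops above
    template <unsigned K, unsigned M, unsigned N>
    __device__ void thread_gemm_transa(const float* a, const float* b, float* c)
    {
        for(unsigned k = 0; k < K; ++k)
            for(unsigned i = 0; i < M; ++i)
                for(unsigned j = 0; j < N; ++j)
                    c[i * N + j] += a[k * M + i] * b[k * N + j];
    }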
src/include/common.hip.hpp
View file @
fdaaaa50
 #pragma once

+#include "data_type.hip.hpp"
+#include "constant_integral.hip.hpp"
+#include "Sequence.hip.hpp"
+#include "Array.hip.hpp"
+#include "functional.hip.hpp"
+
 __device__ unsigned get_thread_local_1d_id() { return threadIdx.x; }
...
@@ -16,129 +21,6 @@ struct is_same<T, T>
     static const bool value = true;
 };

-template <class T, unsigned N>
-struct vector_type
-{
-};
-
-template <>
-struct vector_type<float, 1>
-{
-    using type = float;
-};
-
-template <>
-struct vector_type<float, 2>
-{
-    using type = float2;
-};
-
-template <>
-struct vector_type<float, 4>
-{
-    using type = float4;
-};
-
-#if 0
-template <>
-struct vector_type<half_float::half, 1>
-{
-    using type = half_float::half;
-};
-
-template <>
-struct vector_type<half_float::half, 2>
-{
-    using type = float;
-};
-
-template <>
-struct vector_type<half_float::half, 4>
-{
-    using type = float2;
-};
-
-template <>
-struct vector_type<half_float::half, 8>
-{
-    using type = float4;
-};
-#endif
-
-#if 0
-template <>
-struct vector_type<half, 1>
-{
-    using type = half;
-};
-
-template <>
-struct vector_type<half, 2>
-{
-    using type = half2;
-};
-
-template <>
-struct vector_type<half, 4>
-{
-    using type = float2;
-};
-
-template <>
-struct vector_type<half, 8>
-{
-    using type = float4;
-};
-#endif
-
-template <class T, T N>
-struct integral_constant
-{
-    static const T value = N;
-
-    __host__ __device__ constexpr T Get() const { return value; }
-};
-
-template <unsigned N>
-using Number = integral_constant<unsigned, N>;
-
-template <unsigned... Is>
-struct Sequence
-{
-    using Type = Sequence<Is...>;
-
-    static constexpr unsigned nDim = sizeof...(Is);
-
-    const unsigned mData[nDim] = {Is...};
-
-    template <unsigned I>
-    __host__ __device__ constexpr unsigned Get(Number<I>) const
-    {
-        return mData[I];
-    }
-
-    template <unsigned I0, unsigned I1, unsigned I2, unsigned I3>
-    __host__ __device__ constexpr auto ReorderByGetNewFromOld(Sequence<I0, I1, I2, I3>) const
-    {
-        constexpr auto old_sequence = Type{};
-
-        constexpr unsigned NR0 = old_sequence.mData[I0];
-        constexpr unsigned NR1 = old_sequence.mData[I1];
-        constexpr unsigned NR2 = old_sequence.mData[I2];
-        constexpr unsigned NR3 = old_sequence.mData[I3];
-
-        return Sequence<NR0, NR1, NR2, NR3>{};
-    }
-
-    template <unsigned I0, unsigned I1, unsigned I2, unsigned I3>
-    __host__ __device__ constexpr auto ReorderByPutOldToNew(Sequence<I0, I1, I2, I3>) const
-    {
-        // don't know how to implement this
-        printf("Sequence::ReorderByPutOldToNew not implemented");
-        assert(false);
-    }
-};
-
 #if DEVICE_BACKEND_CUDA
 template <typename T>
 __host__ __device__ constexpr T max(T a, T b)
...
src/include/config.h.in
View file @
fdaaaa50
...
@@ -4,9 +4,10 @@
 #if DEVICE_BACKEND_HIP
 #include "hip/hip_runtime.h"
+#include "hip/hip_fp16.h"
 #elif DEVICE_BACKEND_CUDA
 #include "cuda_runtime.h"
+#include "cuda_fp16.h"
 #include "nvToolsExt.h"
 #include "helper_cuda.h"
-#include "cuda_fp16.h"
 #endif
src/include/constant_integral.hip.hpp
0 → 100644
View file @
fdaaaa50
#pragma once

template <class T, T N>
struct integral_constant
{
    static const T value = N;

    __host__ __device__ constexpr T Get() const { return value; }
};

template <unsigned N>
using Number = integral_constant<unsigned, N>;
src/include/data_type.hip.hpp
0 → 100644
View file @
fdaaaa50
#pragma once
#include "config.h"

template <class T, unsigned N>
struct vector_type
{
};

template <>
struct vector_type<float, 1>
{
    using MemoryType = float;
};

template <>
struct vector_type<float, 2>
{
    using MemoryType = float2;

    __host__ __device__ static MemoryType Pack(float s0, float s1)
    {
        union
        {
            MemoryType vector;
            float scalar[2];
        } data;

        data.scalar[0] = s0;
        data.scalar[1] = s1;
        return data.vector;
    }
};

template <>
struct vector_type<float, 4>
{
    using MemoryType = float4;
};

template <>
struct vector_type<float2, 2>
{
    using MemoryType = float4;
};

#if 0
template <>
struct vector_type<half, 1>
{
    using MemoryType = half;

    __host__ __device__ static MemoryType Pack(half s) { return s; }
};

template <>
struct vector_type<half, 2>
{
    using MemoryType = half2;

    __host__ __device__ static MemoryType Pack(half s0, half s1)
    {
        union
        {
            MemoryType vector;
            half scalar[2];
        } data;

        data.scalar[0] = s0;
        data.scalar[1] = s1;
        return data.vector;
    }
};

template <>
struct vector_type<half, 4>
{
    using MemoryType = float2;
};

template <>
struct vector_type<half, 8>
{
    using MemoryType = float4;
};

template <>
struct vector_type<char, 1>
{
    using MemoryType = char;

    __host__ __device__ static MemoryType Pack(char s) { return s; }
};

template <>
struct vector_type<char, 2>
{
    using MemoryType = int16_t;

    __host__ __device__ static MemoryType Pack(char s0, char s1)
    {
        union
        {
            MemoryType vector;
            char scalar[2];
        } data;

        data.scalar[0] = s0;
        data.scalar[1] = s1;
        return data.vector;
    }
};

template <>
struct vector_type<char, 4>
{
    using MemoryType = int32_t;

    __host__ __device__ static MemoryType Pack(char s0, char s1, char s2, char s3)
    {
        union
        {
            MemoryType vector;
            char scalar[4];
        } data;

        data.scalar[0] = s0;
        data.scalar[1] = s1;
        data.scalar[2] = s2;
        data.scalar[3] = s3;
        return data.vector;
    }
};

template <>
struct vector_type<char, 8>
{
    using MemoryType = int64_t;
};

template <>
struct vector_type<int32_t, 2>
{
    using MemoryType = int64_t;
};

template <>
struct vector_type<char2, 2>
{
    using MemoryType = char4;
};

template <>
struct vector_type<char2, 4>
{
    using MemoryType = int64_t;
};

template <>
struct vector_type<char4, 1>
{
    using MemoryType = int;
};

template <>
struct vector_type<char4, 2>
{
    using MemoryType = int64_t;
};
#endif

__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1) { d += s0 * s1; }

__device__ void fused_multiply_accumulate(float& d, const float2& s0, const float2& s1)
{
    d += s0.x * s1.x;
    d += s0.y * s1.y;
}

__device__ void fused_multiply_accumulate(float& d, const float4& s0, const float4& s1)
{
    d += s0.x * s1.x;
    d += s0.y * s1.y;
    d += s0.z * s1.z;
    d += s0.w * s1.w;
}

#if 0
__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; }

__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
{
    d += s0.x * s1.x;
    d += s0.y * s1.y;
}

__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
{
    d += s0.x * s1.x + s0.y * s1.y;
}

__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) { d += s0 * s1; }

// TODO:: this interface is misleading, s0, s1 are actually int8x4
// need to make a better interface
__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
{
#if DEVICE_BACKEND_CUDA
    d = __dp4a(s0, s1, d);
#endif
}
#endif
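A small usage sketch of the new vector_type and fused_multiply_accumulate overloads defined above (not part of the commit; dot2 is a made-up name for illustration):

    __device__ float dot2(float a0, float a1, float b0, float b1)
    {
        using vec_t = vector_type<float, 2>::MemoryType; // float2

        const vec_t va = vector_type<float, 2>::Pack(a0, a1);
        const vec_t vb = vector_type<float, 2>::Pack(b0, b1);

        float acc = 0.0f;
        fused_multiply_accumulate(acc, va, vb); // acc += a0 * b0 + a1 * b1
        return acc;
    }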
src/include/functional.hip.hpp
0 → 100644
View file @
fdaaaa50
#pragma once

#include "constant_integral.hip.hpp"

template <unsigned NLoop>
struct static_loop_n
{
    template <class F>
    __host__ __device__ void operator()(F f) const
    {
        static_assert(NLoop > 1, "out-of-range");

        f(Number<NLoop - 1>{});
        static_loop_n<NLoop - 1>{}(f);
    }
};

template <>
struct static_loop_n<1>
{
    template <class F>
    __host__ __device__ void operator()(F f) const
    {
        f(Number<0>{});
    }
};

template <unsigned NLoop>
struct static_const_reduce_n
{
    template <class F, class Reduce>
    __host__ __device__ constexpr auto operator()(F f, Reduce r) const
    {
        static_assert(NLoop > 1, "out-of-range");

        constexpr auto a = f(Number<NLoop - 1>{});
        auto b = static_const_reduce_n<NLoop - 1>{}(f, r); // cannot use constexpr here, weird
        return r(a, b);
    }
};

template <>
struct static_const_reduce_n<1>
{
    template <class F, class Reduce>
    __host__ __device__ constexpr auto operator()(F f, Reduce) const
    {
        return f(Number<0>{});
    }
};

#if 0
template<class F>
__host__ __device__ constexpr auto unpacker(F f)
{
    return [=](auto xs_array){ f(xs...); };
}
#endif
\ No newline at end of file
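A hedged sketch of how the compile-time reduction helper above might be driven (the functor names are made up for illustration and are not part of the commit):

    struct SquareOf
    {
        template <unsigned I>
        __host__ __device__ constexpr unsigned operator()(Number<I>) const { return I * I; }
    };

    struct PlusOp
    {
        __host__ __device__ constexpr unsigned operator()(unsigned a, unsigned b) const { return a + b; }
    };

    __host__ __device__ unsigned sum_of_squares_up_to_3()
    {
        // evaluates 3*3 + 2*2 + 1*1 + 0*0 = 14 via the recursive operator() above
        return static_const_reduce_n<4>{}(SquareOf{}, PlusOp{});
    }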
src/include/gridwise_direct_convolution_2.hip.hpp → src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp
View file @
fdaaaa50
 #pragma once
 #include "common.hip.hpp"
 #include "ConstantTensorDescriptor.hip.hpp"
+#include "blockwise_2d_tensor_op.hip.hpp"
 #include "blockwise_4d_tensor_op.hip.hpp"
 #include "blockwise_direct_convolution.hip.hpp"
 #include "threadwise_4d_tensor_op.hip.hpp"
...
@@ -20,61 +21,86 @@ template <class Float,
           unsigned CPerThread,
           unsigned HoPerThread,
           unsigned WoPerThread,
+          unsigned InBlockCopyDataPerRead,
+          unsigned WeiBlockCopyDataPerRead,
           unsigned BlockSize,
           unsigned GridSize>
-__global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_in_global,
-                                              const Float* const __restrict__ p_wei_global,
-                                              Float* const __restrict__ p_out_global)
+__global__ void
+gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_in_global,
+                                             const Float* const __restrict__ p_wei_global,
+                                             Float* const __restrict__ p_out_global)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
     constexpr auto I2 = Number<2>{};
     constexpr auto I3 = Number<3>{};

-    constexpr auto in_global_desc  = InGlobalDesc{};
-    constexpr auto wei_global_desc = WeiGlobalDesc{};
-    constexpr auto out_global_desc = OutGlobalDesc{};
+    constexpr auto in_nchw_global_desc  = InGlobalDesc{};
+    constexpr auto wei_kcyx_global_desc = WeiGlobalDesc{};
+    constexpr auto out_nkhw_global_desc = OutGlobalDesc{};

-    constexpr unsigned Y = wei_global_desc.GetLength(I2);
-    constexpr unsigned X = wei_global_desc.GetLength(I3);
+    constexpr unsigned N = in_nchw_global_desc.GetLength(I0);
+
+    constexpr unsigned K = wei_kcyx_global_desc.GetLength(I0);
+    constexpr unsigned C = wei_kcyx_global_desc.GetLength(I1);
+    constexpr unsigned Y = wei_kcyx_global_desc.GetLength(I2);
+    constexpr unsigned X = wei_kcyx_global_desc.GetLength(I3);
+
+    // 2d view of wei for blockwise copy
+    constexpr auto wei_ke_global_desc = make_ConstantTensorDescriptor(Sequence<K, C * Y * X>{});

     constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
     constexpr unsigned WiPerBlock = WoPerBlock + X - 1;

-    constexpr auto in_block_desc =
-        make_ConstantTensorDescriptor(Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{});
+    constexpr auto in_nchw_block_desc = make_ConstantTensorDescriptor_aligned(
+        Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, Number<InBlockCopyDataPerRead>{});
+
+    // 2d view of wei for blockwise copy
+    constexpr auto wei_ke_block_desc = make_ConstantTensorDescriptor_aligned(
+        Sequence<KPerBlock, CPerBlock * Y * X>{}, Number<WeiBlockCopyDataPerRead>{});

-    constexpr auto wei_block_desc =
-        make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, Y, X>{});
+    constexpr auto wei_kcyx_block_desc = make_ConstantTensorDescriptor(
+        Sequence<KPerBlock, CPerBlock, Y, X>{},
+        Sequence<wei_ke_block_desc.GetStride(I0), Y * X, X, 1>{});

     // shared mem
-    constexpr unsigned in_block_size  = in_block_desc.GetElementSpace();
-    constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace();
+    constexpr unsigned in_block_size =
+        in_nchw_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
+    constexpr unsigned wei_block_size =
+        wei_kcyx_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});

-    __shared__ Float p_in_block[in_block_size];
-    __shared__ Float p_wei_block[wei_block_size];
+    constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
+                                       ? InBlockCopyDataPerRead
+                                       : WeiBlockCopyDataPerRead;
+
+    __shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)];
+    __shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];

     // threadwise tensors
     constexpr unsigned HiPerThread = HoPerThread + Y - 1;
     constexpr unsigned WiPerThread = WoPerThread + X - 1;

-    constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor(
-        Sequence<NPerThread, CPerThread, HiPerThread, WiPerThread>{}, in_block_desc.GetStrides());
+    constexpr auto in_nchw_thread_block_desc =
+        make_ConstantTensorDescriptor(Sequence<NPerThread, CPerThread, HiPerThread, WiPerThread>{},
+                                      in_nchw_block_desc.GetStrides());

-    constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor(
-        Sequence<KPerThread, CPerThread, Y, X>{}, wei_block_desc.GetStrides());
+    constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor(
+        Sequence<KPerThread, CPerThread, Y, X>{}, wei_kcyx_block_desc.GetStrides());

-    constexpr auto out_thread_desc = get_convolution_output_default_4d_tensor_descriptor(
-        in_thread_block_desc, wei_thread_block_desc);
+    constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor(
+        in_nchw_thread_block_desc, wei_kcyx_thread_block_desc);

     // register
-    Float p_out_thread[out_thread_desc.GetElementSpace()];
+    Float p_out_thread[out_nkhw_thread_desc.GetElementSpace()];

     // divide block work
-    constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
-    constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
-    constexpr unsigned HBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
-    constexpr unsigned WBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
+    constexpr unsigned NBlockWork =
+        (out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
+    constexpr unsigned KBlockWork =
+        (out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
+    constexpr unsigned HBlockWork =
+        (out_nkhw_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
+    constexpr unsigned WBlockWork =
+        (out_nkhw_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;

     const unsigned block_id = blockIdx.x;
...
@@ -121,34 +147,46 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
     constexpr auto blockwise_in_copy =
         Blockwise4dTensorCopy1<BlockSize,
                                Float,
-                               decltype(in_global_desc),
-                               decltype(in_block_desc),
-                               decltype(in_block_desc.GetLengths())>{};
+                               decltype(in_nchw_global_desc),
+                               decltype(in_nchw_block_desc),
+                               decltype(in_nchw_block_desc.GetLengths()),
+                               InBlockCopyDataPerRead>{};

+#if 0
     constexpr auto blockwise_wei_copy =
         Blockwise4dTensorCopy1<BlockSize,
                                Float,
-                               decltype(wei_global_desc),
-                               decltype(wei_block_desc),
-                               decltype(wei_block_desc.GetLengths())>{};
+                               decltype(wei_kcyx_global_desc),
+                               decltype(wei_kcyx_block_desc),
+                               decltype(wei_kcyx_block_desc.GetLengths()),
+                               1>{};
+#elif 1
+    const auto blockwise_wei_copy =
+        Blockwise2dTensorCopy3<BlockSize,
+                               Float,
+                               decltype(wei_ke_global_desc),
+                               decltype(wei_ke_block_desc),
+                               decltype(wei_ke_block_desc.GetLengths()),
+                               WeiBlockCopyDataPerRead>{};
+#endif

     // set threadwise output tensor to 0
-    threadwise_4d_tensor_set_zero(out_thread_desc, p_out_thread);
+    threadwise_4d_tensor_set_zero(out_nkhw_thread_desc, p_out_thread);

-    for(unsigned c_block_data_begin = 0; c_block_data_begin < in_global_desc.GetLength(I1);
-        c_block_data_begin += CPerBlock, __syncthreads())
+    for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
+        c_block_data_begin += CPerBlock, __syncthreads())
     {
         // copy input tensor to LDS
-        blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_data_begin,
-                                                                      c_block_data_begin,
-                                                                      hi_block_data_begin,
-                                                                      wi_block_data_begin),
-                              p_in_block);
+        blockwise_in_copy.Run(p_in_global +
+                                  in_nchw_global_desc.Get1dIndex(n_block_data_begin,
+                                                                 c_block_data_begin,
+                                                                 hi_block_data_begin,
+                                                                 wi_block_data_begin),
+                              p_in_block);

         // copy weight tensor to LDS
         blockwise_wei_copy.Run(
-            p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
+            p_wei_global +
+                wei_kcyx_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
             p_wei_block);

         __syncthreads();
...
@@ -158,27 +196,29 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
             // threadwise convolution
 #if 1
             threadwise_direct_convolution_2(
-                in_thread_block_desc,
+                in_nchw_thread_block_desc,
                 p_in_block +
-                    in_block_desc.Get1dIndex(n_thread_data_begin,
-                                             c_thread_data,
-                                             hi_thread_data_begin,
-                                             wi_thread_data_begin),
-                wei_thread_block_desc,
-                p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
-                out_thread_desc,
+                    in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
+                                                  c_thread_data,
+                                                  hi_thread_data_begin,
+                                                  wi_thread_data_begin),
+                wei_kcyx_thread_block_desc,
+                p_wei_block +
+                    wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
+                out_nkhw_thread_desc,
                 p_out_thread);
 #elif 0
             threadwise_direct_convolution_3(
-                in_thread_block_desc,
+                in_nchw_thread_block_desc,
                 p_in_block +
-                    in_block_desc.Get1dIndex(n_thread_data_begin,
-                                             c_thread_data,
-                                             hi_thread_data_begin,
-                                             wi_thread_data_begin),
-                wei_thread_block_desc,
-                p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
-                out_thread_desc,
+                    in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
+                                                  c_thread_data,
+                                                  hi_thread_data_begin,
+                                                  wi_thread_data_begin),
+                wei_kcyx_thread_block_desc,
+                p_wei_block +
+                    wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
+                out_nkhw_thread_desc,
                 p_out_thread);
 #endif
         }
...
@@ -186,13 +226,13 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
     // copy output tensor from register to global mem
     threadwise_4d_tensor_copy(
-        out_thread_desc,
+        out_nkhw_thread_desc,
         p_out_thread,
-        out_global_desc,
+        out_nkhw_global_desc,
         p_out_global +
-            out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
-                                       k_block_data_begin + k_thread_data_begin,
-                                       ho_block_data_begin + ho_thread_data_begin,
-                                       wo_block_data_begin + wo_thread_data_begin),
-        out_thread_desc.GetLengths());
+            out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
+                                            k_block_data_begin + k_thread_data_begin,
+                                            ho_block_data_begin + ho_thread_data_begin,
+                                            wo_block_data_begin + wo_thread_data_begin),
+        out_nkhw_thread_desc.GetLengths());
 }
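The shared-memory arrays above are padded to a multiple of max_align so that whichever of the two vector widths is larger still gets aligned accesses. A small numeric illustration of the same rounding (not from the commit; the helper name is made up):

    constexpr unsigned round_up(unsigned size, unsigned align) { return align * ((size + align - 1) / align); }

    // e.g. an input tile of 70 floats with InBlockCopyDataPerRead = 4 and
    // WeiBlockCopyDataPerRead = 2 gives max_align = 4 and is padded to 72 floats
    static_assert(round_up(70, 4) == 72, "LDS buffer rounded up to the wider read size");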
src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp
0 → 100644
View file @
fdaaaa50
#pragma once
#include "common.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp"
#include "blockwise_2d_tensor_op.hip.hpp"
#include "blockwise_4d_tensor_op.hip.hpp"
#include "blockwise_direct_convolution.hip.hpp"
#include "threadwise_4d_tensor_op.hip.hpp"
#include "threadwise_direct_convolution.hip.hpp"

template <class TInWei,
          class TOut,
          class TAccum,
          class InGlobalDesc,
          class WeiGlobalDesc,
          class OutGlobalDesc,
          unsigned ScalarPerVector,
          unsigned NPerBlock,
          unsigned KPerBlock,
          unsigned CPerBlock,
          unsigned HoPerBlock,
          unsigned WoPerBlock,
          unsigned NPerThread,
          unsigned KPerThread,
          unsigned CPerThread,
          unsigned HoPerThread,
          unsigned WoPerThread,
          unsigned InBlockCopyDataPerRead,
          unsigned WeiBlockCopyDataPerRead,
          unsigned BlockSize,
          unsigned GridSize>
__global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
    const typename vector_type<TInWei, ScalarPerVector>::MemoryType* const __restrict__ p_in_vec_global,
    const typename vector_type<TInWei, ScalarPerVector>::MemoryType* const __restrict__ p_wei_vec_global,
    TOut* const __restrict__ p_out_global)
{
    using in_scalar_t     = TInWei;
    using in_vector_mem_t = typename vector_type<in_scalar_t, ScalarPerVector>::MemoryType;
    using out_scalar_t    = TOut;
    using accum_t         = TAccum;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto in_nchw_vec_global_desc  = InGlobalDesc{};
    constexpr auto wei_kcyx_vec_global_desc = WeiGlobalDesc{};
    constexpr auto out_nkhw_global_desc     = OutGlobalDesc{};

    constexpr unsigned N = in_nchw_vec_global_desc.GetLength(I0);

    constexpr unsigned K = wei_kcyx_vec_global_desc.GetLength(I0);
    constexpr unsigned C = wei_kcyx_vec_global_desc.GetLength(I1);
    constexpr unsigned Y = wei_kcyx_vec_global_desc.GetLength(I2);
    constexpr unsigned X = wei_kcyx_vec_global_desc.GetLength(I3);

    // 2d view of wei for blockwise copy
    constexpr auto wei_ke_vec_global_desc = make_ConstantTensorDescriptor(Sequence<K, C * Y * X>{});

    constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
    constexpr unsigned WiPerBlock = WoPerBlock + X - 1;

    constexpr auto in_nchw_vec_block_desc = make_ConstantTensorDescriptor_aligned(
        Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, Number<InBlockCopyDataPerRead>{});

    // 2d view of wei for blockwise copy
    constexpr auto wei_ke_vec_block_desc = make_ConstantTensorDescriptor_aligned(
        Sequence<KPerBlock, CPerBlock * Y * X>{}, Number<WeiBlockCopyDataPerRead>{});

    constexpr auto wei_kcyx_vec_block_desc = make_ConstantTensorDescriptor(
        Sequence<KPerBlock, CPerBlock, Y, X>{},
        Sequence<wei_ke_vec_block_desc.GetStride(I0), Y * X, X, 1>{});

    // shared mem
    constexpr unsigned in_block_size =
        in_nchw_vec_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
    constexpr unsigned wei_block_size =
        wei_kcyx_vec_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});

    constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
                                       ? InBlockCopyDataPerRead
                                       : WeiBlockCopyDataPerRead;

    __shared__ in_vector_mem_t p_in_vec_block[max_align * ((in_block_size + max_align - 1) / max_align)];
    __shared__ in_vector_mem_t p_wei_vec_block[max_align * ((wei_block_size + max_align - 1) / max_align)];

    // threadwise tensors
    constexpr unsigned HiPerThread = HoPerThread + Y - 1;
    constexpr unsigned WiPerThread = WoPerThread + X - 1;

    constexpr auto in_nchw_vec_thread_block_desc =
        make_ConstantTensorDescriptor(Sequence<NPerThread, CPerThread, HiPerThread, WiPerThread>{},
                                      in_nchw_vec_block_desc.GetStrides());

    constexpr auto wei_kcyx_vec_thread_block_desc = make_ConstantTensorDescriptor(
        Sequence<KPerThread, CPerThread, Y, X>{}, wei_kcyx_vec_block_desc.GetStrides());

    constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor(
        in_nchw_vec_thread_block_desc, wei_kcyx_vec_thread_block_desc);

    // register
    out_scalar_t p_out_thread[out_nkhw_thread_desc.GetElementSpace()];

    // divide block work
    constexpr unsigned NBlockWork = (out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
    constexpr unsigned KBlockWork = (out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
    constexpr unsigned HBlockWork = (out_nkhw_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
    constexpr unsigned WBlockWork = (out_nkhw_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;

    const unsigned block_id = blockIdx.x;

    unsigned itmp                   = block_id;
    const unsigned n_block_work_id  = itmp / (KBlockWork * HBlockWork * WBlockWork);
    itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork);
    const unsigned k_block_work_id  = itmp / (HBlockWork * WBlockWork);
    itmp -= k_block_work_id * (HBlockWork * WBlockWork);
    const unsigned h_block_work_id  = itmp / WBlockWork;
    const unsigned w_block_work_id  = itmp - h_block_work_id * WBlockWork;

    const unsigned n_block_data_begin  = n_block_work_id * NPerBlock;
    const unsigned k_block_data_begin  = k_block_work_id * KPerBlock;
    const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
    const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;

    const unsigned hi_block_data_begin = ho_block_data_begin; // minus padding
    const unsigned wi_block_data_begin = wo_block_data_begin; // minus padding

    // divide thread work
    constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread;
    constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread;
    constexpr unsigned HThreadWork = (HoPerBlock + HoPerThread - 1) / HoPerThread;
    constexpr unsigned WThreadWork = (WoPerBlock + WoPerThread - 1) / WoPerThread;

    const unsigned thread_id = threadIdx.x;

    itmp                             = thread_id;
    const unsigned n_thread_work_id  = itmp / (KThreadWork * HThreadWork * WThreadWork);
    itmp -= n_thread_work_id * (KThreadWork * HThreadWork * WThreadWork);
    const unsigned k_thread_work_id  = itmp / (HThreadWork * WThreadWork);
    itmp -= k_thread_work_id * (HThreadWork * WThreadWork);
    const unsigned h_thread_work_id  = itmp / WThreadWork;
    const unsigned w_thread_work_id  = itmp - h_thread_work_id * WThreadWork;

    const unsigned n_thread_data_begin  = n_thread_work_id * NPerThread;
    const unsigned k_thread_data_begin  = k_thread_work_id * KPerThread;
    const unsigned ho_thread_data_begin = h_thread_work_id * HoPerThread;
    const unsigned wo_thread_data_begin = w_thread_work_id * WoPerThread;

    const unsigned hi_thread_data_begin = ho_thread_data_begin;
    const unsigned wi_thread_data_begin = wo_thread_data_begin;

    constexpr auto blockwise_in_copy =
        Blockwise4dTensorCopy1<BlockSize,
                               in_vector_mem_t,
                               decltype(in_nchw_vec_global_desc),
                               decltype(in_nchw_vec_block_desc),
                               decltype(in_nchw_vec_block_desc.GetLengths()),
                               InBlockCopyDataPerRead>{};

#if 0
    constexpr auto blockwise_wei_copy =
        Blockwise4dTensorCopy1<BlockSize,
                               in_vector_mem_t,
                               decltype(wei_kcyx_vec_global_desc),
                               decltype(wei_kcyx_vec_block_desc),
                               decltype(wei_kcyx_vec_block_desc.GetLengths()),
                               1>{};
#elif 1
    const auto blockwise_wei_copy =
        Blockwise2dTensorCopy3<BlockSize,
                               in_vector_mem_t,
                               decltype(wei_ke_vec_global_desc),
                               decltype(wei_ke_vec_block_desc),
                               decltype(wei_ke_vec_block_desc.GetLengths()),
                               WeiBlockCopyDataPerRead>{};
#endif

#if 1 // debug
    // set threadwise output tensor to 0
    threadwise_4d_tensor_set_zero(out_nkhw_thread_desc, p_out_thread);
#endif

    for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
        c_block_data_begin += CPerBlock, __syncthreads())
    {
        // copy input tensor to LDS
        blockwise_in_copy.Run(p_in_vec_global +
                                  in_nchw_vec_global_desc.Get1dIndex(n_block_data_begin,
                                                                     c_block_data_begin,
                                                                     hi_block_data_begin,
                                                                     wi_block_data_begin),
                              p_in_vec_block);

        // copy weight tensor to LDS
        blockwise_wei_copy.Run(
            p_wei_vec_global +
                wei_kcyx_vec_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
            p_wei_vec_block);

        __syncthreads();

        for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
        {
            // threadwise convolution
#if 1
            threadwise_direct_convolution_2(
                in_nchw_vec_thread_block_desc,
                p_in_vec_block +
                    in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
                                                      c_thread_data,
                                                      hi_thread_data_begin,
                                                      wi_thread_data_begin),
                wei_kcyx_vec_thread_block_desc,
                p_wei_vec_block +
                    wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
                out_nkhw_thread_desc,
                p_out_thread);
#elif 0
            threadwise_direct_convolution_3(
                in_nchw_vec_thread_block_desc,
                p_in_vec_block +
                    in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
                                                      c_thread_data,
                                                      hi_thread_data_begin,
                                                      wi_thread_data_begin),
                wei_kcyx_vec_thread_block_desc,
                p_wei_vec_block +
                    wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
                out_nkhw_thread_desc,
                p_out_thread);
#endif
        }
    }

    // copy output tensor from register to global mem
    threadwise_4d_tensor_copy(
        out_nkhw_thread_desc,
        p_out_thread,
        out_nkhw_global_desc,
        p_out_global +
            out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
                                            k_block_data_begin + k_thread_data_begin,
                                            ho_block_data_begin + ho_thread_data_begin,
                                            wo_block_data_begin + wo_thread_data_begin),
        out_nkhw_thread_desc.GetLengths());
}
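The block- and thread-work decomposition above repeatedly divides a flat id by the remaining work counts. A host-side sketch of the same arithmetic with made-up work counts, to show the mapping is a bijection (not part of the commit):

    #include <cassert>

    void check_block_decomposition()
    {
        const unsigned KBlockWork = 4, HBlockWork = 3, WBlockWork = 5;
        const unsigned block_id = 37;

        unsigned itmp    = block_id;
        const unsigned n = itmp / (KBlockWork * HBlockWork * WBlockWork);
        itmp -= n * (KBlockWork * HBlockWork * WBlockWork);
        const unsigned k = itmp / (HBlockWork * WBlockWork);
        itmp -= k * (HBlockWork * WBlockWork);
        const unsigned h = itmp / WBlockWork;
        const unsigned w = itmp - h * WBlockWork;

        // re-flattening recovers the original id
        assert(((n * KBlockWork + k) * HBlockWork + h) * WBlockWork + w == block_id);
    }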
src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp
0 → 100644
View file @
fdaaaa50
#pragma once
#include "common.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp"
#include "ConstantMatrixDescriptor.hip.hpp"
#include "blockwise_4d_tensor_op.hip.hpp"
#include "blockwise_2d_tensor_op.hip.hpp"
#include "threadwise_2d_tensor_op.hip.hpp"
#include "blockwise_gemm.hip.hpp"

// define B = flatten(N, Hi, Wi)
template <unsigned GridSize,
          unsigned BlockSize,
          class Float,
          class InGlobalDesc,
          class WeiGlobalDesc,
          class OutGlobalDesc,
          unsigned BPerBlock,
          unsigned KPerBlock,
          unsigned CPerBlock,
          unsigned BPerThread,
          unsigned KPerThread,
          unsigned GemmThreadPerColumnPerCluster,
          unsigned GemmThreadPerRowPerCluster,
          unsigned GemmMPerThreadSubC,
          unsigned GemmNPerThreadSubC,
          unsigned GemmMLevel0Cluster,
          unsigned GemmNLevel0Cluster,
          unsigned GemmMLevel1Cluster,
          unsigned GemmNLevel1Cluster,
          unsigned GemmKPerThreadLoop,
          unsigned InBlockCopyThreadPerDim0,
          unsigned InBlockCopyThreadPerDim1,
          unsigned WeiBlockCopyThreadPerDim0,
          unsigned WeiBlockCopyThreadPerDim1,
          unsigned InBlockCopyDataPerRead,
          unsigned WeiBlockCopyDataPerRead>
__global__ void
gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restrict__ p_in_global,
                                                    const Float* const __restrict__ p_wei_global,
                                                    Float* const __restrict__ p_out_global)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto in_chwn_global_desc  = InGlobalDesc{};
    constexpr auto wei_cyxk_global_desc = WeiGlobalDesc{};
    constexpr auto out_khwn_global_desc = OutGlobalDesc{};

    constexpr unsigned C  = in_chwn_global_desc.GetLength(I0);
    constexpr unsigned Hi = in_chwn_global_desc.GetLength(I1);
    constexpr unsigned Wi = in_chwn_global_desc.GetLength(I2);
    constexpr unsigned N  = in_chwn_global_desc.GetLength(I3);

    constexpr unsigned K  = out_khwn_global_desc.GetLength(I0);
    constexpr unsigned Ho = out_khwn_global_desc.GetLength(I1);
    constexpr unsigned Wo = out_khwn_global_desc.GetLength(I2);

    constexpr unsigned Y = wei_cyxk_global_desc.GetLength(I1);
    constexpr unsigned X = wei_cyxk_global_desc.GetLength(I2);

    constexpr unsigned B = N * Hi * Wi;

    constexpr unsigned BGhostRead = (Y - 1) * Wi + (X - 1);

    // divide block work by 2d: [K, B]
    constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock;
    constexpr unsigned BBlockWork = (B + BPerBlock - 1) / BPerBlock;

    const unsigned k_block_work_id = get_block_1d_id() / BBlockWork;
    const unsigned b_block_work_id = get_block_1d_id() - k_block_work_id * BBlockWork;

    const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
    const unsigned b_block_data_begin = b_block_work_id * BPerBlock;

    // flattend (2d) tensor view of gridwise input
    constexpr auto in_cb_global_desc  = make_ConstantTensorDescriptor(Sequence<C, B>{});
    constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * Y * X, K>{});

    // tensor view of blockwise input and weight
    // be careful of alignment
    constexpr auto in_cb_block_desc = make_ConstantTensorDescriptor_aligned(
        Sequence<CPerBlock, BPerBlock + BGhostRead>{}, Number<InBlockCopyDataPerRead>{});

    constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor_aligned(
        Sequence<CPerBlock * Y * X, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});

    constexpr auto wei_cyxk_block_desc = make_ConstantTensorDescriptor_aligned(
        Sequence<CPerBlock, Y, X, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});

    // tensor view of threadwise output in register
    constexpr auto out_kb_thread_desc =
        make_ConstantTensorDescriptor(Sequence<KPerThread, BPerThread>{});

#if 0
    if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
    {
        print_ConstantTensorDescriptor(in_chwn_global_desc, "in_chwn_global_desc");
        print_ConstantTensorDescriptor(wei_cyxk_global_desc, "wei_cyxk_global_desc");
        print_ConstantTensorDescriptor(out_khwn_global_desc, "out_khwn_global_desc");
        print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc");
        print_ConstantTensorDescriptor(wei_ek_global_desc, "wei_ek_global_desc");
        print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc");
        print_ConstantTensorDescriptor(wei_cyxk_block_desc, "wei_cyxk_block_desc");
        print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc");
        print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc");

        printf("KPerBlock %u\n", KPerBlock);
    }
#endif

    // blockwise in copy
    // formmat is [CPerBlock,BPerBlock + BGhostRead]
#if 0
    const auto blockwise_in_copy =
        Blockwise2dTensorCopy1<BlockSize,
                               Float,
                               decltype(in_cb_global_desc),
                               decltype(in_cb_block_desc),
                               decltype(in_cb_block_desc.GetLengths())>{};
#elif 0
    const auto blockwise_in_copy =
        Blockwise2dTensorCopy2<BlockSize,
                               Float,
                               decltype(in_cb_global_desc),
                               decltype(in_cb_block_desc),
                               decltype(in_cb_block_desc.GetLengths()),
                               InBlockCopyThreadPerDim0,
                               InBlockCopyThreadPerDim1>{};
#elif 1
    const auto blockwise_in_copy =
        Blockwise2dTensorCopy3<BlockSize,
                               Float,
                               decltype(in_cb_global_desc),
                               decltype(in_cb_block_desc),
                               decltype(in_cb_block_desc.GetLengths()),
                               InBlockCopyDataPerRead>{};
#endif

    // blockwise wei copy
    // format is [CPerBlock*Y*X,KPerBlock]
#if 0
    const auto blockwise_wei_copy =
        Blockwise2dTensorCopy1<BlockSize,
                               Float,
                               decltype(wei_ek_global_desc),
                               decltype(wei_ek_block_desc),
                               decltype(wei_ek_block_desc.GetLengths())>{};
#elif 0
    const auto blockwise_wei_copy =
        Blockwise2dTensorCopy2<BlockSize,
                               Float,
                               decltype(wei_ek_global_desc),
                               decltype(wei_ek_block_desc),
                               decltype(wei_ek_block_desc.GetLengths()),
                               WeiBlockCopyThreadPerDim0,
                               WeiBlockCopyThreadPerDim1>{};
#elif 1
    const auto blockwise_wei_copy =
        Blockwise2dTensorCopy3<BlockSize,
                               Float,
                               decltype(wei_ek_global_desc),
                               decltype(wei_ek_block_desc),
                               decltype(wei_ek_block_desc.GetLengths()),
                               WeiBlockCopyDataPerRead>{};
#endif

    // a series of blockwise GEMM
    // c_mtx += transpose(a_mtx) * b_mtx
    // a_mtx and b_mtx saved in LDS, c_mtx saved in register
    //   a_mtx[C,K] is a sub-matrix of wei_block[C,Y,X,K]
    //   b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
    //   c_mtx[K,B] is out_block[K,B]
    constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
        Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_cyxk_block_desc.GetStride(I0)>{});

    constexpr auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
        Number<CPerBlock>{}, Number<BPerBlock>{}, Number<in_cb_block_desc.GetStride(I0)>{});

    constexpr auto c_kxb_thread_mtx_desc =
        make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<BPerThread>{});

    const auto blockwise_gemm =
        BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2<BlockSize,
                                                                decltype(a_cxk_block_mtx_desc),
                                                                decltype(b_cxb_block_mtx_desc),
                                                                decltype(c_kxb_thread_mtx_desc),
                                                                GemmMPerThreadSubC,
                                                                GemmNPerThreadSubC,
                                                                GemmMLevel0Cluster,
                                                                GemmNLevel0Cluster,
                                                                GemmMLevel1Cluster,
                                                                GemmNLevel1Cluster,
                                                                GemmKPerThreadLoop>{};

    // LDS: be careful of alignment
    constexpr unsigned in_block_size =
        in_cb_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
    constexpr unsigned wei_block_size =
        wei_cyxk_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});

    constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
                                       ? InBlockCopyDataPerRead
                                       : WeiBlockCopyDataPerRead;

    // LDS
    __shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)];
    __shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];

    const Float* p_in_global_block_offset =
        p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);

    const Float* p_wei_global_block_offset =
        p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);

    // register
    Float p_out_thread[out_kb_thread_desc.GetElementSpace()];

    // set threadwise output tensor to 0
    threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread);

    for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
        c_block_data_begin += CPerBlock,
                 p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0),
                 p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0),
                 __syncthreads())
    {
        // load data
        blockwise_in_copy.Run(p_in_global_block_offset, p_in_block);
        blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block);

        __syncthreads();

        // compute on current data
        // a series of GEMM
        for(unsigned y = 0; y < Y; ++y)
        {
            for(unsigned x = 0; x < X; ++x)
            {
                auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
#if 1
                blockwise_gemm.Run
#else
                blockwise_gemm.Run_RegisterDoubleBuffer
#endif
                    (p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
                     p_in_block + y * Wi + x,
                     p_out_thread,
                     f_accum);
            }
        }
    }

    // output: register to global mem,
    const auto c_thread_mtx_begin =
        blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());

    const unsigned k_thread_data_begin = k_block_data_begin + c_thread_mtx_begin.row;
    const unsigned b_thread_data_begin = b_block_data_begin + c_thread_mtx_begin.col;

    for(unsigned k = 0; k < out_kb_thread_desc.GetLength(I0); ++k)
    {
        for(unsigned b = 0; b < out_kb_thread_desc.GetLength(I1); ++b)
        {
            const auto c_thread_mtx_distance =
                blockwise_gemm.GetDistanceFromBeginOfThreadMatrixC(k, b);

            unsigned k_data = k_thread_data_begin + c_thread_mtx_distance.row;
            unsigned b_data = b_thread_data_begin + c_thread_mtx_distance.col;

            unsigned h_data = b_data / (Wi * N);
            unsigned itmp   = b_data - h_data * (Wi * N);
            unsigned w_data = itmp / N;
            unsigned n_data = itmp - w_data * N;

            if(n_data < N && h_data < Ho && w_data < Wo)
            {
                p_out_global[out_khwn_global_desc.Get1dIndex(k_data, h_data, w_data, n_data)] =
                    p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)];
            }
        }
    }
}
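The write-out loop above decodes a flat b index back into (h, w, n) with n fastest, i.e. b = (h * Wi + w) * N + n. A quick compile-time check of that inversion (not from the commit; the numbers are made up):

    constexpr unsigned Wi_ex = 16, N_ex = 8;
    constexpr unsigned b_ex  = (3 * Wi_ex + 5) * N_ex + 2;                 // h = 3, w = 5, n = 2
    constexpr unsigned h_ex  = b_ex / (Wi_ex * N_ex);
    constexpr unsigned w_ex  = (b_ex - h_ex * (Wi_ex * N_ex)) / N_ex;
    constexpr unsigned n_ex  = b_ex - h_ex * (Wi_ex * N_ex) - w_ex * N_ex;
    static_assert(h_ex == 3 && w_ex == 5 && n_ex == 2, "decode matches the flattened layout");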
src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
View file @
fdaaaa50
...
@@ -259,7 +259,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_b
         __syncthreads();

         // load next data
-#if 1
+#if 0
         blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_next);
         blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_next);
 #elif 1
...
@@ -292,7 +292,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_b
             }
         }

-#if 0
+#if 1
         blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next);
         blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block_next);
 #endif
...