yangql / composable_kernel-1 · Commits

Commit 4f0fc72e, authored Mar 18, 2019 by Chao Liu

    adding fp16 direct that reads pre-vectorized data

parent 7faf269c
Showing 8 changed files with 372 additions and 178 deletions
  +43  -31   driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp
  +160 -0    driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp (new)
  +83  -80   driver/driver.hip.cpp
  +1   -1    src/include/blockwise_2d_tensor_op.hip.hpp
  +3   -5    src/include/blockwise_4d_tensor_op.hip.hpp
  +27  -11   src/include/common.hip.hpp
  +4   -4    src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp
  +51  -46   src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp
driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp (view file @ 4f0fc72e)
 #pragma once

 #include <unistd.h>
 #include "device.hpp"
-//#include "gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp"
-#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp"
+#include "gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp"

 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc,
 ...
@@ -50,6 +49,24 @@ void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc,
     constexpr unsigned InBlockCopyDataPerRead  = 2;
     constexpr unsigned WeiBlockCopyDataPerRead = 4;

+    constexpr unsigned BlockSize = 128;
+#elif 1
+    // 3x3, 34x34, 128 thread, fp16
+    constexpr unsigned NPerBlock  = 2;
+    constexpr unsigned KPerBlock  = 32;
+    constexpr unsigned CPerBlock  = 4;
+    constexpr unsigned HoPerBlock = 2;
+    constexpr unsigned WoPerBlock = 32;
+
+    constexpr unsigned NPerThread  = 2;
+    constexpr unsigned KPerThread  = 4;
+    constexpr unsigned CPerThread  = 2;
+    constexpr unsigned HoPerThread = 2;
+    constexpr unsigned WoPerThread = 2;
+
+    constexpr unsigned InBlockCopyDataPerRead  = 2;
+    constexpr unsigned WeiBlockCopyDataPerRead = 4;
+
     constexpr unsigned BlockSize = 128;
 #endif
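A quick consistency check on these tuning parameters (my arithmetic; the commit itself asserts nothing of the sort): if each thread produces one NPerThread x KPerThread x HoPerThread x WoPerThread output subtile, covering a block's tile needs (2/2)*(32/4)*(2/2)*(32/2) = 1*8*1*16 = 128 threads, which matches BlockSize. As a compile-time sketch:

    // hedged sketch: constants mirror the fp16 configuration above; the tiling
    // relation is my reading of the blocking scheme, not a check in the repo
    constexpr unsigned NPerBlock = 2, KPerBlock = 32, HoPerBlock = 2, WoPerBlock = 32;
    constexpr unsigned NPerThread = 2, KPerThread = 4, HoPerThread = 2, WoPerThread = 2;
    constexpr unsigned BlockSize = 128;

    static_assert((NPerBlock / NPerThread) * (KPerBlock / KPerThread) *
                          (HoPerBlock / HoPerThread) * (WoPerBlock / WoPerThread) ==
                      BlockSize,
                  "one output subtile per thread should exactly fill the workgroup");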
...
@@ -61,35 +78,30 @@ void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc,
     for(unsigned i = 0; i < nrepeat; ++i)
     {
-        float time = launch_kernel(
-#if 0
-            gridwise_direct_convolution_2_nchw_kcyx_nkhw
-#else
-            gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw
-#endif
-                <T, InDesc, WeiDesc, OutDesc,
-                 NPerBlock, KPerBlock, CPerBlock, HoPerBlock, WoPerBlock,
-                 NPerThread, KPerThread, CPerThread, HoPerThread, WoPerThread,
-                 InBlockCopyDataPerRead, WeiBlockCopyDataPerRead, BlockSize, GridSize>,
-            dim3(GridSize),
-            dim3(BlockSize),
-            static_cast<T*>(in_device_buf.GetDeviceBuffer()),
-            static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
-            static_cast<T*>(out_device_buf.GetDeviceBuffer()));
+        float time = launch_kernel(gridwise_direct_convolution_2_nchw_kcyx_nkhw<
+                                       T, InDesc, WeiDesc, OutDesc,
+                                       NPerBlock, KPerBlock, CPerBlock, HoPerBlock, WoPerBlock,
+                                       NPerThread, KPerThread, CPerThread, HoPerThread, WoPerThread,
+                                       InBlockCopyDataPerRead, WeiBlockCopyDataPerRead,
+                                       BlockSize, GridSize>,
+                                   dim3(GridSize),
+                                   dim3(BlockSize),
+                                   static_cast<T*>(in_device_buf.GetDeviceBuffer()),
+                                   static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
+                                   static_cast<T*>(out_device_buf.GetDeviceBuffer()));

         printf("Elapsed time : %f ms\n", time);
         usleep(std::min(time * 1000, float(10000)));
 ...
driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp (new file, 0 → 100644, view file @ 4f0fc72e)
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp"

template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
                                                           const Tensor<T>& in_nchw,
                                                           WeiDesc,
                                                           const Tensor<T>& wei_kcyx,
                                                           OutDesc,
                                                           Tensor<T>& out_nkhw,
                                                           unsigned nrepeat)
{
    constexpr unsigned NVector = 1;

    using vector_type_t = vector_type<T, NVector>;
    using vector_t      = typename vector_type_t::VectorType;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto in_nchw_desc  = InDesc{};
    constexpr auto wei_kcyx_desc = WeiDesc{};
    constexpr auto out_nkhw_desc = OutDesc{};

    constexpr unsigned Hi = in_nchw_desc.GetLength(I2);
    constexpr unsigned Wi = in_nchw_desc.GetLength(I3);

    constexpr unsigned N  = out_nkhw_desc.GetLength(I0);
    constexpr unsigned Ho = out_nkhw_desc.GetLength(I2);
    constexpr unsigned Wo = out_nkhw_desc.GetLength(I3);

    constexpr unsigned K = wei_kcyx_desc.GetLength(I0);
    constexpr unsigned C = wei_kcyx_desc.GetLength(I1);
    constexpr unsigned Y = wei_kcyx_desc.GetLength(I2);
    constexpr unsigned X = wei_kcyx_desc.GetLength(I3);

    // vectorized input
    auto in_nchw_vec_desc = make_ConstantTensorDescriptor(Sequence<N, C / NVector, Hi, Wi>{});
    ostream_ConstantTensorDescriptor(in_nchw_vec_desc, std::cout << "in_nchw_vec_desc: ");

    Tensor<vector_t> in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc));

    auto f_vectorized_nchw = [&](auto n, auto c, auto h, auto w) {
#if 1
        in_nchw_vec(n, c, h, w) = in_nchw(n, c, h, w);
#else
        in_nchw_vec(n, c, h, w) =
            vector_type_t::pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w));
#endif
    };

    make_ParallelTensorFunctor(f_vectorized_nchw, N, C, Hi, Wi)(std::thread::hardware_concurrency());

    // vectorize weight
    auto wei_kcyx_vec_desc = make_ConstantTensorDescriptor(Sequence<K, C / NVector, Y, X>{});
    ostream_ConstantTensorDescriptor(wei_kcyx_vec_desc, std::cout << "wei_kcyx_vec_desc: ");

    Tensor<vector_t> wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc));

    auto f_vectorized_kcyx = [&](auto k, auto c, auto y, auto x) {
#if 1
        wei_kcyx_vec(k, c, y, x) = wei_kcyx(k, c, y, x);
#else
        wei_kcyx_vec(k, c, y, x) =
            vector_type_t::pack(wei_kcyx(k, 2 * c, y, x), wei_kcyx(k, 2 * c + 1, y, x));
#endif
    };

    make_ParallelTensorFunctor(f_vectorized_kcyx, K, C, Y, X)(std::thread::hardware_concurrency());

    //
    DeviceMem in_nchw_vec_device_buf(sizeof(vector_t) * in_nchw_vec.mDesc.GetElementSpace());
    DeviceMem wei_kcyx_vec_device_buf(sizeof(vector_t) * wei_kcyx_vec.mDesc.GetElementSpace());
    DeviceMem out_nkhw_device_buf(sizeof(T) * out_nkhw.mDesc.GetElementSpace());

    in_nchw_vec_device_buf.ToDevice(in_nchw_vec.mData.data());
    wei_kcyx_vec_device_buf.ToDevice(wei_kcyx_vec.mData.data());
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
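    // Note: NVector is 1 in this commit, so the two packing lambdas above are
    // pass-through copies; the disabled #else branches show the intended fp16
    // path. With NVector = 2, the packed channel extent shrinks to C / NVector
    // and entry c holds original channels 2*c and 2*c + 1, roughly
    // (hypothetical sketch following the #else branch, assuming C is even):
    //
    //     for(unsigned c = 0; c < C / 2; ++c)
    //         in_nchw_vec(n, c, h, w) = vector_type<T, 2>::pack(
    //             in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w));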
#if 1
    // 3x3, 34x34, 128 thread
    constexpr unsigned NPerBlock  = 2;
    constexpr unsigned KPerBlock  = 32;
    constexpr unsigned CPerBlock  = 4;
    constexpr unsigned HoPerBlock = 2;
    constexpr unsigned WoPerBlock = 32;

    constexpr unsigned NPerThread  = 2;
    constexpr unsigned KPerThread  = 4;
    constexpr unsigned CPerThread  = 2;
    constexpr unsigned HoPerThread = 2;
    constexpr unsigned WoPerThread = 2;

    constexpr unsigned InBlockCopyDataPerRead  = 2;
    constexpr unsigned WeiBlockCopyDataPerRead = 4;

    constexpr unsigned BlockSize = 128;
#elif 1
    // 3x3, 34x34, 128 thread, fp16
    constexpr unsigned NPerBlock  = 2;
    constexpr unsigned KPerBlock  = 32;
    constexpr unsigned CPerBlock  = 4;
    constexpr unsigned HoPerBlock = 2;
    constexpr unsigned WoPerBlock = 32;

    constexpr unsigned NPerThread  = 2;
    constexpr unsigned KPerThread  = 4;
    constexpr unsigned CPerThread  = 2;
    constexpr unsigned HoPerThread = 2;
    constexpr unsigned WoPerThread = 2;

    constexpr unsigned InBlockCopyDataPerRead  = 2;
    constexpr unsigned WeiBlockCopyDataPerRead = 4;

    constexpr unsigned BlockSize = 128;
#endif

    constexpr unsigned GridSize =
        (N / NPerBlock) * (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock);

    printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);
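    // Worked example (my arithmetic, for illustration): with the 34x34 problem
    // driver.hip.cpp selects (N = 64, K = 64, and Ho = Wo = 32 for a 3x3 valid
    // convolution on a 34x34 input),
    //     GridSize = (64/2) * (64/32) * (32/2) * (32/32) = 32 * 2 * 16 * 1 = 1024
    // workgroups of BlockSize = 128 threads.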
    for(unsigned i = 0; i < nrepeat; ++i)
    {
        float time = launch_kernel(gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw<
                                       T,
                                       decltype(in_nchw_vec_desc),
                                       decltype(wei_kcyx_vec_desc),
                                       decltype(out_nkhw_desc),
                                       NVector,
                                       NPerBlock, KPerBlock, CPerBlock, HoPerBlock, WoPerBlock,
                                       NPerThread, KPerThread, CPerThread, HoPerThread, WoPerThread,
                                       InBlockCopyDataPerRead, WeiBlockCopyDataPerRead,
                                       BlockSize, GridSize>,
                                   dim3(GridSize),
                                   dim3(BlockSize),
                                   static_cast<T*>(in_nchw_vec_device_buf.GetDeviceBuffer()),
                                   static_cast<T*>(wei_kcyx_vec_device_buf.GetDeviceBuffer()),
                                   static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));

        printf("Elapsed time : %f ms\n", time);
        usleep(std::min(time * 1000, float(10000)));
    }

    out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
driver/driver.hip.cpp (view file @ 4f0fc72e)
...
@@ -9,6 +9,7 @@
 #include "conv_common.hip.hpp"
 //#include "device_direct_convolution_1.hpp"
 #include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp"
+#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
 //#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp"
 //#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
 //#include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp"
...
@@ -34,25 +35,6 @@ struct GeneratorTensor_2
     }
 };

-struct GeneratorTensor_3
-{
-    template <class... Is>
-    double operator()(Is... is)
-    {
-#if 0
-        std::initializer_list<std::size_t> ls = {static_cast<std::size_t>(is)...};
-        return std::accumulate(ls.begin(), ls.end(), std::size_t(0));
-#elif 1
-        assert(sizeof...(Is) > 0);
-
-        std::initializer_list<std::size_t> ids = {static_cast<std::size_t>(is)...};
-        std::vector<std::size_t> lens(sizeof...(Is), 100);
-        std::vector<std::size_t> strides(sizeof...(Is), 1);
-        std::partial_sum(lens.rbegin(), lens.rbegin() + (sizeof...(Is) - 1), strides.rbegin() + 1);
-        return std::inner_product(ids.begin(), ids.end(), strides.begin(), std::size_t(0)) + 1;
-#endif
-    }
-};
-
 struct GeneratorTensor_Checkboard
 {
     template <class... Ts>
...
@@ -129,7 +111,7 @@ void host_direct_convolution(
                     if(hi >= 0 && hi < in_nchw.mDesc.GetLengths()[2] && wi >= 0 &&
                        wi < in_nchw.mDesc.GetLengths()[3])
                     {
-                        v += in_nchw(n, c, hi, wi) * wei_kcyx(k, c, y, x);
+                        v += double(in_nchw(n, c, hi, wi)) * double(wei_kcyx(k, c, y, x));
                     }
                 }
             }
 ...
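The host reference now promotes both operands to double before the multiply-accumulate. With fp16 data this matters: if the reference accumulated at storage precision, its own rounding error would be conflated with the kernel's during verification. A standalone sketch of the effect (my illustration, not code from the commit):

    #include <cstdio>

    int main()
    {
        // add 1e-4 one million times; the float sum drifts visibly once the
        // running total dwarfs the addend, while the double sum stays near 100
        float  acc_f = 0.0f;
        double acc_d = 0.0;
        for(int i = 0; i < 1000000; ++i)
        {
            acc_f += 1e-4f;
            acc_d += double(1e-4f);
        }
        std::printf("float: %.6f  double: %.6f\n", acc_f, acc_d);
    }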
@@ -177,11 +159,11 @@ void host_winograd_3x3_convolution(
     std::size_t HTile = (HO + HoPerTile - 1) / HoPerTile;
     std::size_t WTile = (WO + WoPerTile - 1) / WoPerTile;

-    Tensor<T> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile});
-    Tensor<T> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile});
-    Tensor<T> wei_transform({K, C, HiPerTile, WiPerTile});
-    Tensor<T> out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile});
-    Tensor<T> out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile});
+    Tensor<double> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile});
+    Tensor<double> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile});
+    Tensor<double> wei_transform({K, C, HiPerTile, WiPerTile});
+    Tensor<double> out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile});
+    Tensor<double> out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile});

     auto f_in_hold = [&](auto n, auto c, auto htile, auto wtile) {
         for(int j = 0; j < HiPerTile; ++j)
@@ -259,49 +241,61 @@ void host_winograd_3x3_convolution(
...
@@ -259,49 +241,61 @@ void host_winograd_3x3_convolution(
};
};
auto
f_wei_transform
=
[
&
](
auto
k
,
auto
c
)
{
auto
f_wei_transform
=
[
&
](
auto
k
,
auto
c
)
{
wei_transform
(
k
,
c
,
0
,
0
)
=
wei_kcyx
(
k
,
c
,
0
,
0
);
wei_transform
(
k
,
c
,
0
,
0
)
=
double
(
wei_kcyx
(
k
,
c
,
0
,
0
));
wei_transform
(
k
,
c
,
0
,
1
)
=
wei_transform
(
k
,
c
,
0
,
1
)
=
0.5
*
double
(
wei_kcyx
(
k
,
c
,
0
,
0
))
+
0.5
*
wei_kcyx
(
k
,
c
,
0
,
0
)
+
0.5
*
wei_kcyx
(
k
,
c
,
0
,
1
)
+
0.5
*
wei_kcyx
(
k
,
c
,
0
,
2
);
0.5
*
double
(
wei_kcyx
(
k
,
c
,
0
,
1
))
+
wei_transform
(
k
,
c
,
0
,
2
)
=
0.5
*
double
(
wei_kcyx
(
k
,
c
,
0
,
2
));
0.5
*
wei_kcyx
(
k
,
c
,
0
,
0
)
-
0.5
*
wei_kcyx
(
k
,
c
,
0
,
1
)
+
0.5
*
wei_kcyx
(
k
,
c
,
0
,
2
);
wei_transform
(
k
,
c
,
0
,
2
)
=
0.5
*
double
(
wei_kcyx
(
k
,
c
,
0
,
0
))
-
wei_transform
(
k
,
c
,
0
,
3
)
=
wei_kcyx
(
k
,
c
,
0
,
2
);
0.5
*
double
(
wei_kcyx
(
k
,
c
,
0
,
1
))
+
0.5
*
double
(
wei_kcyx
(
k
,
c
,
0
,
2
));
wei_transform
(
k
,
c
,
1
,
0
)
=
wei_transform
(
k
,
c
,
0
,
3
)
=
double
(
wei_kcyx
(
k
,
c
,
0
,
2
));
0.5
*
wei_kcyx
(
k
,
c
,
0
,
0
)
+
0.5
*
wei_kcyx
(
k
,
c
,
1
,
0
)
+
0.5
*
wei_kcyx
(
k
,
c
,
2
,
0
);
wei_transform
(
k
,
c
,
1
,
1
)
=
0.25
*
wei_kcyx
(
k
,
c
,
0
,
0
)
+
0.25
*
wei_kcyx
(
k
,
c
,
0
,
1
)
+
wei_transform
(
k
,
c
,
1
,
0
)
=
0.5
*
double
(
wei_kcyx
(
k
,
c
,
0
,
0
))
+
0.25
*
wei_kcyx
(
k
,
c
,
0
,
2
)
+
0.25
*
wei_kcyx
(
k
,
c
,
1
,
0
)
+
0.5
*
double
(
wei_kcyx
(
k
,
c
,
1
,
0
))
+
0.25
*
wei_kcyx
(
k
,
c
,
1
,
1
)
+
0.25
*
wei_kcyx
(
k
,
c
,
1
,
2
)
+
0.5
*
double
(
wei_kcyx
(
k
,
c
,
2
,
0
));
0.25
*
wei_kcyx
(
k
,
c
,
2
,
0
)
+
0.25
*
wei_kcyx
(
k
,
c
,
2
,
1
)
+
wei_transform
(
k
,
c
,
1
,
1
)
=
0.25
*
wei_kcyx
(
k
,
c
,
2
,
2
);
0.25
*
double
(
wei_kcyx
(
k
,
c
,
0
,
0
))
+
0.25
*
double
(
wei_kcyx
(
k
,
c
,
0
,
1
))
+
wei_transform
(
k
,
c
,
1
,
2
)
=
0.25
*
wei_kcyx
(
k
,
c
,
0
,
0
)
-
0.25
*
wei_kcyx
(
k
,
c
,
0
,
1
)
+
0.25
*
double
(
wei_kcyx
(
k
,
c
,
0
,
2
))
+
0.25
*
double
(
wei_kcyx
(
k
,
c
,
1
,
0
))
+
0.25
*
wei_kcyx
(
k
,
c
,
0
,
2
)
+
0.25
*
wei_kcyx
(
k
,
c
,
1
,
0
)
-
0.25
*
double
(
wei_kcyx
(
k
,
c
,
1
,
1
))
+
0.25
*
double
(
wei_kcyx
(
k
,
c
,
1
,
2
))
+
0.25
*
wei_kcyx
(
k
,
c
,
1
,
1
)
+
0.25
*
wei_kcyx
(
k
,
c
,
1
,
2
)
+
0.25
*
double
(
wei_kcyx
(
k
,
c
,
2
,
0
))
+
0.25
*
double
(
wei_kcyx
(
k
,
c
,
2
,
1
))
+
0.25
*
wei_kcyx
(
k
,
c
,
2
,
0
)
-
0.25
*
wei_kcyx
(
k
,
c
,
2
,
1
)
+
0.25
*
double
(
wei_kcyx
(
k
,
c
,
2
,
2
));
0.25
*
wei_kcyx
(
k
,
c
,
2
,
2
);
wei_transform
(
k
,
c
,
1
,
2
)
=
wei_transform
(
k
,
c
,
1
,
3
)
=
0.25
*
double
(
wei_kcyx
(
k
,
c
,
0
,
0
))
-
0.25
*
double
(
wei_kcyx
(
k
,
c
,
0
,
1
))
+
0.5
*
wei_kcyx
(
k
,
c
,
0
,
2
)
+
0.5
*
wei_kcyx
(
k
,
c
,
1
,
2
)
+
0.5
*
wei_kcyx
(
k
,
c
,
2
,
2
);
0.25
*
double
(
wei_kcyx
(
k
,
c
,
0
,
2
))
+
0.25
*
double
(
wei_kcyx
(
k
,
c
,
1
,
0
))
-
0.25
*
double
(
wei_kcyx
(
k
,
c
,
1
,
1
))
+
0.25
*
double
(
wei_kcyx
(
k
,
c
,
1
,
2
))
+
wei_transform
(
k
,
c
,
2
,
0
)
=
0.25
*
double
(
wei_kcyx
(
k
,
c
,
2
,
0
))
-
0.25
*
double
(
wei_kcyx
(
k
,
c
,
2
,
1
))
+
0.5
*
wei_kcyx
(
k
,
c
,
0
,
0
)
-
0.5
*
wei_kcyx
(
k
,
c
,
1
,
0
)
+
0.5
*
wei_kcyx
(
k
,
c
,
2
,
0
);
0.25
*
double
(
wei_kcyx
(
k
,
c
,
2
,
2
));
wei_transform
(
k
,
c
,
2
,
1
)
=
0.25
*
wei_kcyx
(
k
,
c
,
0
,
0
)
+
0.25
*
wei_kcyx
(
k
,
c
,
0
,
1
)
+
wei_transform
(
k
,
c
,
1
,
3
)
=
0.5
*
double
(
wei_kcyx
(
k
,
c
,
0
,
2
))
+
0.25
*
wei_kcyx
(
k
,
c
,
0
,
2
)
-
0.25
*
wei_kcyx
(
k
,
c
,
1
,
0
)
-
0.5
*
double
(
wei_kcyx
(
k
,
c
,
1
,
2
))
+
0.25
*
wei_kcyx
(
k
,
c
,
1
,
1
)
-
0.25
*
wei_kcyx
(
k
,
c
,
1
,
2
)
+
0.5
*
double
(
wei_kcyx
(
k
,
c
,
2
,
2
));
0.25
*
wei_kcyx
(
k
,
c
,
2
,
0
)
+
0.25
*
wei_kcyx
(
k
,
c
,
2
,
1
)
+
0.25
*
wei_kcyx
(
k
,
c
,
2
,
2
);
wei_transform
(
k
,
c
,
2
,
0
)
=
0.5
*
double
(
wei_kcyx
(
k
,
c
,
0
,
0
))
-
wei_transform
(
k
,
c
,
2
,
2
)
=
0.25
*
wei_kcyx
(
k
,
c
,
0
,
0
)
-
0.25
*
wei_kcyx
(
k
,
c
,
0
,
1
)
+
0.5
*
double
(
wei_kcyx
(
k
,
c
,
1
,
0
))
+
0.25
*
wei_kcyx
(
k
,
c
,
0
,
2
)
-
0.25
*
wei_kcyx
(
k
,
c
,
1
,
0
)
+
0.5
*
double
(
wei_kcyx
(
k
,
c
,
2
,
0
));
0.25
*
wei_kcyx
(
k
,
c
,
1
,
1
)
-
0.25
*
wei_kcyx
(
k
,
c
,
1
,
2
)
+
wei_transform
(
k
,
c
,
2
,
1
)
=
0.25
*
wei_kcyx
(
k
,
c
,
2
,
0
)
-
0.25
*
wei_kcyx
(
k
,
c
,
2
,
1
)
+
0.25
*
double
(
wei_kcyx
(
k
,
c
,
0
,
0
))
+
0.25
*
double
(
wei_kcyx
(
k
,
c
,
0
,
1
))
+
0.25
*
wei_kcyx
(
k
,
c
,
2
,
2
);
0.25
*
double
(
wei_kcyx
(
k
,
c
,
0
,
2
))
-
0.25
*
double
(
wei_kcyx
(
k
,
c
,
1
,
0
))
-
wei_transform
(
k
,
c
,
2
,
3
)
=
0.25
*
double
(
wei_kcyx
(
k
,
c
,
1
,
1
))
-
0.25
*
double
(
wei_kcyx
(
k
,
c
,
1
,
2
))
+
0.5
*
wei_kcyx
(
k
,
c
,
0
,
2
)
-
0.5
*
wei_kcyx
(
k
,
c
,
1
,
2
)
+
0.5
*
wei_kcyx
(
k
,
c
,
2
,
2
);
0.25
*
double
(
wei_kcyx
(
k
,
c
,
2
,
0
))
+
0.25
*
double
(
wei_kcyx
(
k
,
c
,
2
,
1
))
+
0.25
*
double
(
wei_kcyx
(
k
,
c
,
2
,
2
));
wei_transform
(
k
,
c
,
3
,
0
)
=
wei_kcyx
(
k
,
c
,
2
,
0
);
wei_transform
(
k
,
c
,
2
,
2
)
=
wei_transform
(
k
,
c
,
3
,
1
)
=
0.25
*
double
(
wei_kcyx
(
k
,
c
,
0
,
0
))
-
0.25
*
double
(
wei_kcyx
(
k
,
c
,
0
,
1
))
+
0.5
*
wei_kcyx
(
k
,
c
,
2
,
0
)
+
0.5
*
wei_kcyx
(
k
,
c
,
2
,
1
)
+
0.5
*
wei_kcyx
(
k
,
c
,
2
,
2
);
0.25
*
double
(
wei_kcyx
(
k
,
c
,
0
,
2
))
-
0.25
*
double
(
wei_kcyx
(
k
,
c
,
1
,
0
))
+
wei_transform
(
k
,
c
,
3
,
2
)
=
0.25
*
double
(
wei_kcyx
(
k
,
c
,
1
,
1
))
-
0.25
*
double
(
wei_kcyx
(
k
,
c
,
1
,
2
))
+
0.5
*
wei_kcyx
(
k
,
c
,
2
,
0
)
-
0.5
*
wei_kcyx
(
k
,
c
,
2
,
1
)
+
0.5
*
wei_kcyx
(
k
,
c
,
2
,
2
);
0.25
*
double
(
wei_kcyx
(
k
,
c
,
2
,
0
))
-
0.25
*
double
(
wei_kcyx
(
k
,
c
,
2
,
1
))
+
wei_transform
(
k
,
c
,
3
,
3
)
=
wei_kcyx
(
k
,
c
,
2
,
2
);
0.25
*
double
(
wei_kcyx
(
k
,
c
,
2
,
2
));
wei_transform
(
k
,
c
,
2
,
3
)
=
0.5
*
double
(
wei_kcyx
(
k
,
c
,
0
,
2
))
-
0.5
*
double
(
wei_kcyx
(
k
,
c
,
1
,
2
))
+
0.5
*
double
(
wei_kcyx
(
k
,
c
,
2
,
2
));
wei_transform
(
k
,
c
,
3
,
0
)
=
double
(
wei_kcyx
(
k
,
c
,
2
,
0
));
wei_transform
(
k
,
c
,
3
,
1
)
=
0.5
*
double
(
wei_kcyx
(
k
,
c
,
2
,
0
))
+
0.5
*
double
(
wei_kcyx
(
k
,
c
,
2
,
1
))
+
0.5
*
double
(
wei_kcyx
(
k
,
c
,
2
,
2
));
wei_transform
(
k
,
c
,
3
,
2
)
=
0.5
*
double
(
wei_kcyx
(
k
,
c
,
2
,
0
))
-
0.5
*
double
(
wei_kcyx
(
k
,
c
,
2
,
1
))
+
0.5
*
double
(
wei_kcyx
(
k
,
c
,
2
,
2
));
wei_transform
(
k
,
c
,
3
,
3
)
=
double
(
wei_kcyx
(
k
,
c
,
2
,
2
));
};
};
auto
f_out_transform
=
[
&
](
auto
n
,
auto
k
,
auto
htile
,
auto
wtile
)
{
auto
f_out_transform
=
[
&
](
auto
n
,
auto
k
,
auto
htile
,
auto
wtile
)
{
...
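For orientation (this identification is mine; the code doesn't name it): the coefficients in f_wei_transform are the standard Winograd F(2x2, 3x3) weight transform, which maps each 3x3 kernel slice g to a 4x4 tile U:

    U = G\,g\,G^{\mathsf{T}},\qquad
    G = \begin{pmatrix}
          1            & 0             & 0            \\
          \tfrac{1}{2} & \tfrac{1}{2}  & \tfrac{1}{2} \\
          \tfrac{1}{2} & -\tfrac{1}{2} & \tfrac{1}{2} \\
          0            & 0             & 1
        \end{pmatrix}

Row i of G generates the i-th row pattern above (e.g. row 1 gives the 0.5 * (w0j + w1j + w2j) combinations), so the diff changes only the precision of the arithmetic, not the transform itself.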
@@ -372,20 +366,25 @@ void host_winograd_3x3_convolution(
 template <class T>
 void check_error(const Tensor<T>& ref, const Tensor<T>& result)
 {
+    // printf("\n");
     float error      = 0;
     float max_diff   = -1;
     float ref_value = 0, result_value = 0;
     for(int i = 0; i < ref.mData.size(); ++i)
     {
-        error += std::abs(ref.mData[i] - result.mData[i]);
-        float diff = std::abs(ref.mData[i] - result.mData[i]);
+        error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
+        float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
         if(max_diff < diff)
         {
             max_diff     = diff;
             ref_value    = ref.mData[i];
             result_value = result.mData[i];
         }
+        // printf("{%f, %f}", double(ref.mData[i]), double(result.mData[i]));
     }
+    // printf("\n");

     std::cout << "error: " << error << std::endl;
     std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
...
@@ -406,13 +405,13 @@ int main(int argc, char* argv[])
     constexpr unsigned WPad = 0;
 #elif 1
     // 3x3, 34x34
     constexpr unsigned N  = 64;
     constexpr unsigned C  = 256;
     constexpr unsigned HI = 34;
     constexpr unsigned WI = 34;
     constexpr unsigned K  = 64;
     constexpr unsigned Y  = 3;
     constexpr unsigned X  = 3;

     constexpr unsigned HPad = 0;
     constexpr unsigned WPad = 0;
...
@@ -603,16 +602,22 @@ int main(int argc, char* argv[])
     in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
     wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
 #elif 1
-    in_nchw.GenerateTensorValue(GeneratorTensor_2{-2, 2}, num_thread);
-    wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+    in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
+
+    auto gen_wei = [](auto... is) {
+        return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
+    };
+    wei_kcyx.GenerateTensorValue(gen_wei, num_thread);
 #endif
     }

 #if 1
 #if 0
     device_direct_convolution_1
-#elif 1
+#elif 0
     device_direct_convolution_2_nchw_kcyx_nkhw
+#elif 1
+    device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
 #elif 0
     device_implicit_gemm_convolution_1_chwn_cyxk_khwn
 #elif 0
...
@@ -634,7 +639,6 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
-#if 1
         if(Y == 3 && X == 3)
         {
             host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads);
 ...
@@ -644,7 +648,6 @@ int main(int argc, char* argv[])
             host_direct_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads);
         }

         check_error(out_nkhw_host, out_nkhw_device);
-#endif

 #if 0
         LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
 ...
src/include/blockwise_2d_tensor_op.hip.hpp (view file @ 4f0fc72e)

...
@@ -373,7 +373,7 @@ template <unsigned BlockSize,
           unsigned DataPerRead>
 struct Blockwise2dTensorCopy3
 {
-    using vector_t = typename vector_type<Float, DataPerRead>::type;
+    using vector_t = typename vector_type<Float, DataPerRead>::VectorType;

     unsigned mSrcMyThreadOffset;
     unsigned mDstMyThreadOffset;
 ...
src/include/blockwise_4d_tensor_op.hip.hpp (view file @ 4f0fc72e)

...
@@ -207,9 +207,9 @@ template <unsigned BlockSize,
           unsigned DataPerRead>
 struct Blockwise4dTensorCopy1
 {
-    using vector_t = typename vector_type<Float, DataPerRead>::type;
+    using vector_t = typename vector_type<Float, DataPerRead>::VectorType;

-    __device__ void SanityCheck() const
+    __device__ constexpr Blockwise4dTensorCopy1()
     {
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};
 ...
@@ -239,8 +239,6 @@ struct Blockwise4dTensorCopy1
     __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
     {
-        SanityCheck();
-
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};
         constexpr auto I2 = Number<2>{};
 ...
@@ -446,7 +444,7 @@ template <unsigned BlockSize,
           unsigned DataPerRead>
 struct Blockwise4dTensorCopy3
 {
-    using vector_t = typename vector_type<Float, DataPerRead>::type;
+    using vector_t = typename vector_type<Float, DataPerRead>::VectorType;

     unsigned mSrcMyThreadOffset;
     unsigned mDstMyThreadOffset;
 ...
src/include/common.hip.hpp (view file @ 4f0fc72e)

...
@@ -28,44 +28,44 @@ struct vector_type
 template <>
 struct vector_type<float, 1>
 {
-    using type = float;
+    using VectorType = float;
 };

 template <>
 struct vector_type<float, 2>
 {
-    using type = float2;
+    using VectorType = float2;
 };

 template <>
 struct vector_type<float, 4>
 {
-    using type = float4;
+    using VectorType = float4;
 };

 #if 0
 template <>
 struct vector_type<half_float::half, 1>
 {
-    using type = half_float::half;
+    using VectorType = half_float::half;
 };

 template <>
 struct vector_type<half_float::half, 2>
 {
-    using type = float;
+    using VectorType = float;
 };

 template <>
 struct vector_type<half_float::half, 4>
 {
-    using type = float2;
+    using VectorType = float2;
 };

 template <>
 struct vector_type<half_float::half, 8>
 {
-    using type = float4;
+    using VectorType = float4;
 };
 #endif
 ...
@@ -73,25 +73,41 @@ struct vector_type<half_float::half, 8>
 template <>
 struct vector_type<half, 1>
 {
-    using type = half;
+    using VectorType = half;
+
+    __host__ __device__ static VectorType pack(half s) { return s; }
 };

 template <>
 struct vector_type<half, 2>
 {
-    using type = half2;
+    using VectorType = half2;
+
+    union Data
+    {
+        VectorType vector;
+        half scalar[2];
+    };
+
+    __host__ __device__ static VectorType pack(half s0, half s1)
+    {
+        Data data;
+        data.scalar[0] = s0;
+        data.scalar[1] = s1;
+        return data.vector;
+    }
 };

 template <>
 struct vector_type<half, 4>
 {
-    using type = float2;
+    using VectorType = float2;
 };

 template <>
 struct vector_type<half, 8>
 {
-    using type = float4;
+    using VectorType = float4;
 };
 #endif
 ...
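The new pack() helpers are what the vectorized driver calls on the host. A minimal usage sketch (assuming HIP's half/half2 types and a host-side harness, which are not part of this diff):

    // build a half2 from two fp16 scalars via the union added above; this is
    // a bit-level concatenation, no numeric conversion happens
    half  lo = half(1.0f);
    half  hi = half(2.0f);
    half2 v  = vector_type<half, 2>::pack(lo, hi); // v carries lo, hi in order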
...
src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp
View file @
4f0fc72e
...
@@ -25,10 +25,10 @@ template <class Float,
...
@@ -25,10 +25,10 @@ template <class Float,
unsigned
WeiBlockCopyDataPerRead
,
unsigned
WeiBlockCopyDataPerRead
,
unsigned
BlockSize
,
unsigned
BlockSize
,
unsigned
GridSize
>
unsigned
GridSize
>
__global__
void
gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw
(
__global__
void
const
Float
*
const
__restrict__
p_in_global
,
gridwise_direct_convolution_2_nchw_kcyx_nkhw
(
const
Float
*
const
__restrict__
p_in_global
,
const
Float
*
const
__restrict__
p_wei_global
,
const
Float
*
const
__restrict__
p_wei_global
,
Float
*
const
__restrict__
p_out_global
)
Float
*
const
__restrict__
p_out_global
)
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
...
...
src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp (view file @ 4f0fc72e)

...
@@ -11,6 +11,7 @@ template <class Float,
           class InGlobalDesc,
           class WeiGlobalDesc,
           class OutGlobalDesc,
+          unsigned ScalarPerVector,
           unsigned NPerBlock,
           unsigned KPerBlock,
           unsigned CPerBlock,
 ...
@@ -26,47 +27,50 @@ template <class Float,
           unsigned BlockSize,
           unsigned GridSize>
 __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
-    const Float* const __restrict__ p_in_global,
-    const Float* const __restrict__ p_wei_global,
+    const typename vector_type<Float, ScalarPerVector>::VectorType* const __restrict__ p_in_global,
+    const typename vector_type<Float, ScalarPerVector>::VectorType* const __restrict__ p_wei_global,
     Float* const __restrict__ p_out_global)
 {
+    using scalar_t = Float;
+    using vector_t = typename vector_type<scalar_t, ScalarPerVector>::VectorType;
+
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
     constexpr auto I2 = Number<2>{};
     constexpr auto I3 = Number<3>{};

-    constexpr auto in_nchw_global_desc  = InGlobalDesc{};
-    constexpr auto wei_kcyx_global_desc = WeiGlobalDesc{};
+    constexpr auto in_nchw_vec_global_desc  = InGlobalDesc{};
+    constexpr auto wei_kcyx_vec_global_desc = WeiGlobalDesc{};
     constexpr auto out_nkhw_global_desc = OutGlobalDesc{};

-    constexpr unsigned N = in_nchw_global_desc.GetLength(I0);
-    constexpr unsigned K = wei_kcyx_global_desc.GetLength(I0);
-    constexpr unsigned C = wei_kcyx_global_desc.GetLength(I1);
-    constexpr unsigned Y = wei_kcyx_global_desc.GetLength(I2);
-    constexpr unsigned X = wei_kcyx_global_desc.GetLength(I3);
+    constexpr unsigned N = in_nchw_vec_global_desc.GetLength(I0);
+    constexpr unsigned K = wei_kcyx_vec_global_desc.GetLength(I0);
+    constexpr unsigned C = wei_kcyx_vec_global_desc.GetLength(I1);
+    constexpr unsigned Y = wei_kcyx_vec_global_desc.GetLength(I2);
+    constexpr unsigned X = wei_kcyx_vec_global_desc.GetLength(I3);

-    constexpr auto wei_ke_global_desc = make_ConstantTensorDescriptor(
-        Sequence<K, C * Y * X>{}); // 2d view of wei for blockwise copy
+    constexpr auto wei_ke_vec_global_desc = make_ConstantTensorDescriptor(
+        Sequence<K, C * Y * X>{}); // 2d view of wei for blockwise copy

     constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
     constexpr unsigned WiPerBlock = WoPerBlock + X - 1;

-    constexpr auto in_nchw_block_desc = make_ConstantTensorDescriptor_aligned(
+    constexpr auto in_nchw_vec_block_desc = make_ConstantTensorDescriptor_aligned(
         Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, Number<InBlockCopyDataPerRead>{});

-    constexpr auto wei_ke_block_desc = make_ConstantTensorDescriptor_aligned(
+    constexpr auto wei_ke_vec_block_desc = make_ConstantTensorDescriptor_aligned(
         Sequence<KPerBlock, CPerBlock * Y * X>{},
         Number<WeiBlockCopyDataPerRead>{}); // 2d view of wei for blockwise copy

-    constexpr auto wei_kcyx_block_desc =
+    constexpr auto wei_kcyx_vec_block_desc =
         make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, Y, X>{},
-                                      Sequence<wei_ke_block_desc.GetStride(I0), Y * X, X, 1>{});
+                                      Sequence<wei_ke_vec_block_desc.GetStride(I0), Y * X, X, 1>{});

     // shared mem
     constexpr unsigned in_block_size =
-        in_nchw_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
+        in_nchw_vec_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});

     constexpr unsigned wei_block_size =
-        wei_kcyx_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
+        wei_kcyx_vec_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});

     constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
                                        ? InBlockCopyDataPerRead
 ...
@@ -81,10 +85,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
     constexpr auto in_nchw_thread_block_desc =
         make_ConstantTensorDescriptor(Sequence<NPerThread, CPerThread, HiPerThread, WiPerThread>{},
-                                      in_nchw_block_desc.GetStrides());
+                                      in_nchw_vec_block_desc.GetStrides());

     constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor(
-        Sequence<KPerThread, CPerThread, Y, X>{}, wei_kcyx_block_desc.GetStrides());
+        Sequence<KPerThread, CPerThread, Y, X>{}, wei_kcyx_vec_block_desc.GetStrides());

     constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor(
         in_nchw_thread_block_desc, wei_kcyx_thread_block_desc);
 ...
@@ -147,26 +151,27 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
     constexpr auto blockwise_in_copy =
         Blockwise4dTensorCopy1<BlockSize,
                                Float,
-                               decltype(in_nchw_global_desc),
-                               decltype(in_nchw_block_desc),
-                               decltype(in_nchw_block_desc.GetLengths()),
+                               decltype(in_nchw_vec_global_desc),
+                               decltype(in_nchw_vec_block_desc),
+                               decltype(in_nchw_vec_block_desc.GetLengths()),
                                InBlockCopyDataPerRead>{};

 #if 0
     constexpr auto blockwise_wei_copy =
         Blockwise4dTensorCopy1<BlockSize,
                                Float,
-                               decltype(wei_kcyx_global_desc),
-                               decltype(wei_kcyx_block_desc),
-                               decltype(wei_kcyx_block_desc.GetLengths()),
+                               decltype(wei_kcyx_vec_global_desc),
+                               decltype(wei_kcyx_vec_block_desc),
+                               decltype(wei_kcyx_vec_block_desc.GetLengths()),
                                1>{};
 #elif 1
     const auto blockwise_wei_copy =
         Blockwise2dTensorCopy3<BlockSize,
                                Float,
-                               decltype(wei_ke_global_desc),
-                               decltype(wei_ke_block_desc),
-                               decltype(wei_ke_block_desc.GetLengths()),
+                               decltype(wei_ke_vec_global_desc),
+                               decltype(wei_ke_vec_block_desc),
+                               decltype(wei_ke_vec_block_desc.GetLengths()),
                                WeiBlockCopyDataPerRead>{};
 #endif

     // set threadwise output tensor to 0
 ...
@@ -176,14 +181,14 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
         c_block_data_begin += CPerBlock, __syncthreads())
     {
         // copy input tensor to LDS
-        blockwise_in_copy.Run(p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin,
+        blockwise_in_copy.Run(p_in_global + in_nchw_vec_global_desc.Get1dIndex(n_block_data_begin,
                                                                            c_block_data_begin,
                                                                            hi_block_data_begin,
                                                                            wi_block_data_begin),
                               p_in_block);

         // copy weight tensor to LDS
-        blockwise_wei_copy.Run(p_wei_global + wei_kcyx_global_desc.Get1dIndex(
+        blockwise_wei_copy.Run(p_wei_global + wei_kcyx_vec_global_desc.Get1dIndex(
                                                   k_block_data_begin, c_block_data_begin, 0, 0),
                                p_wei_block);
 ...
@@ -195,25 +200,25 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
 #if 1
         threadwise_direct_convolution_2(
             in_nchw_thread_block_desc,
-            p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
+            p_in_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
                                                        c_thread_data,
                                                        hi_thread_data_begin,
                                                        wi_thread_data_begin),
             wei_kcyx_thread_block_desc,
             p_wei_block +
-                wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
+                wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
             out_nkhw_thread_desc,
             p_out_thread);
 #elif 0
         threadwise_direct_convolution_3(
             in_nchw_thread_block_desc,
-            p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
+            p_in_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
                                                        c_thread_data,
                                                        hi_thread_data_begin,
                                                        wi_thread_data_begin),
             wei_kcyx_thread_block_desc,
             p_wei_block +
-                wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
+                wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
             out_nkhw_thread_desc,
             p_out_thread);
 #endif
 ...
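The net effect of the new ScalarPerVector template parameter is on the kernel's pointer types: input and weight now point at the packed vector type while the output stays scalar. My illustration, not code from the diff:

    // with Float = half and ScalarPerVector = 2, the signature above resolves to
    //     const half2* const __restrict__ p_in_global,
    //     const half2* const __restrict__ p_wei_global,
    //     half* const __restrict__        p_out_global
    // so each global read of input/weight data moves two packed fp16 values,
    // matching the half2 tensors the new driver uploads
    using in_global_ptr_t = const typename vector_type<half, 2>::VectorType*; // = const half2*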