Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
9a2744a6
Commit
9a2744a6
authored
Mar 25, 2019
by
Chao Liu
Browse files
refactor
parent
18328e2f
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
116 additions
and
123 deletions
+116
-123
driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp
driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp
+0
-51
driver/driver.hip.cpp
driver/driver.hip.cpp
+2
-2
src/include/common.hip.hpp
src/include/common.hip.hpp
+34
-10
src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp
...idwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp
+28
-57
src/include/threadwise_gemm.hip.hpp
src/include/threadwise_gemm.hip.hpp
+52
-3
No files found.
driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp
View file @
9a2744a6
...
...
@@ -86,15 +86,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr index_t GemmNLevel1Cluster = 8;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmThreadPerColumnPerCluster = 8;
constexpr index_t GemmThreadPerRowPerCluster = 8;
constexpr index_t InBlockCopyThreadPerDim0 = 4;
constexpr index_t InBlockCopyThreadPerDim1 = 16;
constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
constexpr index_t WeiBlockCopyThreadPerDim1 = 16;
constexpr index_t InBlockCopyDataPerRead = 4;
constexpr index_t WeiBlockCopyDataPerRead = 4;
...
...
@@ -116,15 +107,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr
index_t
GemmNLevel1Cluster
=
4
;
constexpr
index_t
GemmKPerThreadLoop
=
1
;
constexpr
index_t
GemmThreadPerColumnPerCluster
=
8
;
constexpr
index_t
GemmThreadPerRowPerCluster
=
8
;
constexpr
index_t
InBlockCopyThreadPerDim0
=
4
;
constexpr
index_t
InBlockCopyThreadPerDim1
=
16
;
constexpr
index_t
WeiBlockCopyThreadPerDim0
=
4
;
constexpr
index_t
WeiBlockCopyThreadPerDim1
=
16
;
constexpr
index_t
InBlockCopyDataPerRead
=
4
;
constexpr
index_t
WeiBlockCopyDataPerRead
=
4
;
...
...
@@ -147,15 +129,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr
index_t
GemmNLevel1Cluster
=
4
;
constexpr
index_t
GemmKPerThreadLoop
=
1
;
constexpr
index_t
GemmThreadPerColumnPerCluster
=
8
;
constexpr
index_t
GemmThreadPerRowPerCluster
=
8
;
constexpr
index_t
InBlockCopyThreadPerDim0
=
4
;
constexpr
index_t
InBlockCopyThreadPerDim1
=
16
;
constexpr
index_t
WeiBlockCopyThreadPerDim0
=
4
;
constexpr
index_t
WeiBlockCopyThreadPerDim1
=
16
;
constexpr
index_t
InBlockCopyDataPerRead
=
4
;
constexpr
index_t
WeiBlockCopyDataPerRead
=
4
;
...
...
@@ -177,15 +150,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr
index_t
GemmNLevel1Cluster
=
4
;
constexpr
index_t
GemmKPerThreadLoop
=
1
;
constexpr
index_t
GemmThreadPerColumnPerCluster
=
8
;
constexpr
index_t
GemmThreadPerRowPerCluster
=
8
;
constexpr
index_t
InBlockCopyThreadPerDim0
=
4
;
constexpr
index_t
InBlockCopyThreadPerDim1
=
16
;
constexpr
index_t
WeiBlockCopyThreadPerDim0
=
4
;
constexpr
index_t
WeiBlockCopyThreadPerDim1
=
16
;
constexpr
index_t
InBlockCopyDataPerRead
=
4
;
constexpr
index_t
WeiBlockCopyDataPerRead
=
4
;
...
...
@@ -207,15 +171,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr
index_t
GemmNLevel1Cluster
=
4
;
constexpr
index_t
GemmKPerThreadLoop
=
1
;
constexpr
index_t
GemmThreadPerColumnPerCluster
=
8
;
constexpr
index_t
GemmThreadPerRowPerCluster
=
8
;
constexpr
index_t
InBlockCopyThreadPerDim0
=
4
;
constexpr
index_t
InBlockCopyThreadPerDim1
=
16
;
constexpr
index_t
WeiBlockCopyThreadPerDim0
=
4
;
constexpr
index_t
WeiBlockCopyThreadPerDim1
=
16
;
constexpr
index_t
InBlockCopyDataPerRead
=
4
;
constexpr
index_t
WeiBlockCopyDataPerRead
=
4
;
...
...
@@ -257,8 +212,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
CPerBlock
,
BPerThread
,
KPerThread
,
GemmThreadPerColumnPerCluster
,
GemmThreadPerRowPerCluster
,
GemmMPerThreadSubC
,
GemmNPerThreadSubC
,
GemmMLevel0Cluster
,
...
...
@@ -266,10 +219,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
GemmMLevel1Cluster
,
GemmNLevel1Cluster
,
GemmKPerThreadLoop
,
InBlockCopyThreadPerDim0
,
InBlockCopyThreadPerDim1
,
WeiBlockCopyThreadPerDim0
,
WeiBlockCopyThreadPerDim1
,
InBlockCopyDataPerRead
,
WeiBlockCopyDataPerRead
>
,
dim3
(
GridSize
),
...
...
driver/driver.hip.cpp
View file @
9a2744a6
...
...
@@ -593,9 +593,9 @@ int main(int argc, char* argv[])
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif 1
// 1x1 filter, 14x14 image, C = 12
8
// 1x1 filter, 14x14 image, C =
5
12
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
12
8
;
constexpr
index_t
C
=
5
12
;
constexpr
index_t
HI
=
14
;
constexpr
index_t
WI
=
14
;
constexpr
index_t
K
=
512
;
...
...
src/include/common.hip.hpp
View file @
9a2744a6
...
...
@@ -21,21 +21,45 @@ struct is_same<T, T>
static
const
bool
value
=
true
;
};
#if DEVICE_BACKEND_CUDA
template
<
typename
T
>
__host__
__device__
constexpr
T
max
(
T
a
,
T
b
)
__host__
__device__
constexpr
index_t
integer_divide_ceil
(
index_t
a
,
index_t
b
)
{
return
a
>
b
?
a
:
b
;
return
(
a
+
b
-
1
)
/
b
;
}
template
<
typename
T
>
__host__
__device__
constexpr
T
min
(
T
a
,
T
b
)
namespace
mod_conv
{
template
<
class
T
>
__host__
__device__
constexpr
T
max
(
T
x
,
T
y
)
{
return
a
<
b
?
a
:
b
;
return
x
>
y
?
x
:
y
;
}
#endif
__host__
__device__
constexpr
index_t
integer_divide_ceil
(
index_t
a
,
index_t
b
)
template
<
class
T
,
class
...
Ts
>
__host__
__device__
constexpr
T
max
(
T
x
,
Ts
...
xs
)
{
return
(
a
+
b
-
1
)
/
b
;
static_assert
(
sizeof
...(
xs
)
>
0
,
"not enough argument"
);
auto
y
=
max
(
xs
...);
static_assert
(
is_same
<
decltype
(
y
),
T
>::
value
,
"not the same type"
);
return
x
>
y
?
x
:
y
;
}
template
<
class
T
>
__host__
__device__
constexpr
T
min
(
T
x
,
T
y
)
{
return
x
<
y
?
x
:
y
;
}
template
<
class
T
,
class
...
Ts
>
__host__
__device__
constexpr
T
min
(
T
x
,
Ts
...
xs
)
{
static_assert
(
sizeof
...(
xs
)
>
0
,
"not enough argument"
);
auto
y
=
min
(
xs
...);
static_assert
(
is_same
<
decltype
(
y
),
T
>::
value
,
"not the same type"
);
return
x
<
y
?
x
:
y
;
}
}
src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp
View file @
9a2744a6
...
...
@@ -19,8 +19,6 @@ template <index_t GridSize,
index_t
CPerBlock
,
index_t
BPerThread
,
index_t
KPerThread
,
index_t
GemmThreadPerColumnPerCluster
,
index_t
GemmThreadPerRowPerCluster
,
index_t
GemmMPerThreadSubC
,
index_t
GemmNPerThreadSubC
,
index_t
GemmMLevel0Cluster
,
...
...
@@ -28,10 +26,6 @@ template <index_t GridSize,
index_t
GemmMLevel1Cluster
,
index_t
GemmNLevel1Cluster
,
index_t
GemmKPerThreadLoop
,
index_t
InBlockCopyThreadPerDim0
,
index_t
InBlockCopyThreadPerDim1
,
index_t
WeiBlockCopyThreadPerDim0
,
index_t
WeiBlockCopyThreadPerDim1
,
index_t
InBlockCopyDataPerRead
,
index_t
WeiBlockCopyDataPerRead
>
__global__
void
...
...
@@ -111,57 +105,23 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
}
#endif
// blockwise in copy
// formmat is [CPerBlock,BPerBlock + BGhostRead]
#if 0
const auto blockwise_in_copy =
Blockwise2dTensorCopy1<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths())>{};
#elif
0
const
auto
blockwise_in_copy
=
Blockwise2dTensorCopy2
<
BlockSize
,
Float
,
decltype
(
in_cb_global_desc
),
decltype
(
in_cb_block_desc
),
decltype
(
in_cb_block_desc
.
GetLengths
()),
InBlockCopyThreadPerDim0
,
InBlockCopyThreadPerDim1
>
{};
#elif 1
// blockwise in copy
// formmat is [CPerBlock,BPerBlock + BGhostRead]
const
auto
blockwise_in_copy
=
Blockwise2dTensorCopy3
<
BlockSize
,
Float
,
decltype
(
in_cb_global_desc
),
decltype
(
in_cb_block_desc
),
decltype
(
in_cb_block_desc
.
GetLengths
()),
InBlockCopyDataPerRead
>
{};
#endif
// blockwise wei copy
// format is [CPerBlock*Y*X,KPerBlock]
#if 0
const auto blockwise_wei_copy =
Blockwise2dTensorCopy1<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths())>{};
#elif
0
const
auto
blockwise_wei_copy
=
Blockwise2dTensorCopy2
<
BlockSize
,
Float
,
decltype
(
wei_ek_global_desc
),
decltype
(
wei_ek_block_desc
),
decltype
(
wei_ek_block_desc
.
GetLengths
()),
WeiBlockCopyThreadPerDim0
,
WeiBlockCopyThreadPerDim1
>
{};
#elif 1
// blockwise wei copy
// format is [CPerBlock*Y*X,KPerBlock]
const
auto
blockwise_wei_copy
=
Blockwise2dTensorCopy3
<
BlockSize
,
Float
,
decltype
(
wei_ek_global_desc
),
decltype
(
wei_ek_block_desc
),
decltype
(
wei_ek_block_desc
.
GetLengths
()),
WeiBlockCopyDataPerRead
>
{};
#endif
// a series of blockwise GEMM
// c_mtx += transpose(a_mtx) * b_mtx
...
...
@@ -198,16 +158,27 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
constexpr
index_t
wei_block_size
=
wei_cyxk_block_desc
.
GetElementSpace
(
Number
<
WeiBlockCopyDataPerRead
>
{});
constexpr
index_t
max_align
=
InBlockCopyDataPerRead
>
WeiBlockCopyDataPerRead
?
InBlockCopyDataPerRead
:
WeiBlockCopyDataPerRead
;
constexpr
index_t
max_align
=
mod_conv
::
max
(
index_t
(
4
),
InBlockCopyDataPerRead
,
WeiBlockCopyDataPerRead
);
// LDS
// LDS
#if 1
__shared__
Float
p_in_block
[
max_align
*
((
in_block_size
+
max_align
-
1
)
/
max_align
)];
__shared__
Float
p_wei_block
[
max_align
*
((
wei_block_size
+
max_align
-
1
)
/
max_align
)];
#if 1
const
Float
*
p_lds_begin
=
p_in_block
<
p_wei_block
?
p_in_block
:
p_wei_block
;
const
Float
*
const
p_lds_begin
=
p_in_block
<
p_wei_block
?
p_in_block
:
p_wei_block
;
#else // debug
constexpr
index_t
lds_byte
=
8
*
1024
;
constexpr
index_t
in_block_space
=
max_align
*
((
in_block_size
+
max_align
-
1
)
/
max_align
);
constexpr
index_t
wei_block_space
=
max_align
*
((
wei_block_size
+
max_align
-
1
)
/
max_align
);
static_assert
(
lds_byte
>=
(
in_block_space
+
wei_block_space
)
*
sizeof
(
Float
),
"lds allocation not enough"
);
__shared__
Float
p_lds_begin
[
lds_byte
/
sizeof
(
Float
)];
Float
*
const
p_in_block
=
p_lds_begin
;
Float
*
const
p_wei_block
=
p_lds_begin
+
in_block_space
;
#endif
const
Float
*
p_in_global_block_offset
=
...
...
@@ -240,18 +211,18 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
{
auto
f_accum
=
[](
auto
&
acc
,
const
auto
&&
v
)
{
acc
+=
v
;
};
#if
1
#if
0
blockwise_gemm.Run
#elif
0
blockwise_gemm
.
Run_asm
#elif
0
#elif
1
blockwise_gemm
.
Run_RegisterDoubleBuffer
#endif
(
p_wei_block
+
wei_cyxk_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
p_in_block
+
y
*
Wi
+
x
,
p_out_thread
,
f_accum
,
p_lds_begin
);
(
p_wei_block
+
wei_cyxk_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
p_in_block
+
y
*
Wi
+
x
,
p_out_thread
,
f_accum
,
p_lds_begin
);
}
}
}
...
...
src/include/threadwise_gemm.hip.hpp
View file @
9a2744a6
...
...
@@ -55,17 +55,66 @@ __device__ void threadwise_matrix_copy_v2(SrcMatrix,
#elif 1
static_assert
(
NCol
==
4
,
"only for NCol == 4"
);
using
vector_t
=
typename
vector_type
<
Float
,
4
>::
MemoryType
;
for
(
index_t
i
=
0
;
i
<
NRow
;
++
i
)
{
const
index_t
src_index
=
src_mtx
.
Get1dIndex
(
i
,
0
);
const
index_t
dst_index
=
dst_mtx
.
Get1dIndex
(
i
,
0
);
#if 1
using
vector_t
=
typename
vector_type
<
Float
,
4
>::
MemoryType
;
*
(
reinterpret_cast
<
vector_t
*>
(
p_dst
+
dst_index
))
=
*
(
reinterpret_cast
<
const
vector_t
*>
(
p_src
+
src_index
));
#elif 1
#elif 0
// ds_read_b32
asm
volatile
(
"
\n
\
ds_read_b32 %0, %1
\n
\
"
:
"=v"
(
p_dst
[
dst_index
])
:
"v"
((
uint32_t
)(
sizeof
(
Float
)
*
(
uintptr_t
)((
p_src
+
src_index
)
-
p_lds_begin
))));
asm
volatile
(
"
\n
\
ds_read_b32 %0, %1
\n
\
"
:
"=v"
(
p_dst
[
dst_index
+
1
])
:
"v"
((
uint32_t
)(
sizeof
(
Float
)
*
(
uintptr_t
)((
p_src
+
src_index
+
1
)
-
p_lds_begin
))));
asm
volatile
(
"
\n
\
ds_read_b32 %0, %1
\n
\
"
:
"=v"
(
p_dst
[
dst_index
+
2
])
:
"v"
((
uint32_t
)(
sizeof
(
Float
)
*
(
uintptr_t
)((
p_src
+
src_index
+
2
)
-
p_lds_begin
))));
asm
volatile
(
"
\n
\
ds_read_b32 %0, %1
\n
\
"
:
"=v"
(
p_dst
[
dst_index
+
3
])
:
"v"
((
uint32_t
)(
sizeof
(
Float
)
*
(
uintptr_t
)((
p_src
+
src_index
+
3
)
-
p_lds_begin
))));
#elif 0
// ds_read_b64
using
vector_t
=
typename
vector_type
<
Float
,
2
>::
MemoryType
;
asm
volatile
(
"
\n
\
ds_read_b64 %0, %1
\n
\
"
:
"=v"
(
*
(
reinterpret_cast
<
vector_t
*>
(
p_dst
+
dst_index
)))
:
"v"
((
uint32_t
)(
sizeof
(
Float
)
*
(
uintptr_t
)((
p_src
+
src_index
)
-
p_lds_begin
))));
asm
volatile
(
"
\n
\
ds_read_b64 %0, %1
\n
\
"
:
"=v"
(
*
(
reinterpret_cast
<
vector_t
*>
(
p_dst
+
dst_index
+
2
)))
:
"v"
((
uint32_t
)(
sizeof
(
Float
)
*
(
uintptr_t
)((
p_src
+
src_index
+
2
)
-
p_lds_begin
))));
#elif 0
// ds_read_b128
using
vector_t
=
typename
vector_type
<
Float
,
4
>::
MemoryType
;
asm
volatile
(
"
\n
\
ds_read_b128 %0, %1
\n
\
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment