Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
6614729a
Commit
6614729a
authored
Feb 05, 2019
by
Chao Liu
Browse files
add another version of blockwise 2d copy, refactor
parent
4b616aad
Changes
15
Show whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
462 additions
and
453 deletions
+462
-453
driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
+6
-1
src/include/blockwise_2d_tensor_op.cuh
src/include/blockwise_2d_tensor_op.cuh
+97
-77
src/include/blockwise_4d_tensor_op.cuh
src/include/blockwise_4d_tensor_op.cuh
+5
-34
src/include/blockwise_gemm.cuh
src/include/blockwise_gemm.cuh
+6
-6
src/include/gridwise_direct_convolution_1.cuh
src/include/gridwise_direct_convolution_1.cuh
+18
-18
src/include/gridwise_direct_convolution_2.cuh
src/include/gridwise_direct_convolution_2.cuh
+12
-12
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
...e/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
+27
-28
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh
...ise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh
+40
-42
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh
...gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh
+39
-41
src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh
...nclude/gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh
+25
-26
src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh
...e/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh
+21
-22
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
...e/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
+51
-40
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline.cuh
...plicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline.cuh
+50
-37
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
...e/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
+31
-33
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh
...plicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh
+34
-36
No files found.
driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
View file @
6614729a
...
...
@@ -86,6 +86,9 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
constexpr
unsigned
WeiBlockCopyThreadPerDim0
=
4
;
constexpr
unsigned
WeiBlockCopyThreadPerDim1
=
16
;
constexpr
unsigned
InBlockCopyDataPerRead
=
4
;
constexpr
unsigned
WeiBlockCopyDataPerRead
=
4
;
constexpr
unsigned
BlockSize
=
64
;
#endif
...
...
@@ -137,7 +140,9 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
InBlockCopyThreadPerDim0
,
InBlockCopyThreadPerDim1
,
WeiBlockCopyThreadPerDim0
,
WeiBlockCopyThreadPerDim1
>
WeiBlockCopyThreadPerDim1
,
InBlockCopyDataPerRead
,
WeiBlockCopyDataPerRead
>
<<<
grid_dim
,
block_dim
>>>
(
in_cnhw_desc
,
static_cast
<
T
*>
(
in_cnhw_device_buf
.
GetDeviceBuffer
()),
wei_csrk_desc
,
...
...
src/include/blockwise_2d_tensor_op.cuh
View file @
6614729a
...
...
@@ -162,9 +162,9 @@ blockwise_2d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
}
template
<
unsigned
BlockSize
,
class
Float
,
class
SrcDesc
,
class
DstDesc
,
class
SrcOpLengths
>
struct
b
lockwise
_
2d
_t
ensor
_c
opy
_
1
struct
B
lockwise2d
T
ensor
C
opy1
{
__device__
void
r
un
(
Float
*
const
__restrict__
p_src
,
Float
*
__restrict__
p_dst
)
const
__device__
void
R
un
(
Float
*
const
__restrict__
p_src
,
Float
*
__restrict__
p_dst
)
const
{
constexpr
auto
dst_from_src_reorder
=
Sequence
<
0
,
1
>
{};
...
...
@@ -173,6 +173,8 @@ struct blockwise_2d_tensor_copy_1
}
};
// need to be aligned to float4 and float2
// stride1 need to be 1 for both source and destination
template
<
unsigned
BlockSize
,
class
Float
,
class
SrcDesc
,
...
...
@@ -180,21 +182,27 @@ template <unsigned BlockSize,
class
SrcOpLengths
,
unsigned
ThreadPerDim0
,
unsigned
ThreadPerDim1
>
struct
b
lockwise
_
2d
_t
ensor
_c
opy
_
2
struct
B
lockwise2d
T
ensor
C
opy2
{
unsigned
mThreadId0
;
unsigned
mThreadId1
;
__device__
b
lockwise
_
2d
_t
ensor
_c
opy
_
2
()
__device__
B
lockwise2d
T
ensor
C
opy2
()
{
static_assert
(
is_same
<
Float
,
float
>::
value
,
"wrong! type is not float!
\n
"
);
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
static_assert
(
SrcDesc
{}.
GetStride
(
I1
)
==
1
&&
DstDesc
{}.
GetStride
(
I1
)
==
1
,
"wrong! stride is not 1!
\n
"
);
mThreadId0
=
get_thread_local_1d_id
()
/
ThreadPerDim1
;
mThreadId1
=
get_thread_local_1d_id
()
-
mThreadId0
*
ThreadPerDim1
;
}
__device__
void
r
un
(
Float
*
const
__restrict__
p_src
,
Float
*
__restrict__
p_dst
)
const
__device__
void
R
un
(
Float
*
const
__restrict__
p_src
,
Float
*
__restrict__
p_dst
)
const
{
static_assert
(
is_same
<
Float
,
float
>::
value
,
"wrong! only support float!
\n
"
);
if
(
get_thread_local_1d_id
()
>=
ThreadPerDim0
*
ThreadPerDim1
)
return
;
...
...
@@ -227,22 +235,12 @@ struct blockwise_2d_tensor_copy_2
for
(
unsigned
d1v4loop
=
0
;
d1v4loop
<
Dim1V4Loop
;
++
d1v4loop
)
{
unsigned
did1
=
d1v4loop
*
4
*
ThreadPerDim1
+
4
*
mThreadId1
;
#if 1
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
);
*
(
reinterpret_cast
<
float4
*>
(
p_dst
+
dindex
))
=
*
(
reinterpret_cast
<
float4
*>
(
p_src
+
sindex
));
#else
for
(
unsigned
i
=
0
;
i
<
4
;
++
i
)
{
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
+
i
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
+
i
);
p_dst
[
dindex
]
=
p_src
[
sindex
];
}
#endif
}
// v2
...
...
@@ -251,22 +249,11 @@ struct blockwise_2d_tensor_copy_2
unsigned
did1
=
Dim1V4Loop
*
4
*
ThreadPerDim1
+
d1v2loop
*
2
*
ThreadPerDim1
+
2
*
mThreadId1
;
#if 1
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
);
*
(
reinterpret_cast
<
float2
*>
(
p_dst
+
dindex
))
=
*
(
reinterpret_cast
<
float2
*>
(
p_src
+
sindex
));
#else
for
(
unsigned
i
=
0
;
i
<
2
;
++
i
)
{
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
+
i
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
+
i
);
p_dst
[
dindex
]
=
p_src
[
sindex
];
}
#endif
}
// v1
...
...
@@ -310,22 +297,11 @@ struct blockwise_2d_tensor_copy_2
{
unsigned
did1
=
d1v4loop
*
4
*
ThreadPerDim1
+
4
*
mThreadId1
;
#if 1
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
);
*
(
reinterpret_cast
<
float4
*>
(
p_dst
+
dindex
))
=
*
(
reinterpret_cast
<
float4
*>
(
p_src
+
sindex
));
#else
for
(
unsigned
i
=
0
;
i
<
4
;
++
i
)
{
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
+
i
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
+
i
);
p_dst
[
dindex
]
=
p_src
[
sindex
];
}
#endif
}
// v2
...
...
@@ -334,22 +310,11 @@ struct blockwise_2d_tensor_copy_2
unsigned
did1
=
Dim1V4Loop
*
4
*
ThreadPerDim1
+
d1v2loop
*
2
*
ThreadPerDim1
+
2
*
mThreadId1
;
#if 1
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
);
*
(
reinterpret_cast
<
float2
*>
(
p_dst
+
dindex
))
=
*
(
reinterpret_cast
<
float2
*>
(
p_src
+
sindex
));
#else
for
(
unsigned
i
=
0
;
i
<
2
;
++
i
)
{
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
+
i
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
+
i
);
p_dst
[
dindex
]
=
p_src
[
sindex
];
}
#endif
}
// v1
...
...
@@ -385,49 +350,104 @@ struct blockwise_2d_tensor_copy_2
}
};
template
<
unsigned
BlockSize
,
class
Float
,
class
SrcDesc
,
class
DstDesc
,
class
SrcOpLengths
>
struct
blockwise_2d_tensor_copy_dummy_1
// starting point need to be aligned to float4 or float2 or float
// stride1 need to be 1 for both source and destination
template
<
unsigned
BlockSize
,
class
Float
,
class
SrcDesc
,
class
DstDesc
,
class
SrcOpLengths
,
unsigned
DataPerRead
>
struct
Blockwise2dTensorCopy3
{
unsigned
mBegin
;
unsigned
mSrcMyThreadOffset
;
unsigned
mDstMyThreadOffset
;
__device__
b
lockwise
_
2d
_t
ensor
_c
opy
_dummy_1
()
__device__
B
lockwise2d
T
ensor
C
opy
3
()
{
constexpr
unsigned
n_total
=
make_ConstantTensorDescriptor
(
SrcOpLengths
{}).
GetElementSpace
();
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
static_assert
(
SrcDesc
{}.
GetStride
(
I1
)
==
1
&&
DstDesc
{}.
GetStride
(
I1
)
==
1
,
"wrong! only support stride1 == 1!
\n
"
);
static_assert
(
DataPerRead
==
1
||
DataPerRead
==
2
||
DataPerRead
==
4
,
"wrong! only support DataPerRead == 1, 2 or 4!
\n
"
);
constexpr
unsigned
L0
=
SrcOpLengths
{}.
Get
(
I0
);
constexpr
unsigned
L1
=
SrcOpLengths
{}.
Get
(
I1
);
static_assert
(
L1
%
DataPerRead
==
0
,
"wrong! only support mod(L1, DataPerRead) == 0
\n
"
);
constexpr
unsigned
n_per_thread
=
n_total
/
BlockSize
;
constexpr
unsigned
thread_per_d1
=
L1
/
DataPerRead
;
constexpr
unsigned
thread_per_d0
=
BlockSize
/
thread_per_d1
;
mBegin
=
n_per_thread
*
get_thread_local_1d_id
();
static_assert
(
thread_per_d1
<=
BlockSize
,
"wrong! not enough threads to cover L1 dimension
\n
"
);
const
unsigned
thread_id_d0
=
get_thread_local_1d_id
()
/
thread_per_d1
;
const
unsigned
thread_id_d1
=
get_thread_local_1d_id
()
-
thread_id_d0
*
thread_per_d1
;
mSrcMyThreadOffset
=
SrcDesc
{}.
Get1dIndex
(
thread_id_d0
,
thread_id_d1
*
DataPerRead
);
mDstMyThreadOffset
=
DstDesc
{}.
Get1dIndex
(
thread_id_d0
,
thread_id_d1
*
DataPerRead
);
}
__device__
void
r
un
(
Float
*
const
__restrict__
p_src
,
Float
*
__restrict__
p_dst
)
const
__device__
void
R
un
(
Float
*
const
__restrict__
p_src
,
Float
*
__restrict__
p_dst
)
const
{
constexpr
unsigned
n_total
=
make_ConstantTensorDescriptor
(
SrcOpLengths
{}).
GetElementSpace
();
static_assert
(
is_same
<
Float
,
float
>::
value
,
"wrong! only support float!
\n
"
);
using
Float2
=
float2
;
using
Float4
=
float4
;
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
unsigned
L0
=
SrcOpLengths
{}.
Get
(
I0
);
constexpr
unsigned
L1
=
SrcOpLengths
{}.
Get
(
I1
);
constexpr
unsigned
thread_per_d1
=
L1
/
DataPerRead
;
constexpr
unsigned
thread_per_d0
=
BlockSize
/
thread_per_d1
;
constexpr
unsigned
n
_per
_thread
=
n_total
/
BlockSize
;
constexpr
unsigned
n
um_active
_thread
=
thread_per_d0
*
thread_per_d1
;
for
(
unsigned
i
=
0
;
i
<
n_per_thread
;
++
i
)
if
(
BlockSize
>
num_active_thread
)
{
if
(
get_thread_local_1d_id
()
>
num_active_thread
)
{
p_dst
[
mBegin
+
i
]
=
p_src
[
mBegin
+
i
]
;
return
;
}
}
};
template
<
unsigned
BlockSize
,
class
Float
,
class
SrcDesc
,
class
DstDesc
,
class
SrcOpLengths
>
struct
blockwise_2d_tensor_copy_dummy_2
{
__device__
void
run
(
Float
*
const
__restrict__
p_src
,
Float
*
__restrict__
p_dst
)
const
{
constexpr
unsigned
n_total
=
make_ConstantTensorDescriptor
(
SrcOpLengths
{}).
GetElementSpace
();
constexpr
unsigned
nloop_d0
=
L0
/
thread_per_d0
;
constexpr
bool
has_tail_d0
=
(
L0
>
nloop_d0
*
thread_per_d0
);
constexpr
unsigned
n_per_thread
=
n_total
/
BlockSize
;
constexpr
unsigned
src_loop_stride
=
SrcDesc
{}.
GetStride
(
I0
)
*
thread_per_d0
;
constexpr
unsigned
dst_loop_stride
=
DstDesc
{}.
GetStride
(
I0
)
*
thread_per_d0
;
for
(
unsigned
i
=
0
;
i
<
n_per_thread
;
++
i
)
for
(
unsigned
i
loop
=
0
;
i
loop
<
nloop_d0
;
++
i
loop
)
{
unsigned
index
=
get_thread_local_1d_id
()
+
BlockSize
*
i
;
p_dst
[
index
]
=
p_src
[
index
];
if
(
DataPerRead
==
1
)
{
p_dst
[
mDstMyThreadOffset
+
iloop
*
dst_loop_stride
]
=
p_src
[
mSrcMyThreadOffset
+
iloop
*
src_loop_stride
];
}
else
if
(
DataPerRead
==
2
)
{
*
(
reinterpret_cast
<
Float2
*>
(
p_dst
+
mDstMyThreadOffset
+
iloop
*
dst_loop_stride
))
=
*
(
reinterpret_cast
<
Float2
*>
(
p_src
+
mSrcMyThreadOffset
+
iloop
*
src_loop_stride
));
}
else
if
(
DataPerRead
==
4
)
{
*
(
reinterpret_cast
<
Float4
*>
(
p_dst
+
mDstMyThreadOffset
+
iloop
*
dst_loop_stride
))
=
*
(
reinterpret_cast
<
Float4
*>
(
p_src
+
mSrcMyThreadOffset
+
iloop
*
src_loop_stride
));
}
else
{
assert
(
false
);
}
}
}
};
src/include/blockwise_4d_tensor_op.cuh
View file @
6614729a
...
...
@@ -200,9 +200,9 @@ blockwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
}
template
<
unsigned
BlockSize
,
class
Float
,
class
SrcDesc
,
class
DstDesc
,
class
SrcOpLengths
>
struct
b
lockwise
_
4d
_t
ensor
_c
opy
_
1
struct
B
lockwise4d
T
ensor
C
opy1
{
__device__
void
r
un
(
Float
*
const
__restrict__
p_src
,
Float
*
__restrict__
p_dst
)
const
__device__
void
R
un
(
Float
*
const
__restrict__
p_src
,
Float
*
__restrict__
p_dst
)
const
{
constexpr
auto
dst_from_src_reorder
=
Sequence
<
0
,
1
,
2
,
3
>
{};
...
...
@@ -217,9 +217,9 @@ template <unsigned BlockSize,
class
DstDesc
,
class
DstOpLengths
,
class
GlobalLowerPads
>
struct
b
lockwise
_c
hwn
_t
ensor
_c
opy
_with_padding
struct
B
lockwise
C
hwn
T
ensor
C
opy
Padded
{
__device__
void
r
un
(
Float
*
const
__restrict__
p_src
,
__device__
void
R
un
(
Float
*
const
__restrict__
p_src
,
unsigned
c_block_data_begin
,
unsigned
ho_block_data_begin
,
unsigned
wo_block_data_begin
,
...
...
@@ -337,32 +337,3 @@ struct blockwise_chwn_tensor_copy_with_padding
}
}
};
\ No newline at end of file
template
<
unsigned
BlockSize
,
class
Float
,
class
SrcDesc
,
class
DstDesc
,
class
SrcOpLengths
>
struct
blockwise_4d_tensor_copy_dummy
{
unsigned
mBegin
;
__device__
blockwise_4d_tensor_copy_dummy
()
{
constexpr
unsigned
n_total
=
make_ConstantTensorDescriptor
(
SrcOpLengths
{}).
GetElementSpace
();
constexpr
unsigned
n_per_thread
=
n_total
/
BlockSize
;
mBegin
=
n_per_thread
*
get_thread_local_1d_id
();
}
__device__
void
run
(
Float
*
const
__restrict__
p_src
,
Float
*
__restrict__
p_dst
)
const
{
constexpr
unsigned
n_total
=
make_ConstantTensorDescriptor
(
SrcOpLengths
{}).
GetElementSpace
();
constexpr
unsigned
n_per_thread
=
n_total
/
BlockSize
;
for
(
unsigned
i
=
0
;
i
<
n_per_thread
;
++
i
)
{
p_dst
[
mBegin
+
i
]
=
p_src
[
mBegin
+
i
];
}
}
};
src/include/blockwise_gemm.cuh
View file @
6614729a
...
...
@@ -15,7 +15,7 @@ template <unsigned BlockSize,
unsigned
BatchPerThread
,
unsigned
KPerThreadLoop
,
bool
DistributeThreadAlongColumnFirst
>
struct
b
lockwise
_
1d
_s
trided
_b
atched
_g
emm
_b
lock
_a_block_b_t
hread
_c
struct
B
lockwise1d
S
trided
B
atched
G
emm
B
lock
ABlockBT
hread
C
{
unsigned
mMyThreadOffsetA
=
0
;
unsigned
mMyThreadOffsetB
=
0
;
...
...
@@ -27,7 +27,7 @@ struct blockwise_1d_strided_batched_gemm_block_a_block_b_thread_c
unsigned
col_begin
;
};
__device__
b
lockwise
_
1d
_s
trided
_b
atched
_g
emm
_b
lock
_a_block_b_t
hread
_c
()
__device__
B
lockwise1d
S
trided
B
atched
G
emm
B
lock
ABlockBT
hread
C
()
{
const
auto
a_block_mtx
=
BlockMatrixA
{};
// constexpr doesn't compile
const
auto
b_block_mtx
=
BlockMatrixB
{};
// constexpr doesn't compile
...
...
@@ -117,7 +117,7 @@ struct blockwise_1d_strided_batched_gemm_block_a_block_b_thread_c
}
template
<
class
FloatA
,
class
FloatB
,
class
FloatC
,
class
Accumulator
>
__device__
void
r
un
(
FloatA
*
const
p_a_block
,
__device__
void
R
un
(
FloatA
*
const
p_a_block
,
FloatB
*
const
p_b_block
,
FloatC
*
p_c_thread
,
Accumulator
f_accum
)
const
...
...
@@ -230,7 +230,7 @@ template <unsigned BlockSize,
unsigned
MThreadPerCluster
,
unsigned
NThreadPerCluster
,
bool
DistributeThreadAlongColumnFirst
>
struct
b
lockwise
_g
emm
_b
lock
_a_block_b_t
hread
_c
struct
B
lockwise
G
emm
B
lock
ABlockBT
hread
C
{
unsigned
mMyThreadOffsetA
=
0
;
unsigned
mMyThreadOffsetB
=
0
;
...
...
@@ -241,7 +241,7 @@ struct blockwise_gemm_block_a_block_b_thread_c
unsigned
col_begin
;
};
__device__
b
lockwise
_g
emm
_b
lock
_a_block_b_t
hread
_c
()
__device__
B
lockwise
G
emm
B
lock
ABlockBT
hread
C
()
{
const
auto
a_block_mtx
=
BlockMatrixA
{};
// constexpr doesn't compile
const
auto
b_block_mtx
=
BlockMatrixB
{};
// constexpr doesn't compile
...
...
@@ -360,7 +360,7 @@ struct blockwise_gemm_block_a_block_b_thread_c
}
template
<
class
FloatA
,
class
FloatB
,
class
FloatC
,
class
Accumulator
>
__device__
void
r
un
(
FloatA
*
const
p_a_block
,
__device__
void
R
un
(
FloatA
*
const
p_a_block
,
FloatB
*
const
p_b_block
,
FloatC
*
p_c_thread
,
Accumulator
f_accum
)
const
...
...
src/include/gridwise_direct_convolution_1.cuh
View file @
6614729a
...
...
@@ -122,21 +122,21 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
#endif
constexpr
auto
blockwise_in_copy
=
b
lockwise
_
4d
_t
ensor
_c
opy
_
1
<
BlockSize
,
B
lockwise4d
T
ensor
C
opy1
<
BlockSize
,
Float
,
decltype
(
in_block_global_desc
),
decltype
(
in_block_desc
),
decltype
(
in_block_desc
.
GetLengths
())
>
{};
constexpr
auto
blockwise_wei_copy
=
b
lockwise
_
4d
_t
ensor
_c
opy
_
1
<
BlockSize
,
B
lockwise4d
T
ensor
C
opy1
<
BlockSize
,
Float
,
decltype
(
wei_block_global_desc
),
decltype
(
wei_block_desc
),
decltype
(
wei_block_desc
.
GetLengths
())
>
{};
constexpr
auto
blockwise_out_copy
=
b
lockwise
_
4d
_t
ensor
_c
opy
_
1
<
BlockSize
,
B
lockwise4d
T
ensor
C
opy1
<
BlockSize
,
Float
,
decltype
(
out_block_desc
),
decltype
(
out_block_global_desc
),
...
...
@@ -149,14 +149,14 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
c_block_work_begin
+=
CPerBlock
)
{
// copy input tensor to LDS
blockwise_in_copy
.
r
un
(
p_in_global
+
in_global_desc
.
Get1dIndex
(
n_block_work_begin
,
blockwise_in_copy
.
R
un
(
p_in_global
+
in_global_desc
.
Get1dIndex
(
n_block_work_begin
,
c_block_work_begin
,
hi_block_work_begin
,
wi_block_work_begin
),
p_in_block
);
// copy weight tensor to LDS
blockwise_wei_copy
.
r
un
(
blockwise_wei_copy
.
R
un
(
p_wei_global
+
wei_global_desc
.
Get1dIndex
(
k_block_work_begin
,
c_block_work_begin
,
0
,
0
),
p_wei_block
);
...
...
@@ -179,7 +179,7 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
}
// copy output tensor from LDS to device mem
blockwise_out_copy
.
r
un
(
p_out_block
,
blockwise_out_copy
.
R
un
(
p_out_block
,
p_out_global
+
out_global_desc
.
Get1dIndex
(
n_block_work_begin
,
k_block_work_begin
,
ho_block_work_begin
,
...
...
src/include/gridwise_direct_convolution_2.cuh
View file @
6614729a
...
...
@@ -145,14 +145,14 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
#endif
constexpr
auto
blockwise_in_copy
=
b
lockwise
_
4d
_t
ensor
_c
opy
_
1
<
BlockSize
,
B
lockwise4d
T
ensor
C
opy1
<
BlockSize
,
Float
,
decltype
(
in_global_desc
),
decltype
(
in_block_desc
),
decltype
(
in_block_desc
.
GetLengths
())
>
{};
constexpr
auto
blockwise_wei_copy
=
b
lockwise
_
4d
_t
ensor
_c
opy
_
1
<
BlockSize
,
B
lockwise4d
T
ensor
C
opy1
<
BlockSize
,
Float
,
decltype
(
wei_global_desc
),
decltype
(
wei_block_desc
),
...
...
@@ -165,14 +165,14 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
c_block_data_begin
+=
CPerBlock
,
__syncthreads
())
{
// copy input tensor to LDS
blockwise_in_copy
.
r
un
(
p_in_global
+
in_global_desc
.
Get1dIndex
(
n_block_data_begin
,
blockwise_in_copy
.
R
un
(
p_in_global
+
in_global_desc
.
Get1dIndex
(
n_block_data_begin
,
c_block_data_begin
,
hi_block_data_begin
,
wi_block_data_begin
),
p_in_block
);
// copy weight tensor to LDS
blockwise_wei_copy
.
r
un
(
blockwise_wei_copy
.
R
un
(
p_wei_global
+
wei_global_desc
.
Get1dIndex
(
k_block_data_begin
,
c_block_data_begin
,
0
,
0
),
p_wei_block
);
...
...
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
View file @
6614729a
...
...
@@ -106,7 +106,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(InGlobalDesc,
// blockwise copy
// input: format is [C, Hi, Wi, N]
constexpr
auto
blockwise_in_copy
=
b
lockwise
_
4d
_t
ensor
_c
opy
_
1
<
BlockSize
,
B
lockwise4d
T
ensor
C
opy1
<
BlockSize
,
Float
,
decltype
(
in_chwn_global_desc
),
decltype
(
in_chwn_block_desc
),
...
...
@@ -114,7 +114,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(InGlobalDesc,
// weight: format is [S,R,C,K]
constexpr
auto
blockwise_wei_copy
=
b
lockwise
_
4d
_t
ensor
_c
opy
_
1
<
BlockSize
,
B
lockwise4d
T
ensor
C
opy1
<
BlockSize
,
Float
,
decltype
(
wei_csrk_global_desc
),
decltype
(
wei_csrk_block_desc
),
...
...
@@ -140,7 +140,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(InGlobalDesc,
Number
<
KPerThread
>
{},
Number
<
WoPerThread
*
NPerThread
>
{});
// constexpr doesn't compile
const
auto
blockwise_batch_gemm
=
b
lockwise
_
1d
_s
trided
_b
atched
_g
emm
_b
lock
_a_block_b_t
hread
_c
<
BlockSize
,
B
lockwise1d
S
trided
B
atched
G
emm
B
lock
ABlockBT
hread
C
<
BlockSize
,
decltype
(
a_cxk_block_mtx_desc
),
decltype
(
b_cxwn_block_mtx_desc
),
decltype
(
c_kxwn_thread_mtx_desc
),
...
...
@@ -149,8 +149,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(InGlobalDesc,
false
,
0
,
in_chwn_block_desc
.
GetStride
(
I1
),
out_hkwn_thread_desc
.
GetStride
(
I0
),
out_hkwn_thread_desc
.
GetStride
(
I0
),
HoPerBlock
,
HoPerThread
,
CPerThread
,
...
...
@@ -183,12 +182,12 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(InGlobalDesc,
{
#if 1
// input: global mem to LDS,
blockwise_in_copy
.
r
un
(
p_in_global_block_begin
,
p_in_block
);
blockwise_in_copy
.
R
un
(
p_in_global_block_begin
,
p_in_block
);
#endif
#if 1
// weight: global mem to LDS,
blockwise_wei_copy
.
r
un
(
p_wei_global_block_begin
,
p_wei_block
);
blockwise_wei_copy
.
R
un
(
p_wei_global_block_begin
,
p_wei_block
);
#endif
__syncthreads
();
...
...
@@ -200,7 +199,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(InGlobalDesc,
{
auto
f_accum
=
[](
auto
&
acc
,
const
auto
&&
v
)
{
acc
+=
v
;
};
blockwise_batch_gemm
.
r
un
(
p_wei_block
+
wei_csrk_block_desc
.
Get1dIndex
(
0
,
s
,
r
,
0
),
blockwise_batch_gemm
.
R
un
(
p_wei_block
+
wei_csrk_block_desc
.
Get1dIndex
(
0
,
s
,
r
,
0
),
p_in_block
+
in_chwn_block_desc
.
Get1dIndex
(
0
,
s
,
r
,
0
),
p_out_thread
,
f_accum
);
...
...
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh
View file @
6614729a
...
...
@@ -136,7 +136,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri
#endif
constexpr
auto
blockwise_in_copy
=
b
lockwise
_c
hwn
_t
ensor
_c
opy
_with_padding
<
BlockSize
,
B
lockwise
C
hwn
T
ensor
C
opy
Padded
<
BlockSize
,
Float
,
decltype
(
in_chwn_global_desc
),
decltype
(
in_chwn_block_desc
),
...
...
@@ -146,7 +146,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri
#if 1
// weight: format is [C,S,R,K]
constexpr
auto
blockwise_wei_copy
=
b
lockwise
_
4d
_t
ensor
_c
opy
_
1
<
BlockSize
,
B
lockwise4d
T
ensor
C
opy1
<
BlockSize
,
Float
,
decltype
(
wei_csrk_global_desc
),
decltype
(
wei_csrk_block_desc
),
...
...
@@ -154,15 +154,14 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri
#elif 1
// weight: format is [C*S*R,K]
constexpr
auto
blockwise_wei_copy
=
b
lockwise
_
2d
_t
ensor
_c
opy
_
1
<
BlockSize
,
B
lockwise2d
T
ensor
C
opy1
<
BlockSize
,
Float
,
decltype
(
wei_ek_global_desc
),
decltype
(
wei_ek_block_desc
),
decltype
(
wei_ek_block_desc
.
GetLengths
())
>
{};
#elif 1
// weight: format is [C*S*R,K]
const
auto
blockwise_wei_copy
=
blockwise_2d_tensor_copy_2
<
BlockSize
,
const
auto
blockwise_wei_copy
=
Blockwise2dTensorCopy2
<
BlockSize
,
Float
,
decltype
(
wei_ek_global_desc
),
decltype
(
wei_ek_block_desc
),
...
...
@@ -191,7 +190,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri
Number
<
KPerThread
>
{},
Number
<
WoPerThread
*
NPerThread
>
{});
// constexpr doesn't compile
const
auto
blockwise_batch_gemm
=
b
lockwise
_
1d
_s
trided
_b
atched
_g
emm
_b
lock
_a_block_b_t
hread
_c
<
BlockSize
,
B
lockwise1d
S
trided
B
atched
G
emm
B
lock
ABlockBT
hread
C
<
BlockSize
,
decltype
(
a_cxk_block_mtx_desc
),
decltype
(
b_cxwn_block_mtx_desc
),
decltype
(
c_kxwn_thread_mtx_desc
),
...
...
@@ -200,8 +199,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri
false
,
0
,
in_chwn_block_desc
.
GetStride
(
I1
),
out_hkwn_thread_desc
.
GetStride
(
I0
),
out_hkwn_thread_desc
.
GetStride
(
I0
),
HoPerBlock
,
HoPerThread
,
CPerThread
,
...
...
@@ -229,7 +227,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri
{
#if 1
// input: global mem to LDS,
blockwise_in_copy
.
r
un
(
p_in_global
,
blockwise_in_copy
.
R
un
(
p_in_global
,
c_block_data_begin
,
ho_block_data_begin
,
wo_block_data_begin
,
...
...
@@ -243,7 +241,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri
#if 1
// weight: global mem to LDS,
blockwise_wei_copy
.
r
un
(
p_wei_global_block_begin
,
p_wei_block
);
blockwise_wei_copy
.
R
un
(
p_wei_global_block_begin
,
p_wei_block
);
#endif
__syncthreads
();
...
...
@@ -255,7 +253,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri
{
auto
f_accum
=
[](
auto
&
acc
,
const
auto
&&
v
)
{
acc
+=
v
;
};
blockwise_batch_gemm
.
r
un
(
p_wei_block
+
wei_csrk_block_desc
.
Get1dIndex
(
0
,
s
,
r
,
0
),
blockwise_batch_gemm
.
R
un
(
p_wei_block
+
wei_csrk_block_desc
.
Get1dIndex
(
0
,
s
,
r
,
0
),
p_in_block
+
in_chwn_block_desc
.
Get1dIndex
(
0
,
s
,
r
,
0
),
p_out_thread
,
f_accum
);
...
...
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh
View file @
6614729a
...
...
@@ -136,7 +136,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
#endif
constexpr
auto
blockwise_in_copy
=
b
lockwise
_c
hwn
_t
ensor
_c
opy
_with_padding
<
BlockSize
,
B
lockwise
C
hwn
T
ensor
C
opy
Padded
<
BlockSize
,
Float
,
decltype
(
in_chwn_global_desc
),
decltype
(
in_chwn_block_desc
),
...
...
@@ -146,7 +146,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
#if 0
// weight: format is [C,S,R,K]
constexpr auto blockwise_wei_copy =
b
lockwise
_
4d
_t
ensor
_c
opy
_
1<BlockSize,
B
lockwise4d
T
ensor
C
opy1<BlockSize,
Float,
decltype(wei_csrk_global_desc),
decltype(wei_csrk_block_desc),
...
...
@@ -154,15 +154,14 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
#elif
0
// weight: format is [C*S*R,K]
constexpr
auto
blockwise_wei_copy
=
b
lockwise
_
2d
_t
ensor
_c
opy
_
1
<
BlockSize
,
B
lockwise2d
T
ensor
C
opy1
<
BlockSize
,
Float
,
decltype
(
wei_ek_global_desc
),
decltype
(
wei_ek_block_desc
),
decltype
(
wei_ek_block_desc
.
GetLengths
())
>
{};
#elif 1
// weight: format is [C*S*R,K]
const
auto
blockwise_wei_copy
=
blockwise_2d_tensor_copy_2
<
BlockSize
,
const
auto
blockwise_wei_copy
=
Blockwise2dTensorCopy2
<
BlockSize
,
Float
,
decltype
(
wei_ek_global_desc
),
decltype
(
wei_ek_block_desc
),
...
...
@@ -191,7 +190,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
Number
<
KPerThread
>
{},
Number
<
WoPerThread
*
NPerThread
>
{});
// constexpr doesn't compile
const
auto
blockwise_batch_gemm
=
b
lockwise
_
1d
_s
trided
_b
atched
_g
emm
_b
lock
_a_block_b_t
hread
_c
<
BlockSize
,
B
lockwise1d
S
trided
B
atched
G
emm
B
lock
ABlockBT
hread
C
<
BlockSize
,
decltype
(
a_cxk_block_mtx_desc
),
decltype
(
b_cxwn_block_mtx_desc
),
decltype
(
c_kxwn_thread_mtx_desc
),
...
...
@@ -200,8 +199,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
false
,
0
,
in_chwn_block_desc
.
GetStride
(
I1
),
out_hkwn_thread_desc
.
GetStride
(
I0
),
out_hkwn_thread_desc
.
GetStride
(
I0
),
HoPerBlock
,
HoPerThread
,
CPerThread
,
...
...
@@ -229,7 +227,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
// prelog: load data
// input: global mem to LDS,
blockwise_in_copy
.
r
un
(
p_in_global
,
blockwise_in_copy
.
R
un
(
p_in_global
,
0
,
ho_block_data_begin
,
wo_block_data_begin
,
...
...
@@ -241,7 +239,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
w_block_pad_up
);
// weight: global mem to LDS,
blockwise_wei_copy
.
r
un
(
p_wei_global_block_begin
,
p_wei_block_0
);
blockwise_wei_copy
.
R
un
(
p_wei_global_block_begin
,
p_wei_block_0
);
p_wei_global_block_begin
+=
CPerBlock
*
wei_ek_global_desc
.
GetStride
(
I0
);
...
...
@@ -263,7 +261,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
// preload next data
#if 1
// input: global mem to LDS,
blockwise_in_copy
.
r
un
(
p_in_global
,
blockwise_in_copy
.
R
un
(
p_in_global
,
c_block_data_begin
,
ho_block_data_begin
,
wo_block_data_begin
,
...
...
@@ -277,7 +275,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
#if 1
// weight: global mem to LDS,
blockwise_wei_copy
.
r
un
(
p_wei_global_block_begin
,
p_wei_block_next
);
blockwise_wei_copy
.
R
un
(
p_wei_global_block_begin
,
p_wei_block_next
);
#endif
// a series of batched GEMM
...
...
@@ -287,7 +285,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
{
auto
f_accum
=
[](
auto
&
acc
,
const
auto
&&
v
)
{
acc
+=
v
;
};
blockwise_batch_gemm
.
r
un
(
p_wei_block_now
+
blockwise_batch_gemm
.
R
un
(
p_wei_block_now
+
wei_csrk_block_desc
.
Get1dIndex
(
0
,
s
,
r
,
0
),
p_in_block_now
+
in_chwn_block_desc
.
Get1dIndex
(
0
,
s
,
r
,
0
),
p_out_thread
,
...
...
@@ -310,7 +308,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
{
auto
f_accum
=
[](
auto
&
acc
,
const
auto
&&
v
)
{
acc
+=
v
;
};
blockwise_batch_gemm
.
r
un
(
p_wei_block_now
+
blockwise_batch_gemm
.
R
un
(
p_wei_block_now
+
wei_csrk_block_desc
.
Get1dIndex
(
0
,
s
,
r
,
0
),
p_in_block_now
+
in_chwn_block_desc
.
Get1dIndex
(
0
,
s
,
r
,
0
),
p_out_thread
,
...
...
src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh
View file @
6614729a
...
...
@@ -127,7 +127,7 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr(InGlobalDesc,
Number
<
KPerThread
>
{},
Number
<
WoPerThread
*
NPerThread
>
{});
// constexpr doesn't compile
const
auto
blockwise_batch_gemm
=
b
lockwise
_
1d
_s
trided
_b
atched
_g
emm
_b
lock
_a_block_b_t
hread
_c
<
BlockSize
,
B
lockwise1d
S
trided
B
atched
G
emm
B
lock
ABlockBT
hread
C
<
BlockSize
,
decltype
(
a_cxk_block_mtx_desc
),
decltype
(
b_cxwn_block_mtx_desc
),
decltype
(
c_kxwn_thread_mtx_desc
),
...
...
@@ -136,8 +136,7 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr(InGlobalDesc,
false
,
0
,
in_chwn_block_desc
.
GetStride
(
I1
),
out_hkwn_thread_desc
.
GetStride
(
I0
),
out_hkwn_thread_desc
.
GetStride
(
I0
),
HoPerBlock
,
HoPerThread
,
CPerThread
,
...
...
@@ -175,7 +174,7 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr(InGlobalDesc,
#else
// input: global mem to LDS,
// no format conversion, this is wrong, for performance study only!
b
lockwise
_
4d
_t
ensor
_c
opy
<
BlockSize
>
(
in_nchw_global_desc
,
B
lockwise4d
T
ensor
C
opy
<
BlockSize
>
(
in_nchw_global_desc
,
p_in_global
+
in_nchw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
c_block_data_begin
,
...
...
@@ -200,7 +199,7 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr(InGlobalDesc,
#else
// weight: global mem to LDS,
// no format conversion, this is wrong, for performance study only!
b
lockwise
_
4d
_t
ensor
_c
opy
<
BlockSize
>
(
B
lockwise4d
T
ensor
C
opy
<
BlockSize
>
(
wei_kcsr_global_desc
,
p_wei_global
+
wei_kcsr_global_desc
.
Get1dIndex
(
k_block_data_begin
,
c_block_data_begin
,
0
,
0
),
...
...
@@ -219,7 +218,7 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr(InGlobalDesc,
{
auto
f_accum
=
[](
auto
&
c
,
const
auto
&&
ab
)
{
c
+=
ab
;
};
blockwise_batch_gemm
.
r
un
(
p_wei_block
+
wei_srck_block_desc
.
Get1dIndex
(
s
,
r
,
0
,
0
),
blockwise_batch_gemm
.
R
un
(
p_wei_block
+
wei_srck_block_desc
.
Get1dIndex
(
s
,
r
,
0
,
0
),
p_in_block
+
in_chwn_block_desc
.
Get1dIndex
(
0
,
s
,
r
,
0
),
p_out_thread
,
f_accum
);
...
...
src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh
View file @
6614729a
...
...
@@ -109,7 +109,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc,
// blockwise copy
// wei: format is [S,R,C,K], no conversion needed
constexpr
auto
blockwise_wei_copy
=
b
lockwise
_
4d
_t
ensor
_c
opy
_
1
<
BlockSize
,
B
lockwise4d
T
ensor
C
opy1
<
BlockSize
,
Float
,
decltype
(
wei_srck_global_desc
),
decltype
(
wei_srck_block_desc
),
...
...
@@ -133,7 +133,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc,
Number
<
KPerThread
>
{},
Number
<
WoPerThread
*
NPerThread
>
{});
// constexpr doesn't compile
const
auto
blockwise_batch_gemm
=
b
lockwise
_
1d
_s
trided
_b
atched
_g
emm
_b
lock
_a_block_b_t
hread
_c
<
BlockSize
,
B
lockwise1d
S
trided
B
atched
G
emm
B
lock
ABlockBT
hread
C
<
BlockSize
,
decltype
(
a_cxk_block_mtx_desc
),
decltype
(
b_cxwn_block_mtx_desc
),
decltype
(
c_kxwn_thread_mtx_desc
),
...
...
@@ -142,8 +142,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc,
false
,
0
,
in_chwn_block_desc
.
GetStride
(
I1
),
out_hkwn_thread_desc
.
GetStride
(
I0
),
out_hkwn_thread_desc
.
GetStride
(
I0
),
HoPerBlock
,
HoPerThread
,
CPerThread
,
...
...
@@ -183,7 +182,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc,
#if 1
// weight: global mem to LDS,
// format is [S,R,C,K], no conversion needed
blockwise_wei_copy
.
r
un
(
p_wei_global
+
wei_srck_global_desc
.
Get1dIndex
(
blockwise_wei_copy
.
R
un
(
p_wei_global
+
wei_srck_global_desc
.
Get1dIndex
(
0
,
0
,
c_block_data_begin
,
k_block_data_begin
),
p_wei_block
);
#endif
...
...
@@ -197,7 +196,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc,
{
auto
f_accum
=
[](
auto
&
c
,
const
auto
&&
ab
)
{
c
+=
ab
;
};
blockwise_batch_gemm
.
r
un
(
p_wei_block
+
wei_srck_block_desc
.
Get1dIndex
(
s
,
r
,
0
,
0
),
blockwise_batch_gemm
.
R
un
(
p_wei_block
+
wei_srck_block_desc
.
Get1dIndex
(
s
,
r
,
0
,
0
),
p_in_block
+
in_chwn_block_desc
.
Get1dIndex
(
0
,
s
,
r
,
0
),
p_out_thread
,
f_accum
);
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
View file @
6614729a
...
...
@@ -25,7 +25,9 @@ template <unsigned GridSize,
unsigned
InBlockCopyThreadPerDim0
,
unsigned
InBlockCopyThreadPerDim1
,
unsigned
WeiBlockCopyThreadPerDim0
,
unsigned
WeiBlockCopyThreadPerDim1
>
unsigned
WeiBlockCopyThreadPerDim1
,
unsigned
InBlockCopyDataPerRead
,
unsigned
WeiBlockCopyDataPerRead
>
__global__
void
gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw
(
InGlobalDesc
,
Float
*
const
__restrict__
p_in_global
,
...
...
@@ -117,40 +119,52 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
// formmat is [CPerBlock,BPerBlock + BGhostRead]
#if 0
const auto blockwise_in_copy =
b
lockwise
_
2d
_t
ensor
_c
opy
_
1<BlockSize,
B
lockwise2d
T
ensor
C
opy1<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths())>{};
#elif
1
const
auto
blockwise_in_copy
=
blockwise_2d_tensor_copy_2
<
BlockSize
,
#elif
0
const
auto
blockwise_in_copy
=
Blockwise2dTensorCopy2
<
BlockSize
,
Float
,
decltype
(
in_cb_global_desc
),
decltype
(
in_cb_block_desc
),
decltype
(
in_cb_block_desc
.
GetLengths
()),
InBlockCopyThreadPerDim0
,
InBlockCopyThreadPerDim1
>
{};
#elif 1
const
auto
blockwise_in_copy
=
Blockwise2dTensorCopy3
<
BlockSize
,
Float
,
decltype
(
in_cb_global_desc
),
decltype
(
in_cb_block_desc
),
decltype
(
in_cb_block_desc
.
GetLengths
()),
InBlockCopyDataPerRead
>
{};
#endif
// blockwise wei copy
// format is [CPerBlock*S*R,KPerBlock]
#if 0
const auto blockwise_wei_copy =
b
lockwise
_
2d
_t
ensor
_c
opy
_
1<BlockSize,
B
lockwise2d
T
ensor
C
opy1<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths())>{};
#elif
1
const
auto
blockwise_wei_copy
=
blockwise_2d_tensor_copy_2
<
BlockSize
,
#elif
0
const
auto
blockwise_wei_copy
=
Blockwise2dTensorCopy2
<
BlockSize
,
Float
,
decltype
(
wei_ek_global_desc
),
decltype
(
wei_ek_block_desc
),
decltype
(
wei_ek_block_desc
.
GetLengths
()),
WeiBlockCopyThreadPerDim0
,
WeiBlockCopyThreadPerDim1
>
{};
#elif 1
const
auto
blockwise_wei_copy
=
Blockwise2dTensorCopy3
<
BlockSize
,
Float
,
decltype
(
wei_ek_global_desc
),
decltype
(
wei_ek_block_desc
),
decltype
(
wei_ek_block_desc
.
GetLengths
()),
WeiBlockCopyDataPerRead
>
{};
#endif
// a series of blockwise GEMM
...
...
@@ -170,8 +184,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
const
auto
c_kxb_thread_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThread
>
{},
Number
<
BPerThread
>
{});
// constexpr doesn't compile
const
auto
blockwise_gemm
=
blockwise_gemm_block_a_block_b_thread_c
<
BlockSize
,
const
auto
blockwise_gemm
=
BlockwiseGemmBlockABlockBThreadC
<
BlockSize
,
decltype
(
a_cxk_block_mtx_desc
),
decltype
(
b_cxb_block_mtx_desc
),
decltype
(
c_kxb_thread_mtx_desc
),
...
...
@@ -208,10 +221,10 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
__syncthreads
())
{
// input: global mem to LDS,
blockwise_in_copy
.
r
un
(
p_in_global_block_offset
,
p_in_block
);
blockwise_in_copy
.
R
un
(
p_in_global_block_offset
,
p_in_block
);
// weight: global mem to LDS,
blockwise_wei_copy
.
r
un
(
p_wei_global_block_offset
,
p_wei_block
);
blockwise_wei_copy
.
R
un
(
p_wei_global_block_offset
,
p_wei_block
);
__syncthreads
();
...
...
@@ -222,7 +235,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
{
auto
f_accum
=
[](
auto
&
c
,
const
auto
&&
ab
)
{
c
+=
ab
;
};
blockwise_gemm
.
r
un
(
p_wei_block
+
wei_csrk_block_desc
.
Get1dIndex
(
s
,
r
,
0
,
0
),
blockwise_gemm
.
R
un
(
p_wei_block
+
wei_csrk_block_desc
.
Get1dIndex
(
s
,
r
,
0
,
0
),
p_in_block
+
s
*
Wi
+
r
,
p_out_thread
,
f_accum
);
...
...
@@ -283,10 +296,8 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
#endif
if
(
n_data
<
N
&&
h_data
<
Ho
&&
w_data
<
Wo
)
{
#if 1
p_out_global
[
out_knhw_global_desc
.
Get1dIndex
(
k_data
,
n_data
,
h_data
,
w_data
)]
=
p_out_thread
[
out_kb_thread_desc
.
Get1dIndex
(
k
,
b
)];
#endif
}
}
}
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline.cuh
View file @
6614729a
...
...
@@ -25,7 +25,9 @@ template <unsigned GridSize,
unsigned
InBlockCopyThreadPerDim0
,
unsigned
InBlockCopyThreadPerDim1
,
unsigned
WeiBlockCopyThreadPerDim0
,
unsigned
WeiBlockCopyThreadPerDim1
>
unsigned
WeiBlockCopyThreadPerDim1
,
unsigned
InBlockCopyDataPerRead
,
unsigned
WeiBlockCopyDataPerRead
>
__global__
void
gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline
(
InGlobalDesc
,
Float
*
const
__restrict__
p_in_global
,
...
...
@@ -117,40 +119,52 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline
// formmat is [CPerBlock,BPerBlock + BGhostRead]
#if 0
const auto blockwise_in_copy =
b
lockwise
_
2d
_t
ensor
_c
opy
_
1<BlockSize,
B
lockwise2d
T
ensor
C
opy1<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths())>{};
#elif
1
const
auto
blockwise_in_copy
=
blockwise_2d_tensor_copy_2
<
BlockSize
,
#elif
0
const
auto
blockwise_in_copy
=
Blockwise2dTensorCopy2
<
BlockSize
,
Float
,
decltype
(
in_cb_global_desc
),
decltype
(
in_cb_block_desc
),
decltype
(
in_cb_block_desc
.
GetLengths
()),
InBlockCopyThreadPerDim0
,
InBlockCopyThreadPerDim1
>
{};
#elif 1
const
auto
blockwise_in_copy
=
Blockwise2dTensorCopy3
<
BlockSize
,
Float
,
decltype
(
in_cb_global_desc
),
decltype
(
in_cb_block_desc
),
decltype
(
in_cb_block_desc
.
GetLengths
()),
InBlockCopyDataPerRead
>
{};
#endif
// blockwise wei copy
// format is [CPerBlock*S*R,KPerBlock]
#if 0
const auto blockwise_wei_copy =
b
lockwise
_
2d
_t
ensor
_c
opy
_
1<BlockSize,
B
lockwise2d
T
ensor
C
opy1<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths())>{};
#elif
1
const
auto
blockwise_wei_copy
=
blockwise_2d_tensor_copy_2
<
BlockSize
,
#elif
0
const
auto
blockwise_wei_copy
=
Blockwise2dTensorCopy2
<
BlockSize
,
Float
,
decltype
(
wei_ek_global_desc
),
decltype
(
wei_ek_block_desc
),
decltype
(
wei_ek_block_desc
.
GetLengths
()),
WeiBlockCopyThreadPerDim0
,
WeiBlockCopyThreadPerDim1
>
{};
#elif 1
const
auto
blockwise_wei_copy
=
Blockwise2dTensorCopy3
<
BlockSize
,
Float
,
decltype
(
wei_ek_global_desc
),
decltype
(
wei_ek_block_desc
),
decltype
(
wei_ek_block_desc
.
GetLengths
()),
WeiBlockCopyDataPerRead
>
{};
#endif
// a series of blockwise GEMM
...
...
@@ -170,8 +184,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline
const
auto
c_kxb_thread_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThread
>
{},
Number
<
BPerThread
>
{});
// constexpr doesn't compile
const
auto
blockwise_gemm
=
blockwise_gemm_block_a_block_b_thread_c
<
BlockSize
,
const
auto
blockwise_gemm
=
BlockwiseGemmBlockABlockBThreadC
<
BlockSize
,
decltype
(
a_cxk_block_mtx_desc
),
decltype
(
b_cxb_block_mtx_desc
),
decltype
(
c_kxb_thread_mtx_desc
),
...
...
@@ -205,10 +218,10 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline
// prelog : preload data
// input: global mem to LDS,
blockwise_in_copy
.
r
un
(
p_in_global_block_offset
,
p_in_block_0
);
blockwise_in_copy
.
R
un
(
p_in_global_block_offset
,
p_in_block_0
);
// weight: global mem to LDS,
blockwise_wei_copy
.
r
un
(
p_wei_global_block_offset
,
p_wei_block_0
);
blockwise_wei_copy
.
R
un
(
p_wei_global_block_offset
,
p_wei_block_0
);
p_in_global_block_offset
+=
CPerBlock
*
in_cb_global_desc
.
GetStride
(
I0
);
...
...
@@ -234,10 +247,10 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline
Float
*
p_wei_block_next
=
even_loop
?
p_wei_block_1
:
p_wei_block_0
;
// input: global mem to LDS,
blockwise_in_copy
.
r
un
(
p_in_global_block_offset
,
p_in_block_next
);
blockwise_in_copy
.
R
un
(
p_in_global_block_offset
,
p_in_block_next
);
// weight: global mem to LDS,
blockwise_wei_copy
.
r
un
(
p_wei_global_block_offset
,
p_wei_block_next
);
blockwise_wei_copy
.
R
un
(
p_wei_global_block_offset
,
p_wei_block_next
);
// a series of GEMM
for
(
unsigned
s
=
0
;
s
<
S
;
++
s
)
...
...
@@ -246,7 +259,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline
{
auto
f_accum
=
[](
auto
&
c
,
const
auto
&&
ab
)
{
c
+=
ab
;
};
blockwise_gemm
.
r
un
(
p_wei_block_now
+
wei_csrk_block_desc
.
Get1dIndex
(
s
,
r
,
0
,
0
),
blockwise_gemm
.
R
un
(
p_wei_block_now
+
wei_csrk_block_desc
.
Get1dIndex
(
s
,
r
,
0
,
0
),
p_in_block_now
+
s
*
Wi
+
r
,
p_out_thread
,
f_accum
);
...
...
@@ -268,7 +281,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline
{
auto
f_accum
=
[](
auto
&
c
,
const
auto
&&
ab
)
{
c
+=
ab
;
};
blockwise_gemm
.
r
un
(
p_wei_block_now
+
wei_csrk_block_desc
.
Get1dIndex
(
s
,
r
,
0
,
0
),
blockwise_gemm
.
R
un
(
p_wei_block_now
+
wei_csrk_block_desc
.
Get1dIndex
(
s
,
r
,
0
,
0
),
p_in_block_now
+
s
*
Wi
+
r
,
p_out_thread
,
f_accum
);
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
View file @
6614729a
...
...
@@ -110,14 +110,13 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
// formmat is [CPerBlock,BPerBlock + BGhostRead]
#if 1
const
auto
blockwise_in_copy
=
b
lockwise
_
2d
_t
ensor
_c
opy
_
1
<
BlockSize
,
B
lockwise2d
T
ensor
C
opy1
<
BlockSize
,
Float
,
decltype
(
in_cb_global_desc
),
decltype
(
in_cb_block_desc
),
decltype
(
in_cb_block_desc
.
GetLengths
())
>
{};
#elif 1
const
auto
blockwise_in_copy
=
blockwise_2d_tensor_copy_2
<
BlockSize
,
const
auto
blockwise_in_copy
=
Blockwise2dTensorCopy2
<
BlockSize
,
Float
,
decltype
(
in_cb_global_desc
),
decltype
(
in_cb_block_desc
),
...
...
@@ -129,7 +128,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
// blockwise wei copy
// format is [S,R,CPerBlock,KPerBlock]
const
auto
blockwise_wei_copy
=
b
lockwise
_
4d
_t
ensor
_c
opy
_
1
<
BlockSize
,
B
lockwise4d
T
ensor
C
opy1
<
BlockSize
,
Float
,
decltype
(
wei_srck_global_desc
),
decltype
(
wei_srck_block_desc
),
...
...
@@ -152,8 +151,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
const
auto
c_kxb_thread_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThread
>
{},
Number
<
BPerThread
>
{});
// constexpr doesn't compile
const
auto
blockwise_gemm
=
blockwise_gemm_block_a_block_b_thread_c
<
BlockSize
,
const
auto
blockwise_gemm
=
BlockwiseGemmBlockABlockBThreadC
<
BlockSize
,
decltype
(
a_cxk_block_mtx_desc
),
decltype
(
b_cxb_block_mtx_desc
),
decltype
(
c_kxb_thread_mtx_desc
),
...
...
@@ -191,12 +189,12 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
{
#if 1
// input: global mem to LDS,
blockwise_in_copy
.
r
un
(
p_in_global_block_offset
,
p_in_block
);
blockwise_in_copy
.
R
un
(
p_in_global_block_offset
,
p_in_block
);
#endif
#if 1
// weight: global mem to LDS,
blockwise_wei_copy
.
r
un
(
p_wei_global_block_offset
,
p_wei_block
);
blockwise_wei_copy
.
R
un
(
p_wei_global_block_offset
,
p_wei_block
);
#endif
__syncthreads
();
...
...
@@ -209,7 +207,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
{
auto
f_accum
=
[](
auto
&
c
,
const
auto
&&
ab
)
{
c
+=
ab
;
};
blockwise_gemm
.
r
un
(
p_wei_block
+
wei_srck_block_desc
.
Get1dIndex
(
s
,
r
,
0
,
0
),
blockwise_gemm
.
R
un
(
p_wei_block
+
wei_srck_block_desc
.
Get1dIndex
(
s
,
r
,
0
,
0
),
p_in_block
+
s
*
Wi
+
r
,
p_out_thread
,
f_accum
);
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh
View file @
6614729a
...
...
@@ -110,14 +110,13 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
// formmat is [CPerBlock,BPerBlock + BGhostRead]
#if 1
const
auto
blockwise_in_copy
=
b
lockwise
_
2d
_t
ensor
_c
opy
_
1
<
BlockSize
,
B
lockwise2d
T
ensor
C
opy1
<
BlockSize
,
Float
,
decltype
(
in_cb_global_desc
),
decltype
(
in_cb_block_desc
),
decltype
(
in_cb_block_desc
.
GetLengths
())
>
{};
#elif 1
const
auto
blockwise_in_copy
=
blockwise_2d_tensor_copy_2
<
BlockSize
,
const
auto
blockwise_in_copy
=
Blockwise2dTensorCopy2
<
BlockSize
,
Float
,
decltype
(
in_cb_global_desc
),
decltype
(
in_cb_block_desc
),
...
...
@@ -137,7 +136,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
// format is [S,R,CPerBlock,KPerBlock]
#if 1
const
auto
blockwise_wei_copy
=
b
lockwise
_
4d
_t
ensor
_c
opy
_
1
<
BlockSize
,
B
lockwise4d
T
ensor
C
opy1
<
BlockSize
,
Float
,
decltype
(
wei_srck_global_desc
),
decltype
(
wei_srck_block_desc
),
...
...
@@ -168,8 +167,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
const
auto
c_kxb_thread_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThread
>
{},
Number
<
BPerThread
>
{});
// constexpr doesn't compile
const
auto
blockwise_gemm
=
blockwise_gemm_block_a_block_b_thread_c
<
BlockSize
,
const
auto
blockwise_gemm
=
BlockwiseGemmBlockABlockBThreadC
<
BlockSize
,
decltype
(
a_cxk_block_mtx_desc
),
decltype
(
b_cxb_block_mtx_desc
),
decltype
(
c_kxb_thread_mtx_desc
),
...
...
@@ -201,13 +199,13 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
// prelog: load data
#if 1
// input: global mem to LDS,
blockwise_in_copy
.
r
un
(
p_in_global
+
in_cb_global_desc
.
Get1dIndex
(
0
,
b_block_data_begin
),
blockwise_in_copy
.
R
un
(
p_in_global
+
in_cb_global_desc
.
Get1dIndex
(
0
,
b_block_data_begin
),
p_in_block_0
);
#endif
#if 1
// weight: global mem to LDS,
blockwise_wei_copy
.
r
un
(
blockwise_wei_copy
.
R
un
(
p_wei_global
+
wei_srck_global_desc
.
Get1dIndex
(
0
,
0
,
0
,
k_block_data_begin
),
p_wei_block_0
);
#endif
...
...
@@ -227,14 +225,14 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
#if 1
// preload next data
// input: global mem to LDS,
blockwise_in_copy
.
r
un
(
p_in_global
+
in_cb_global_desc
.
Get1dIndex
(
blockwise_in_copy
.
R
un
(
p_in_global
+
in_cb_global_desc
.
Get1dIndex
(
c_block_data_begin
+
CPerBlock
,
b_block_data_begin
),
p_in_block_next
);
#endif
#if 1
// weight: global mem to LDS,
blockwise_wei_copy
.
r
un
(
p_wei_global
+
blockwise_wei_copy
.
R
un
(
p_wei_global
+
wei_srck_global_desc
.
Get1dIndex
(
0
,
0
,
c_block_data_begin
+
CPerBlock
,
k_block_data_begin
),
p_wei_block_next
);
...
...
@@ -247,7 +245,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
{
auto
f_accum
=
[](
auto
&
c
,
const
auto
&&
ab
)
{
c
+=
ab
;
};
blockwise_gemm
.
r
un
(
p_wei_block_now
+
wei_srck_block_desc
.
Get1dIndex
(
s
,
r
,
0
,
0
),
blockwise_gemm
.
R
un
(
p_wei_block_now
+
wei_srck_block_desc
.
Get1dIndex
(
s
,
r
,
0
,
0
),
p_in_block_now
+
s
*
Wi
+
r
,
p_out_thread
,
f_accum
);
...
...
@@ -269,7 +267,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
{
auto
f_accum
=
[](
auto
&
c
,
const
auto
&&
ab
)
{
c
+=
ab
;
};
blockwise_gemm
.
r
un
(
p_wei_block_now
+
wei_srck_block_desc
.
Get1dIndex
(
s
,
r
,
0
,
0
),
blockwise_gemm
.
R
un
(
p_wei_block_now
+
wei_srck_block_desc
.
Get1dIndex
(
s
,
r
,
0
,
0
),
p_in_block_now
+
s
*
Wi
+
r
,
p_out_thread
,
f_accum
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment