Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
abf75ac0
Commit
abf75ac0
authored
Feb 11, 2019
by
Chao Liu
Browse files
refactor
parent
120ab94a
Changes
15
Show whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
148 additions
and
154 deletions
+148
-154
driver/conv.cu
driver/conv.cu
+9
-3
driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
+14
-2
src/include/ConstantMatrixDescriptor.cuh
src/include/ConstantMatrixDescriptor.cuh
+1
-1
src/include/blockwise_gemm.cuh
src/include/blockwise_gemm.cuh
+36
-40
src/include/common.cuh
src/include/common.cuh
+8
-10
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
...e/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
+10
-12
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh
...ise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh
+10
-12
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh
...gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh
+10
-12
src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh
...nclude/gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh
+8
-8
src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh
...e/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh
+8
-8
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
...e/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
+11
-15
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh
...t_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh
+8
-12
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
...e/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
+6
-8
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh
...plicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh
+6
-8
src/include/threadwise_gemm.cuh
src/include/threadwise_gemm.cuh
+3
-3
No files found.
driver/conv.cu
View file @
abf75ac0
...
...
@@ -391,7 +391,7 @@ int main()
constexpr unsigned HPad = 0;
constexpr unsigned WPad = 0;
#elif
0
#elif
1
// 3x3, 34x34
constexpr
unsigned
N
=
64
;
constexpr
unsigned
C
=
256
;
...
...
@@ -430,6 +430,9 @@ int main()
constexpr
unsigned
K
=
64
;
constexpr
unsigned
S
=
5
;
constexpr
unsigned
R
=
5
;
constexpr
unsigned
HPad
=
0
;
constexpr
unsigned
WPad
=
0
;
#elif 0
// 7x7, 38x38
constexpr
unsigned
N
=
64
;
...
...
@@ -439,6 +442,9 @@ int main()
constexpr
unsigned
K
=
64
;
constexpr
unsigned
S
=
7
;
constexpr
unsigned
R
=
7
;
constexpr
unsigned
HPad
=
0
;
constexpr
unsigned
WPad
=
0
;
#elif 0
// 3x3, 58x58
constexpr
unsigned
N
=
16
;
...
...
@@ -484,7 +490,7 @@ int main()
constexpr
unsigned
HPad
=
1
;
constexpr
unsigned
WPad
=
1
;
#elif
1
#elif
0
// 1x1 filter, 28x28 image
constexpr
unsigned
N
=
16
;
constexpr
unsigned
C
=
256
;
...
...
@@ -608,7 +614,7 @@ int main()
nrepeat
);
#endif
#if
0
#if
1
if
(
S
==
3
&&
R
==
3
)
{
host_winograd_3x3_convolution
(
in_nchw
,
wei_kcsr
,
out_nkhw_host
,
lower_pads
,
upper_pads
);
...
...
driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
View file @
abf75ac0
...
...
@@ -137,12 +137,18 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
constexpr
unsigned
HoPerThread
=
1
;
constexpr
unsigned
WoPerThread
=
1
;
constexpr
unsigned
WeiBlockCopyThreadPerDim0
=
4
;
constexpr
unsigned
WeiBlockCopyThreadPerDim1
=
32
;
constexpr
unsigned
InBlockCopyDataPerRead
=
4
;
// not used, yet
constexpr
unsigned
WeiBlockCopyDataPerRead
=
4
;
constexpr
unsigned
BlockSize
=
128
;
#elif
0
#elif
1
// for 7x7, 38x38
constexpr
unsigned
NPerBlock
=
8
;
constexpr
unsigned
KPerBlock
=
64
;
constexpr
unsigned
CPerBlock
=
2
;
constexpr
unsigned
CPerBlock
=
1
;
constexpr
unsigned
HoPerBlock
=
4
;
constexpr
unsigned
WoPerBlock
=
4
;
...
...
@@ -152,6 +158,12 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
constexpr
unsigned
HoPerThread
=
1
;
constexpr
unsigned
WoPerThread
=
1
;
constexpr
unsigned
WeiBlockCopyThreadPerDim0
=
4
;
constexpr
unsigned
WeiBlockCopyThreadPerDim1
=
32
;
constexpr
unsigned
InBlockCopyDataPerRead
=
4
;
// not used, yet
constexpr
unsigned
WeiBlockCopyDataPerRead
=
4
;
constexpr
unsigned
BlockSize
=
128
;
#elif 0
// for 3x3, 56x56
...
...
src/include/ConstantMatrixDescriptor.cuh
View file @
abf75ac0
...
...
@@ -4,7 +4,7 @@
template
<
unsigned
NRow_
,
unsigned
NCol_
,
unsigned
RowStride_
>
struct
ConstantMatrixDescriptor
{
__host__
__device__
ConstantMatrixDescriptor
()
__host__
__device__
constexpr
ConstantMatrixDescriptor
()
{
static_assert
(
NCol_
<=
RowStride_
,
"wrong! NCol > RowStride!"
);
}
...
...
src/include/blockwise_gemm.cuh
View file @
abf75ac0
...
...
@@ -124,12 +124,12 @@ struct Blockwise1dStridedBatchedGemmBlockABlockBThreadC
{
if
(
TransA
&&
(
!
TransB
)
&&
(
!
TransC
))
{
constexpr
auto
True
=
C
onstant
<
bool
,
true
>
{};
constexpr
auto
False
=
C
onstant
<
bool
,
false
>
{};
constexpr
auto
True
=
integral_c
onstant
<
bool
,
true
>
{};
constexpr
auto
False
=
integral_c
onstant
<
bool
,
false
>
{};
const
auto
a_block_mtx
=
BlockMatrixA
{};
// constexpr doesn't compile
const
auto
b_block_mtx
=
BlockMatrixB
{};
// constexpr doesn't compile
const
auto
c_thread_mtx
=
ThreadMatrixC
{};
// constexpr doesn't compile
const
expr
auto
a_block_mtx
=
BlockMatrixA
{};
const
expr
auto
b_block_mtx
=
BlockMatrixB
{};
const
expr
auto
c_thread_mtx
=
ThreadMatrixC
{};
constexpr
unsigned
KPerBlock
=
a_block_mtx
.
NRow
();
// A is transposed
...
...
@@ -137,11 +137,11 @@ struct Blockwise1dStridedBatchedGemmBlockABlockBThreadC
constexpr
unsigned
NPerThread
=
c_thread_mtx
.
NCol
();
// a is transposed, b is not
const
auto
a_thread_mtx
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThreadLoop
>
{},
Number
<
MPerThread
>
{});
// constexpr doesn't compile
const
expr
auto
a_thread_mtx
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThreadLoop
>
{},
Number
<
MPerThread
>
{});
const
auto
b_thread_mtx
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThreadLoop
>
{},
Number
<
NPerThread
>
{});
// constexpr doesn't compile
const
expr
auto
b_thread_mtx
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThreadLoop
>
{},
Number
<
NPerThread
>
{});
FloatA
p_a_thread
[
a_thread_mtx
.
GetElementSpace
()];
FloatB
p_b_thread
[
b_thread_mtx
.
GetElementSpace
()];
...
...
@@ -278,8 +278,8 @@ struct BlockwiseGemmBlockABlockBThreadC
if
(
TransA
&&
(
!
TransB
)
&&
(
!
TransC
))
{
const
auto
a_block_mtx
=
BlockMatrixA
{};
// constexpr doesn't compile
const
auto
b_block_mtx
=
BlockMatrixB
{};
// constexpr doesn't compile
const
expr
auto
a_block_mtx
=
BlockMatrixA
{};
// constexpr doesn't compile
const
expr
auto
b_block_mtx
=
BlockMatrixB
{};
// constexpr doesn't compile
static_assert
(
a_block_mtx
.
NRow
()
==
b_block_mtx
.
NRow
(),
"wrong! k dimension not consistent!"
);
...
...
@@ -287,7 +287,7 @@ struct BlockwiseGemmBlockABlockBThreadC
constexpr
unsigned
MPerBlock
=
a_block_mtx
.
NCol
();
constexpr
unsigned
NPerBlock
=
b_block_mtx
.
NCol
();
const
auto
c_thread_mtx
=
ThreadMatrixC
{};
// constexpr doesn't compile
const
expr
auto
c_thread_mtx
=
ThreadMatrixC
{};
// constexpr doesn't compile
// divide thread work
constexpr
unsigned
MPerThread
=
c_thread_mtx
.
NRow
();
...
...
@@ -374,12 +374,12 @@ struct BlockwiseGemmBlockABlockBThreadC
{
if
(
TransA
&&
(
!
TransB
)
&&
(
!
TransC
))
{
constexpr
auto
True
=
C
onstant
<
bool
,
true
>
{};
constexpr
auto
False
=
C
onstant
<
bool
,
false
>
{};
constexpr
auto
True
=
integral_c
onstant
<
bool
,
true
>
{};
constexpr
auto
False
=
integral_c
onstant
<
bool
,
false
>
{};
const
auto
a_block_mtx
=
BlockMatrixA
{};
// constexpr doesn't compile
const
auto
b_block_mtx
=
BlockMatrixB
{};
// constexpr doesn't compile
const
auto
c_thread_mtx
=
ThreadMatrixC
{};
// constexpr doesn't compile
const
expr
auto
a_block_mtx
=
BlockMatrixA
{};
const
expr
auto
b_block_mtx
=
BlockMatrixB
{};
const
expr
auto
c_thread_mtx
=
ThreadMatrixC
{};
constexpr
unsigned
KPerBlock
=
a_block_mtx
.
NRow
();
// A is transposed
...
...
@@ -387,11 +387,11 @@ struct BlockwiseGemmBlockABlockBThreadC
constexpr
unsigned
NPerThread
=
c_thread_mtx
.
NCol
();
// a is transposed, b is not
const
auto
a_thread_mtx
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThreadLoop
>
{},
Number
<
MPerThread
>
{});
// constexpr doesn't compile
const
expr
auto
a_thread_mtx
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThreadLoop
>
{},
Number
<
MPerThread
>
{});
const
auto
b_thread_mtx
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThreadLoop
>
{},
Number
<
NPerThread
>
{});
// constexpr doesn't compile
const
expr
auto
b_thread_mtx
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThreadLoop
>
{},
Number
<
NPerThread
>
{});
FloatA
p_a_thread
[
a_thread_mtx
.
GetElementSpace
()];
FloatB
p_b_thread
[
b_thread_mtx
.
GetElementSpace
()];
...
...
@@ -556,8 +556,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
FloatC
*
p_c_thread
,
Accumulator
f_accum
)
const
{
constexpr
auto
True
=
C
onstant
<
bool
,
true
>
{};
constexpr
auto
False
=
C
onstant
<
bool
,
false
>
{};
constexpr
auto
True
=
integral_c
onstant
<
bool
,
true
>
{};
constexpr
auto
False
=
integral_c
onstant
<
bool
,
false
>
{};
const
auto
a_block_mtx
=
BlockMatrixA
{};
// constexpr doesn't compile
const
auto
b_block_mtx
=
BlockMatrixB
{};
// constexpr doesn't compile
...
...
@@ -648,12 +648,12 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
FloatC
*
p_c_thread
,
Accumulator
f_accum
)
const
{
constexpr
auto
True
=
C
onstant
<
bool
,
true
>
{};
constexpr
auto
False
=
C
onstant
<
bool
,
false
>
{};
constexpr
auto
True
=
integral_c
onstant
<
bool
,
true
>
{};
constexpr
auto
False
=
integral_c
onstant
<
bool
,
false
>
{};
const
auto
a_block_mtx
=
BlockMatrixA
{};
// constexpr doesn't compile
const
auto
b_block_mtx
=
BlockMatrixB
{};
// constexpr doesn't compile
const
auto
c_thread_mtx
=
ThreadMatrixC
{};
// constexpr doesn't compile
const
expr
auto
a_block_mtx
=
BlockMatrixA
{};
const
expr
auto
b_block_mtx
=
BlockMatrixB
{};
const
expr
auto
c_thread_mtx
=
ThreadMatrixC
{};
constexpr
unsigned
M
=
a_block_mtx
.
NCol
();
constexpr
unsigned
N
=
b_block_mtx
.
NCol
();
...
...
@@ -663,22 +663,18 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
constexpr
unsigned
NPerThread
=
c_thread_mtx
.
NCol
();
// thread A, B for GEMM
const
auto
a_thread_mtx
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThreadLoop
>
{},
Number
<
MPerThread
>
{});
// constexpr doesn't compile
const
expr
auto
a_thread_mtx
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThreadLoop
>
{},
Number
<
MPerThread
>
{});
const
auto
b_thread_mtx
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThreadLoop
>
{},
Number
<
NPerThread
>
{});
// constexpr doesn't compile
const
expr
auto
b_thread_mtx
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThreadLoop
>
{},
Number
<
NPerThread
>
{});
// thread A-sub, B-sub for copy
const
auto
a_thread_sub_mtx
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThreadLoop
>
{},
Number
<
MPerThreadSubC
>
{},
Number
<
MPerThread
>
{});
// constexpr doesn't compile
constexpr
auto
a_thread_sub_mtx
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThreadLoop
>
{},
Number
<
MPerThreadSubC
>
{},
Number
<
MPerThread
>
{});
const
auto
b_thread_sub_mtx
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThreadLoop
>
{},
Number
<
NPerThreadSubC
>
{},
Number
<
NPerThread
>
{});
// constexpr doesn't compile
constexpr
auto
b_thread_sub_mtx
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThreadLoop
>
{},
Number
<
NPerThreadSubC
>
{},
Number
<
NPerThread
>
{});
// register
FloatA
p_a_thread_0
[
a_thread_mtx
.
GetElementSpace
()];
...
...
src/include/common.cuh
View file @
abf75ac0
#pragma once
#define WARPSIZE 32;
__device__
unsigned
get_thread_local_1d_id
()
{
return
threadIdx
.
x
;
}
__device__
unsigned
get_block_1d_id
()
{
return
blockIdx
.
x
;
}
template
<
class
T1
,
class
T2
>
struct
is_same
...
...
@@ -14,20 +16,16 @@ struct is_same<T, T>
static
const
bool
value
=
true
;
};
__device__
unsigned
get_thread_local_1d_id
()
{
return
threadIdx
.
x
;
}
__device__
unsigned
get_block_1d_id
()
{
return
blockIdx
.
x
;
}
template
<
class
T
,
T
N
>
struct
C
onstant
struct
integral_c
onstant
{
static
const
T
mV
alue
=
N
;
static
const
T
v
alue
=
N
;
__host__
__device__
constexpr
T
Get
()
const
{
return
mV
alue
;
}
__host__
__device__
constexpr
T
Get
()
const
{
return
v
alue
;
}
};
template
<
unsigned
N
>
using
Number
=
C
onstant
<
unsigned
,
N
>
;
using
Number
=
integral_c
onstant
<
unsigned
,
N
>
;
template
<
unsigned
...
Is
>
struct
Sequence
...
...
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
View file @
abf75ac0
...
...
@@ -156,18 +156,16 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(InGlobalDesc,
// A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K]
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
const
auto
a_cxk_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{},
Number
<
wei_csrk_block_desc
.
GetStride
(
I0
)
>
{});
// constexpr doesn't compile
constexpr
auto
a_cxk_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{},
Number
<
wei_csrk_block_desc
.
GetStride
(
I0
)
>
{});
const
auto
b_cxwn_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
const
expr
auto
b_cxwn_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
WoPerBlock
*
NPerBlock
>
{},
Number
<
in_chwn_block_desc
.
GetStride
(
I0
)
>
{});
// constexpr doesn't compile
Number
<
in_chwn_block_desc
.
GetStride
(
I0
)
>
{});
const
auto
c_kxwn_thread_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThread
>
{},
Number
<
WoPerThread
*
NPerThread
>
{});
// constexpr doesn't compile
const
expr
auto
c_kxwn_thread_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThread
>
{},
Number
<
WoPerThread
*
NPerThread
>
{});
const
auto
blockwise_batch_gemm
=
Blockwise1dStridedBatchedGemmBlockABlockBThreadC
<
BlockSize
,
...
...
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh
View file @
abf75ac0
...
...
@@ -176,18 +176,16 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri
// A_matrix[C,K] is a sub-matrix of wei_block[C,S,R,K]
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
const
auto
a_cxk_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{},
Number
<
wei_csrk_block_desc
.
GetStride
(
I0
)
>
{});
// constexpr doesn't compile
constexpr
auto
a_cxk_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{},
Number
<
wei_csrk_block_desc
.
GetStride
(
I0
)
>
{});
const
auto
b_cxwn_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
const
expr
auto
b_cxwn_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
WoPerBlock
*
NPerBlock
>
{},
Number
<
in_chwn_block_desc
.
GetStride
(
I0
)
>
{});
// constexpr doesn't compile
Number
<
in_chwn_block_desc
.
GetStride
(
I0
)
>
{});
const
auto
c_kxwn_thread_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThread
>
{},
Number
<
WoPerThread
*
NPerThread
>
{});
// constexpr doesn't compile
const
expr
auto
c_kxwn_thread_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThread
>
{},
Number
<
WoPerThread
*
NPerThread
>
{});
const
auto
blockwise_batch_gemm
=
Blockwise1dStridedBatchedGemmBlockABlockBThreadC
<
BlockSize
,
...
...
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh
View file @
abf75ac0
...
...
@@ -176,18 +176,16 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
// A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K]
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
const
auto
a_cxk_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{},
Number
<
wei_csrk_block_desc
.
GetStride
(
I0
)
>
{});
// constexpr doesn't compile
constexpr
auto
a_cxk_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{},
Number
<
wei_csrk_block_desc
.
GetStride
(
I0
)
>
{});
const
auto
b_cxwn_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
const
expr
auto
b_cxwn_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
WoPerBlock
*
NPerBlock
>
{},
Number
<
in_chwn_block_desc
.
GetStride
(
I0
)
>
{});
// constexpr doesn't compile
Number
<
in_chwn_block_desc
.
GetStride
(
I0
)
>
{});
const
auto
c_kxwn_thread_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThread
>
{},
Number
<
WoPerThread
*
NPerThread
>
{});
// constexpr doesn't compile
const
expr
auto
c_kxwn_thread_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThread
>
{},
Number
<
WoPerThread
*
NPerThread
>
{});
const
auto
blockwise_batch_gemm
=
Blockwise1dStridedBatchedGemmBlockABlockBThreadC
<
BlockSize
,
...
...
src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh
View file @
abf75ac0
...
...
@@ -115,16 +115,16 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr(InGlobalDesc,
// A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K]
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
const
auto
a_cxk_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{});
// constexpr doesn't compile
const
expr
auto
a_cxk_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{});
const
auto
b_cxwn_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
const
expr
auto
b_cxwn_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
WoPerBlock
*
NPerBlock
>
{},
Number
<
in_chwn_block_desc
.
GetStride
(
I0
)
>
{});
// constexpr doesn't compile
Number
<
in_chwn_block_desc
.
GetStride
(
I0
)
>
{});
const
auto
c_kxwn_thread_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThread
>
{},
Number
<
WoPerThread
*
NPerThread
>
{});
// constexpr doesn't compile
const
expr
auto
c_kxwn_thread_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThread
>
{},
Number
<
WoPerThread
*
NPerThread
>
{});
const
auto
blockwise_batch_gemm
=
Blockwise1dStridedBatchedGemmBlockABlockBThreadC
<
BlockSize
,
...
...
src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh
View file @
abf75ac0
...
...
@@ -121,16 +121,16 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc,
// A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K]
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
const
auto
a_cxk_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{});
// constexpr doesn't compile
const
expr
auto
a_cxk_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{});
const
auto
b_cxwn_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
const
expr
auto
b_cxwn_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
WoPerBlock
*
NPerBlock
>
{},
Number
<
in_chwn_block_desc
.
GetStride
(
I0
)
>
{});
// constexpr doesn't compile
Number
<
in_chwn_block_desc
.
GetStride
(
I0
)
>
{});
const
auto
c_kxwn_thread_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThread
>
{},
Number
<
WoPerThread
*
NPerThread
>
{});
// constexpr doesn't compile
const
expr
auto
c_kxwn_thread_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThread
>
{},
Number
<
WoPerThread
*
NPerThread
>
{});
const
auto
blockwise_batch_gemm
=
Blockwise1dStridedBatchedGemmBlockABlockBThreadC
<
BlockSize
,
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
View file @
abf75ac0
...
...
@@ -172,18 +172,14 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
// a_mtx[C,K] is a sub-matrix of wei_block[C,S,R,K]
// b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
// c_mtx[K,B] is out_block[K,B]
const
auto
a_cxk_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{},
Number
<
wei_csrk_block_desc
.
GetStride
(
I0
)
>
{});
// constexpr doesn't compile
constexpr
auto
a_cxk_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{},
Number
<
wei_csrk_block_desc
.
GetStride
(
I0
)
>
{});
const
auto
b_cxb_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
BPerBlock
>
{},
Number
<
in_cb_block_desc
.
GetStride
(
I0
)
>
{});
// constexpr doesn't compile
constexpr
auto
b_cxb_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
BPerBlock
>
{},
Number
<
in_cb_block_desc
.
GetStride
(
I0
)
>
{});
const
auto
c_kxb_thread_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThread
>
{},
Number
<
BPerThread
>
{});
// constexpr doesn't compile
const
expr
auto
c_kxb_thread_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThread
>
{},
Number
<
BPerThread
>
{});
#if 0
const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC<BlockSize,
...
...
@@ -258,7 +254,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
{
auto
f_accum
=
[](
auto
&
acc
,
const
auto
&&
v
)
{
acc
+=
v
;
};
#if
0
#if
1
blockwise_gemm
.
Run
#else
blockwise_gemm
.
Run_RegisterDoubleBuffer
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh
View file @
abf75ac0
...
...
@@ -172,18 +172,14 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
// a_mtx[C,K] is a sub-matrix of wei_block[C,S,R,K]
// b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
// c_mtx[K,B] is out_block[K,B]
const
auto
a_cxk_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{},
Number
<
wei_csrk_block_desc
.
GetStride
(
I0
)
>
{});
// constexpr doesn't compile
const
auto
b_cxb_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
BPerBlock
>
{},
Number
<
in_cb_block_desc
.
GetStride
(
I0
)
>
{});
// constexpr doesn't compile
const
auto
c_kxb_thread_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThread
>
{},
Number
<
BPerThread
>
{});
// constexpr doesn't compile
constexpr
auto
a_cxk_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{},
Number
<
wei_csrk_block_desc
.
GetStride
(
I0
)
>
{});
constexpr
auto
b_cxb_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
BPerBlock
>
{},
Number
<
in_cb_block_desc
.
GetStride
(
I0
)
>
{});
constexpr
auto
c_kxb_thread_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThread
>
{},
Number
<
BPerThread
>
{});
#if 0
const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC<BlockSize,
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
View file @
abf75ac0
...
...
@@ -140,16 +140,14 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
// a_mtx[C,K] is a sub-matrix of wei_block[S,R,C,K]
// b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
// c_mtx[K,B] is out_block[K,B]
const
auto
a_cxk_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{});
// constexpr doesn't compile
const
expr
auto
a_cxk_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{});
const
auto
b_cxb_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
BPerBlock
>
{},
Number
<
in_cb_block_desc
.
GetStride
(
I0
)
>
{});
// constexpr doesn't compile
constexpr
auto
b_cxb_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
BPerBlock
>
{},
Number
<
in_cb_block_desc
.
GetStride
(
I0
)
>
{});
const
auto
c_kxb_thread_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThread
>
{},
Number
<
BPerThread
>
{});
// constexpr doesn't compile
const
expr
auto
c_kxb_thread_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThread
>
{},
Number
<
BPerThread
>
{});
const
auto
blockwise_gemm
=
BlockwiseGemmBlockABlockBThreadC
<
BlockSize
,
decltype
(
a_cxk_block_mtx_desc
),
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh
View file @
abf75ac0
...
...
@@ -156,16 +156,14 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
// a_mtx[C,K] is a sub-matrix of wei_block[S,R,C,K]
// b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
// c_mtx[K,B] is out_block[K,B]
const
auto
a_cxk_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{});
// constexpr doesn't compile
const
expr
auto
a_cxk_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
KPerBlock
>
{});
const
auto
b_cxb_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
BPerBlock
>
{},
Number
<
in_cb_block_desc
.
GetStride
(
I0
)
>
{});
// constexpr doesn't compile
constexpr
auto
b_cxb_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
CPerBlock
>
{},
Number
<
BPerBlock
>
{},
Number
<
in_cb_block_desc
.
GetStride
(
I0
)
>
{});
const
auto
c_kxb_thread_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThread
>
{},
Number
<
BPerThread
>
{});
// constexpr doesn't compile
const
expr
auto
c_kxb_thread_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
KPerThread
>
{},
Number
<
BPerThread
>
{});
const
auto
blockwise_gemm
=
BlockwiseGemmBlockABlockBThreadC
<
BlockSize
,
decltype
(
a_cxk_block_mtx_desc
),
...
...
src/include/threadwise_gemm.cuh
View file @
abf75ac0
...
...
@@ -30,13 +30,13 @@ template <class MatrixA,
class
FloatC
,
class
Accumulator
>
__device__
void
threadwise_gemm
(
MatrixA
,
C
onstant
<
bool
,
TransA
>
,
integral_c
onstant
<
bool
,
TransA
>
,
FloatA
*
const
p_a_thread
,
MatrixB
,
C
onstant
<
bool
,
TransB
>
,
integral_c
onstant
<
bool
,
TransB
>
,
FloatB
*
const
p_b_thread
,
MatrixC
,
C
onstant
<
bool
,
TransC
>
,
integral_c
onstant
<
bool
,
TransC
>
,
FloatC
*
p_c_thread
,
Accumulator
f_accum
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment