yangql / composable_kernel-1 / Commits / 079d63a7

Commit 079d63a7 authored Feb 05, 2019 by Chao Liu

bug fixes

parent 42f4c7fd
Showing 9 changed files with 151 additions and 87 deletions (+151, -87):
    driver/conv.cu                                                                    +2   -4
    driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh                     +25   -2
    driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh                     +20  -29
    src/include/ConstantTensorDescriptor.cuh                                         +25   -0
    src/include/blockwise_2d_tensor_op.cuh                                           +43  -23
    src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh        +1   -1
    src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh              +33  -26
    src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline.cuh  +1   -1
    src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh               +1   -1
driver/conv.cu

@@ -389,8 +389,8 @@ int main()
     constexpr unsigned S = 3;
     constexpr unsigned R = 3;
-    constexpr unsigned HPad = 1;
-    constexpr unsigned WPad = 1;
+    constexpr unsigned HPad = 0;
+    constexpr unsigned WPad = 0;
 #elif 0
     // 3x3, 34x34
     constexpr unsigned N = 64;

@@ -593,8 +593,6 @@ int main()
     device_implicit_gemm_convolution_2_cnhw_srck_knhw
 #elif 1
     device_implicit_gemm_convolution_2_cnhw_csrk_knhw
-#elif 0
-    device_winograd_convolution
 #endif
     (in_nchw_desc,
      in_nchw,
      wei_kcsr_desc,
      wei_kcsr,
      out_nkhw_desc,
      out_nkhw_device,
      nrepeat);
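The first hunk turns padding off for the active test case, and the second drops the unused Winograd dispatch branch. For unit stride, the output extent follows the usual padded-convolution arithmetic; a minimal host-side sketch of that formula (standard arithmetic, not code from this commit):

    // Output extent of a unit-stride convolution: in + 2*pad - filter + 1.
    constexpr unsigned out_size(unsigned in, unsigned filter, unsigned pad)
    {
        return in + 2 * pad - filter + 1;
    }
    static_assert(out_size(34, 3, 1) == 34, "HPad = 1 keeps a 34x34 input at 34x34");
    static_assert(out_size(34, 3, 0) == 32, "HPad = 0 shrinks it to 32x32");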
driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh

@@ -67,7 +67,30 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
     Tensor<T> out_knhw(make_TensorDescriptor(out_knhw_desc));

-#if 1
+#if 0
+    // 3x3, 34x34
+    constexpr unsigned BPerBlock = 128;
+    constexpr unsigned KPerBlock = 64;
+    constexpr unsigned CPerBlock = 4;
+    constexpr unsigned BPerThread = 4;
+    constexpr unsigned KPerThread = 16;
+    constexpr unsigned CPerThread = 1;
+    constexpr unsigned GemmThreadPerColumnPerCluster = 4;
+    constexpr unsigned GemmThreadPerRowPerCluster    = 8;
+    constexpr unsigned InBlockCopyThreadPerDim0 = 4;
+    constexpr unsigned InBlockCopyThreadPerDim1 = 16;
+    constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
+    constexpr unsigned WeiBlockCopyThreadPerDim1 = 16;
+    constexpr unsigned InBlockCopyDataPerRead  = 2;
+    constexpr unsigned WeiBlockCopyDataPerRead = 4;
+    constexpr unsigned BlockSize = 128;
+#elif 1
     // 1x1, 28x28
     constexpr unsigned BPerBlock = 64;
     constexpr unsigned KPerBlock = 64;

@@ -120,7 +143,7 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
 #if 1
     gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw
-#elif 0
+#elif 1
     gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline
 #endif
         <GridSize,
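The new 3x3 tuning block is only consistent if the copy-thread grid fits inside the thread block and the per-thread tiles divide the per-block tiles. A hypothetical compile-time check (these asserts are not in the commit; the values are copied from the block above):

    constexpr unsigned BPerBlock = 128, KPerBlock = 64;
    constexpr unsigned BPerThread = 4, KPerThread = 16;
    constexpr unsigned InBlockCopyThreadPerDim0 = 4, InBlockCopyThreadPerDim1 = 16;
    constexpr unsigned BlockSize = 128;

    static_assert(BPerBlock % BPerThread == 0 && KPerBlock % KPerThread == 0,
                  "per-thread tile must divide the per-block tile");
    static_assert(InBlockCopyThreadPerDim0 * InBlockCopyThreadPerDim1 <= BlockSize,
                  "copy-thread grid must fit in the thread block"); // 4 * 16 = 64 <= 128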
driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh

@@ -68,64 +68,55 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
     Tensor<T> out_knhw(make_TensorDescriptor(out_knhw_desc));

 #if 0
-    constexpr unsigned BPerBlock = 256;
+    constexpr unsigned BPerBlock = 128;
     constexpr unsigned KPerBlock = 1;
     constexpr unsigned CPerBlock = 1;
-    constexpr unsigned BPerThread = 8;
+    constexpr unsigned BPerThread = 4;
     constexpr unsigned KPerThread = 1;
     constexpr unsigned CPerThread = 1;
     constexpr unsigned GemmThreadPerColumnPerCluster = 1;
-    constexpr unsigned GemmThreadPerRowPerCluster    = 4;
+    constexpr unsigned GemmThreadPerRowPerCluster    = 1;
+    constexpr unsigned InBlockCopyThreadPerDim0 = 4;
+    constexpr unsigned InBlockCopyThreadPerDim1 = 16;
     constexpr unsigned BlockSize = 32;
-#elif 0
-    constexpr unsigned BPerBlock = 128;
-    constexpr unsigned KPerBlock = 64;
-    constexpr unsigned CPerBlock = 2;
-    constexpr unsigned BPerThread = 8;
-    constexpr unsigned KPerThread = 8;
-    constexpr unsigned CPerThread = 1;
-    constexpr unsigned GemmThreadPerColumnPerCluster = 4;
-    constexpr unsigned GemmThreadPerRowPerCluster    = 4;
-    constexpr unsigned BlockSize = 128;
-#elif 0
+#elif 1
+    // 3x3, 34x34
     constexpr unsigned BPerBlock = 128;
     constexpr unsigned KPerBlock = 64;
-    constexpr unsigned CPerBlock = 2;
+    constexpr unsigned CPerBlock = 4;
-    constexpr unsigned BPerThread = 8;
+    constexpr unsigned BPerThread = 4;
-    constexpr unsigned KPerThread = 8;
+    constexpr unsigned KPerThread = 16;
     constexpr unsigned CPerThread = 1;
     constexpr unsigned GemmThreadPerColumnPerCluster = 4;
-    constexpr unsigned GemmThreadPerRowPerCluster    = 4;
+    constexpr unsigned GemmThreadPerRowPerCluster    = 8;
-    constexpr unsigned InBlockCopyThreadPerDim0 = 2;
+    constexpr unsigned InBlockCopyThreadPerDim0 = 4;
-    constexpr unsigned InBlockCopyThreadPerDim1 = 64;
+    constexpr unsigned InBlockCopyThreadPerDim1 = 16;
     constexpr unsigned BlockSize = 128;
 #elif 1
     // 1x1, 28x28
     constexpr unsigned BPerBlock = 64;
-    constexpr unsigned KPerBlock = 128;
+    constexpr unsigned KPerBlock = 64;
     constexpr unsigned CPerBlock = 8;
     constexpr unsigned BPerThread = 4;
     constexpr unsigned KPerThread = 16;
-    constexpr unsigned CPerThread = 2;
+    constexpr unsigned CPerThread = 1;
-    constexpr unsigned GemmThreadPerColumnPerCluster = 8;
+    constexpr unsigned GemmThreadPerColumnPerCluster = 4;
     constexpr unsigned GemmThreadPerRowPerCluster    = 8;
-    constexpr unsigned InBlockCopyThreadPerDim0 = 8;
+    constexpr unsigned InBlockCopyThreadPerDim0 = 4;
     constexpr unsigned InBlockCopyThreadPerDim1 = 16;
-    constexpr unsigned BlockSize = 128;
+    constexpr unsigned BlockSize = 64;
 #endif

     constexpr unsigned GridSize =
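The hunk cuts off at the GridSize computation. Given that the gridwise kernels partition work by k-block and b-block ids (see the debug printf removed from the csrk kernel below), a plausible reading is one thread block per K x B tile. This is an assumption; the truncated expression is not shown in the diff:

    // Assumed shape of the truncated GridSize expression: one block per tile.
    // K = output channels, B = flattened N*Ho*Wo output pixels (names assumed).
    constexpr unsigned K = 256, B = 64 * 28 * 28;      // example sizes, not from the commit
    constexpr unsigned KPerBlock = 64, BPerBlock = 64; // from the 1x1, 28x28 block above
    constexpr unsigned GridSize = (K / KPerBlock) * (B / BPerBlock);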
src/include/ConstantTensorDescriptor.cuh

@@ -15,6 +15,24 @@ __host__ __device__ constexpr auto calculate_default_strides(Sequence<L0, L1, L2
     return Sequence<L1 * L2 * L3, L2 * L3, L3, 1>{};
 }

+// this is ugly, only for 2d
+template <unsigned L0, unsigned L1, unsigned Align>
+__host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence<L0, L1>,
+                                                                     Number<Align>)
+{
+    constexpr unsigned L1_align = Align * ((L1 + Align - 1) / Align);
+    return Sequence<L1_align, 1>{};
+}
+
+// this is ugly, only for 4d
+template <unsigned L0, unsigned L1, unsigned L2, unsigned L3, unsigned Align>
+__host__ __device__ constexpr auto
+calculate_default_strides_aligned(Sequence<L0, L1, L2, L3>, Number<Align>)
+{
+    constexpr unsigned L3_align = Align * ((L3 + Align - 1) / Align);
+    return Sequence<L1 * L2 * L3_align, L2 * L3_align, L3_align, 1>{};
+}
+
 // this is ugly, only for 4d
 template <unsigned S0, unsigned S1, unsigned S2, unsigned S3>
 __host__ __device__ constexpr auto calculate_full_lengths(Sequence<S0, S1, S2, S3>)

@@ -170,6 +188,13 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride
     return ConstantTensorDescriptor<Lengths, Strides>{};
 }

+template <class Lengths, unsigned Align>
+__host__ __device__ constexpr auto make_ConstantTensorDescriptor_aligned(Lengths, Number<Align>)
+{
+    using Strides = decltype(calculate_default_strides_aligned(Lengths{}, Number<Align>{}));
+    return ConstantTensorDescriptor<Lengths, Strides>{};
+}
+
 template <class TDesc>
 __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s)
 {
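The new calculate_default_strides_aligned overloads round the innermost length up to the next multiple of Align and derive all outer strides from the padded value, so every row of a tile starts on a boundary suitable for float2/float4 vector access; make_ConstantTensorDescriptor_aligned just wires that into a descriptor. A standalone sketch of the same arithmetic (example lengths are mine):

    // Round the innermost length up to a multiple of Align, as the new
    // calculate_default_strides_aligned does, and use it as the row stride.
    constexpr unsigned round_up(unsigned len, unsigned align)
    {
        return align * ((len + align - 1) / align);
    }

    // e.g. a [4, 130] tile with Align = 4: rows are padded to 132 elements,
    // so the strides become [132, 1] instead of the tight [130, 1].
    static_assert(round_up(130, 4) == 132, "row stride padded for alignment");
    static_assert(round_up(130, 4) % 4 == 0, "every row starts float4-aligned");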
src/include/blockwise_2d_tensor_op.cuh

@@ -203,6 +203,9 @@ struct Blockwise2dTensorCopy2
 {
     static_assert(is_same<Float, float>::value, "wrong! only support float!\n");

+    using Float4 = float4;
+    using Float2 = float2;
+
     if(get_thread_local_1d_id() >= ThreadPerDim0 * ThreadPerDim1)
         return;

@@ -212,18 +215,28 @@ struct Blockwise2dTensorCopy2
     constexpr auto src_desc = SrcDesc{};
     constexpr auto dst_desc = DstDesc{};

+    // check alignment
+    constexpr bool align_v4 =
+        src_desc.GetStride(I0) % 4 == 0 && dst_desc.GetStride(I0) % 4 == 0;
+    constexpr bool align_v2 =
+        src_desc.GetStride(I0) % 2 == 0 && dst_desc.GetStride(I0) % 2 == 0;
+
     constexpr unsigned L0 = SrcOpLengths{}.Get(I0);
     constexpr unsigned L1 = SrcOpLengths{}.Get(I1);

     constexpr unsigned Dim0Loop = L0 / ThreadPerDim0;
     constexpr bool d0_has_tail = (L0 > ThreadPerDim0 * Dim0Loop);

-    constexpr unsigned Dim1V4Loop = L1 / (ThreadPerDim1 * 4);
+    constexpr unsigned Dim1V4Loop = align_v4 ? L1 / (ThreadPerDim1 * 4) : 0;

     constexpr unsigned Dim1V2Loop =
-        (L1 - Dim1V4Loop * (ThreadPerDim1 * 4)) / (ThreadPerDim1 * 2);
+        align_v2 ? (L1 - Dim1V4Loop * (ThreadPerDim1 * 4)) / (ThreadPerDim1 * 2) : 0;

     constexpr unsigned Dim1V1Loop =
         (L1 - Dim1V4Loop * (ThreadPerDim1 * 4) - Dim1V2Loop * (ThreadPerDim1 * 2)) /
         ThreadPerDim1;

     constexpr bool d1_has_tail =
         (L1 > ThreadPerDim1 * (4 * Dim1V4Loop + 2 * Dim1V2Loop + Dim1V1Loop));

@@ -239,8 +252,8 @@ struct Blockwise2dTensorCopy2
     const unsigned sindex = src_desc.Get1dIndex(did0, did1);
     const unsigned dindex = dst_desc.Get1dIndex(did0, did1);

-    *(reinterpret_cast<float4*>(p_dst + dindex)) = *(reinterpret_cast<float4*>(p_src + sindex));
+    *(reinterpret_cast<Float4*>(p_dst + dindex)) = *(reinterpret_cast<Float4*>(p_src + sindex));
 }

 // v2

@@ -252,8 +265,8 @@ struct Blockwise2dTensorCopy2
     const unsigned sindex = src_desc.Get1dIndex(did0, did1);
     const unsigned dindex = dst_desc.Get1dIndex(did0, did1);

-    *(reinterpret_cast<float2*>(p_dst + dindex)) = *(reinterpret_cast<float2*>(p_src + sindex));
+    *(reinterpret_cast<Float2*>(p_dst + dindex)) = *(reinterpret_cast<Float2*>(p_src + sindex));
 }

 // v1

@@ -300,8 +313,8 @@ struct Blockwise2dTensorCopy2
     const unsigned sindex = src_desc.Get1dIndex(did0, did1);
     const unsigned dindex = dst_desc.Get1dIndex(did0, did1);

-    *(reinterpret_cast<float4*>(p_dst + dindex)) = *(reinterpret_cast<float4*>(p_src + sindex));
+    *(reinterpret_cast<Float4*>(p_dst + dindex)) = *(reinterpret_cast<Float4*>(p_src + sindex));
 }

 // v2

@@ -313,8 +326,8 @@ struct Blockwise2dTensorCopy2
     const unsigned sindex = src_desc.Get1dIndex(did0, did1);
     const unsigned dindex = dst_desc.Get1dIndex(did0, did1);

-    *(reinterpret_cast<float2*>(p_dst + dindex)) = *(reinterpret_cast<float2*>(p_src + sindex));
+    *(reinterpret_cast<Float2*>(p_dst + dindex)) = *(reinterpret_cast<Float2*>(p_src + sindex));
 }

 // v1

@@ -356,7 +369,7 @@ template <unsigned BlockSize,
           class Float,
           class SrcDesc,
           class DstDesc,
-          class SrcOpLengths,
+          class CopyLengths,
           unsigned DataPerRead>
 struct Blockwise2dTensorCopy3
 {

@@ -374,16 +387,23 @@ struct Blockwise2dTensorCopy3
     static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
                   "wrong! only support DataPerRead == 1, 2 or 4!\n");

-    constexpr unsigned L0 = SrcOpLengths{}.Get(I0);
-    constexpr unsigned L1 = SrcOpLengths{}.Get(I1);
-    static_assert(L1 % DataPerRead == 0, "wrong! only support mod(L1, DataPerRead) == 0\n");
+    static_assert(SrcDesc{}.GetStride(I0) % DataPerRead == 0 &&
+                      DstDesc{}.GetStride(I0) % DataPerRead == 0,
+                  "src and dst stride should be multiple of DataPerRead to keep alignment");
+
+    constexpr unsigned L0 = CopyLengths{}.Get(I0);
+    constexpr unsigned L1 = CopyLengths{}.Get(I1);

-    constexpr unsigned thread_per_d1 = L1 / DataPerRead;
+    constexpr unsigned thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead;
     constexpr unsigned thread_per_d0 = BlockSize / thread_per_d1;

-    static_assert(thread_per_d1 <= BlockSize, "wrong! not enough threads to cover L1 dimension\n");
+    // we allow out-of-bound read from src in D1 dimension,
+    // but we need to make sure dst stride is big enough,
+    // so that the out-of-bound write won't overwrite next line
+    static_assert(thread_per_d1 * DataPerRead <= DstDesc{}.GetStride(I0),
+                  "wrong! out-of-bound write will overwrite next line!\n");
+
+    static_assert(thread_per_d0 >= 1, "wrong! not enough threads to cover L1 dimension\n");

     const unsigned thread_id_d0 = get_thread_local_1d_id() / thread_per_d1;
     const unsigned thread_id_d1 = get_thread_local_1d_id() - thread_id_d0 * thread_per_d1;

@@ -402,17 +422,17 @@ struct Blockwise2dTensorCopy3
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};

-    constexpr unsigned L0 = SrcOpLengths{}.Get(I0);
-    constexpr unsigned L1 = SrcOpLengths{}.Get(I1);
+    constexpr unsigned L0 = CopyLengths{}.Get(I0);
+    constexpr unsigned L1 = CopyLengths{}.Get(I1);

-    constexpr unsigned thread_per_d1 = L1 / DataPerRead;
+    constexpr unsigned thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead;
     constexpr unsigned thread_per_d0 = BlockSize / thread_per_d1;

     constexpr unsigned num_active_thread = thread_per_d0 * thread_per_d1;

     if(BlockSize > num_active_thread)
     {
-        if(get_thread_local_1d_id() > num_active_thread)
+        if(get_thread_local_1d_id() >= num_active_thread)
         {
             return;
         }

@@ -420,8 +440,6 @@ struct Blockwise2dTensorCopy3
     constexpr unsigned nloop_d0 = L0 / thread_per_d0;
-    constexpr bool has_tail_d0 = (L0 > nloop_d0 * thread_per_d0);

     constexpr unsigned src_loop_stride = SrcDesc{}.GetStride(I0) * thread_per_d0;
     constexpr unsigned dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;

@@ -450,6 +468,8 @@ struct Blockwise2dTensorCopy3
         }
     }

+    constexpr bool has_tail_d0 = (L0 > nloop_d0 * thread_per_d0);
+
     if(has_tail_d0)
     {
         constexpr unsigned tail_d0 = L0 - nloop_d0 * thread_per_d0;
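Two of the advertised bug fixes live here: thread_per_d1 becomes a ceiling division, so a copy length L1 that is not a multiple of DataPerRead is still fully covered (with the new static_asserts guaranteeing the resulting out-of-bound writes stay inside the padded destination row), and the idle-thread test changes from > to >=, since thread id num_active_thread itself must also return. A host-side sketch of the corrected mapping (values are examples):

    constexpr unsigned BlockSize = 128, L1 = 66, DataPerRead = 4;

    // ceiling division: 66 elements need 17 float4 lanes, not 16
    constexpr unsigned thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead; // 17
    constexpr unsigned thread_per_d0 = BlockSize / thread_per_d1;            // 7
    constexpr unsigned num_active_thread = thread_per_d0 * thread_per_d1;    // 119

    // fixed predicate: threads 119..127 are idle; with the old '>' test,
    // thread 119 would have run and read/written one row too far
    constexpr bool is_idle(unsigned tid) { return tid >= num_active_thread; }
    static_assert(is_idle(num_active_thread), "boundary thread must be idle");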
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh

@@ -173,7 +173,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri
     // a series of blockwise batched GEMM
     // C_matrix += transpose(A_matrix) * B_matrix
     // A_matrix and B_matrix saved in LDS, C_matrix saved in register
-    // A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K]
+    // A_matrix[C,K] is a sub-matrix of wei_block[C,S,R,K]
     // B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
     // C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
     const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh

@@ -70,35 +70,32 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
     const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
     const unsigned b_block_data_begin = b_block_work_id * BPerBlock;

-#if 0
-    if(get_thread_local_1d_id() == 0)
-    {
-        printf("K %u B %u, BGhostRead %u\n", K, B, BGhostRead);
-        printf("%u %u, KBlockWork %u BBlockWork %u, k_block_data_begin %u b_block_data_begin %u\n",
-               get_block_1d_id(),
-               get_thread_local_1d_id(),
-               KBlockWork,
-               BBlockWork,
-               k_block_data_begin,
-               b_block_data_begin);
-    }
-#endif
-
     // flattend (2d) tensor view of gridwise input
     constexpr auto in_cb_global_desc  = make_ConstantTensorDescriptor(Sequence<C, B>{});
     constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * S * R, K>{});

     // tensor view of blockwise input and weight
+#if 0
     constexpr auto in_cb_block_desc =
         make_ConstantTensorDescriptor(Sequence<CPerBlock, BPerBlock + BGhostRead>{});
+#else
+    constexpr auto in_cb_block_desc = make_ConstantTensorDescriptor_aligned(
+        Sequence<CPerBlock, BPerBlock + BGhostRead>{}, Number<InBlockCopyDataPerRead>{});
+#endif

+#if 0
     constexpr auto wei_ek_block_desc =
         make_ConstantTensorDescriptor(Sequence<CPerBlock * S * R, KPerBlock>{});
     constexpr auto wei_csrk_block_desc =
         make_ConstantTensorDescriptor(Sequence<CPerBlock, S, R, KPerBlock>{});
+#else
+    constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor_aligned(
+        Sequence<CPerBlock * S * R, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
+    constexpr auto wei_csrk_block_desc = make_ConstantTensorDescriptor_aligned(
+        Sequence<CPerBlock, S, R, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
+#endif

     // tensor view of threadwise output in register
     constexpr auto out_kb_thread_desc =

@@ -107,8 +104,16 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
 #if 0
     if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
     {
+        print_ConstantTensorDescriptor(in_cnhw_global_desc, "in_cnhw_global_desc");
+        print_ConstantTensorDescriptor(wei_csrk_global_desc, "wei_csrk_global_desc");
+        print_ConstantTensorDescriptor(out_knhw_global_desc, "out_knhw_global_desc");
+        print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc");
+        print_ConstantTensorDescriptor(wei_ek_global_desc, "wei_ek_global_desc");
         print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc");
         print_ConstantTensorDescriptor(wei_csrk_block_desc, "wei_csrk_block_desc");
+        print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc");
         print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc");

         printf("KPerBlock %u\n", KPerBlock);

@@ -120,10 +125,10 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
 #if 0
     const auto blockwise_in_copy =
         Blockwise2dTensorCopy1<BlockSize,
                                Float,
                                decltype(in_cb_global_desc),
                                decltype(in_cb_block_desc),
                                decltype(in_cb_block_desc.GetLengths())>{};
 #elif 0
     const auto blockwise_in_copy =
         Blockwise2dTensorCopy2<BlockSize,
                                Float,

@@ -170,11 +175,13 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
     // a series of blockwise GEMM
     // c_mtx += transpose(a_mtx) * b_mtx
     // a_mtx and b_mtx saved in LDS, c_mtx saved in register
-    // a_mtx[C,K] is a sub-matrix of wei_block[S,R,C,K]
+    // a_mtx[C,K] is a sub-matrix of wei_block[C,S,R,K]
     // b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
     // c_mtx[K,B] is out_block[K,B]
     const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
-        Number<CPerBlock>{}, Number<KPerBlock>{}); // constexpr doesn't compile
+        Number<CPerBlock>{},
+        Number<KPerBlock>{},
+        Number<wei_csrk_block_desc.GetStride(I0)>{}); // constexpr doesn't compile

     const auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
         Number<CPerBlock>{},

@@ -217,7 +224,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
     for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
         c_block_data_begin += CPerBlock,
         p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0),
-        p_wei_global_block_offset += CPerBlock * wei_csrk_global_desc.GetStride(I2),
+        p_wei_global_block_offset += CPerBlock * wei_csrk_global_desc.GetStride(I0),
         __syncthreads())
     {
         // input: global mem to LDS,

@@ -233,9 +240,9 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
     {
         for(unsigned r = 0; r < R; ++r)
         {
-            auto f_accum = [](auto& c, const auto&& ab) { c += ab; };
+            auto f_accum = [](auto& acc, const auto&& v) { acc += v; };

-            blockwise_gemm.Run(p_wei_block + wei_csrk_block_desc.Get1dIndex(s, r, 0, 0),
+            blockwise_gemm.Run(p_wei_block + wei_csrk_block_desc.Get1dIndex(0, s, r, 0),
                                p_in_block + s * Wi + r,
                                p_out_thread,
                                f_accum);
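The recurring Get1dIndex(s, r, 0, 0) -> Get1dIndex(0, s, r, 0) and GetStride(I2) -> GetStride(I0) edits all follow from the weight block layout moving from [S,R,C,K] to [C,S,R,K]: C is now the slowest dimension, so stepping one C-block forward advances by the dim-0 stride, and an (s,r) filter-tap slice is addressed with c = 0. A row-major sketch of the offsets involved (helper name and extents are mine):

    // Row-major 4d offset, a plain restatement of what Get1dIndex computes.
    constexpr unsigned offset4d(unsigned i0, unsigned i1, unsigned i2, unsigned i3,
                                unsigned d1, unsigned d2, unsigned d3)
    {
        return ((i0 * d1 + i1) * d2 + i2) * d3 + i3;
    }

    // wei_block as [C, S, R, K] with CPerBlock = 4, S = R = 3, KPerBlock = 64:
    static_assert(offset4d(1, 0, 0, 0, 3, 3, 64) == 3 * 3 * 64,
                  "one C step == GetStride(I0) elements");
    static_assert(offset4d(0, 1, 2, 0, 3, 3, 64) == (1 * 3 + 2) * 64,
                  "the (s, r) slice sits at Get1dIndex(0, s, r, 0)");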
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline.cuh

@@ -259,7 +259,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline
     {
         auto f_accum = [](auto& c, const auto&& ab) { c += ab; };

-        blockwise_gemm.Run(p_wei_block_now + wei_csrk_block_desc.Get1dIndex(s, r, 0, 0),
+        blockwise_gemm.Run(p_wei_block_now + wei_csrk_block_desc.Get1dIndex(0, s, r, 0),
                            p_in_block_now + s * Wi + r,
                            p_out_thread,
                            f_accum);
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
View file @
079d63a7
...
@@ -108,7 +108,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
...
@@ -108,7 +108,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
// blockwise in copy
// blockwise in copy
// formmat is [CPerBlock,BPerBlock + BGhostRead]
// formmat is [CPerBlock,BPerBlock + BGhostRead]
#if
1
#if
0
const auto blockwise_in_copy =
const auto blockwise_in_copy =
Blockwise2dTensorCopy1<BlockSize,
Blockwise2dTensorCopy1<BlockSize,
Float,
Float,
...
...