Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
42f4c7fd
Commit
42f4c7fd
authored
Feb 05, 2019
by
Chao Liu
Browse files
refactor
parent
6614729a
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
63 additions
and
31 deletions
+63
-31
driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
+5
-5
driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
+10
-10
src/include/blockwise_2d_tensor_op.cuh
src/include/blockwise_2d_tensor_op.cuh
+32
-0
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
...e/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
+4
-4
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline.cuh
...plicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline.cuh
+4
-4
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
...e/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
+4
-4
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh
...plicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh
+4
-4
No files found.
driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
View file @
42f4c7fd
...
...
@@ -77,8 +77,8 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
constexpr unsigned KPerThread = 16;
constexpr unsigned CPerThread = 1;
constexpr unsigned GemmRowThreadPerCluster = 4;
constexpr unsigned GemmColumnThreadPerCluster = 8;
constexpr unsigned GemmThreadPerColumnPerCluster = 4;
constexpr unsigned GemmThreadPerRowPerCluster = 8;
constexpr unsigned InBlockCopyThreadPerDim0 = 4;
constexpr unsigned InBlockCopyThreadPerDim1 = 16;
...
...
@@ -120,7 +120,7 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
#if 1
gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw
#else
#elif 0
gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline
#endif
<GridSize,
...
...
@@ -135,8 +135,8 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
BPerThread,
KPerThread,
CPerThread,
GemmRowThreadPerCluster,
GemmColumnThreadPerCluster,
GemmThreadPerColumnPerCluster,
GemmThreadPerRowPerCluster,
InBlockCopyThreadPerDim0,
InBlockCopyThreadPerDim1,
WeiBlockCopyThreadPerDim0,
...
...
driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
View file @
42f4c7fd
...
...
@@ -76,8 +76,8 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
constexpr unsigned KPerThread = 1;
constexpr unsigned CPerThread = 1;
constexpr unsigned GemmThreadPerClusterRow = 1;
constexpr unsigned GemmThreadPerClusterColumn = 4;
constexpr unsigned GemmThreadPerColumnPerCluster = 1;
constexpr unsigned GemmThreadPerRowPerCluster = 4;
constexpr unsigned BlockSize = 32;
#elif 0
...
...
@@ -89,8 +89,8 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
constexpr unsigned KPerThread = 8;
constexpr unsigned CPerThread = 1;
constexpr unsigned GemmThreadPerClusterRow = 4;
constexpr unsigned GemmThreadPerClusterColumn = 4;
constexpr unsigned GemmThreadPerColumnPerCluster = 4;
constexpr unsigned GemmThreadPerRowPerCluster = 4;
constexpr unsigned BlockSize = 128;
#elif 0
...
...
@@ -102,8 +102,8 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
constexpr unsigned KPerThread = 8;
constexpr unsigned CPerThread = 1;
constexpr unsigned GemmRowThreadPerCluster = 4;
constexpr unsigned GemmColumnThreadPerCluster = 4;
constexpr unsigned GemmThreadPerColumnPerCluster = 4;
constexpr unsigned GemmThreadPerRowPerCluster = 4;
constexpr unsigned InBlockCopyThreadPerDim0 = 2;
constexpr unsigned InBlockCopyThreadPerDim1 = 64;
...
...
@@ -119,8 +119,8 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
constexpr unsigned KPerThread = 16;
constexpr unsigned CPerThread = 2;
constexpr unsigned GemmRowThreadPerCluster = 8;
constexpr unsigned GemmColumnThreadPerCluster = 8;
constexpr unsigned GemmThreadPerColumnPerCluster = 8;
constexpr unsigned GemmThreadPerRowPerCluster = 8;
constexpr unsigned InBlockCopyThreadPerDim0 = 8;
constexpr unsigned InBlockCopyThreadPerDim1 = 16;
...
...
@@ -171,8 +171,8 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
BPerThread,
KPerThread,
CPerThread,
GemmRowThreadPerCluster,
GemmColumnThreadPerCluster,
GemmThreadPerColumnPerCluster,
GemmThreadPerRowPerCluster,
InBlockCopyThreadPerDim0,
InBlockCopyThreadPerDim1>
<<<grid_dim, block_dim>>>(in_cnhw_desc,
...
...
src/include/blockwise_2d_tensor_op.cuh
View file @
42f4c7fd
...
...
@@ -449,5 +449,37 @@ struct Blockwise2dTensorCopy3
                assert(false);
            }
        }

        if(has_tail_d0)
        {
            constexpr unsigned tail_d0 = L0 - nloop_d0 * thread_per_d0;

            if(get_thread_local_1d_id() < tail_d0 * thread_per_d1)
            {
                if(DataPerRead == 1)
                {
                    p_dst[mDstMyThreadOffset + nloop_d0 * dst_loop_stride] =
                        p_src[mSrcMyThreadOffset + nloop_d0 * src_loop_stride];
                }
                else if(DataPerRead == 2)
                {
                    *(reinterpret_cast<Float2*>(p_dst + mDstMyThreadOffset +
                                                nloop_d0 * dst_loop_stride)) =
                        *(reinterpret_cast<Float2*>(p_src + mSrcMyThreadOffset +
                                                    nloop_d0 * src_loop_stride));
                }
                else if(DataPerRead == 4)
                {
                    *(reinterpret_cast<Float4*>(p_dst + mDstMyThreadOffset +
                                                nloop_d0 * dst_loop_stride)) =
                        *(reinterpret_cast<Float4*>(p_src + mSrcMyThreadOffset +
                                                    nloop_d0 * src_loop_stride));
                }
                else
                {
                    assert(false);
                }
            }
        }
    }
};
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
View file @
42f4c7fd
...
...
@@ -20,8 +20,8 @@ template <unsigned GridSize,
unsigned BPerThread,
unsigned KPerThread,
unsigned CPerThread,
unsigned GemmThreadPerClusterRow,
unsigned GemmThreadPerClusterColumn,
unsigned GemmThreadPerColumnPerCluster,
unsigned GemmThreadPerRowPerCluster,
unsigned InBlockCopyThreadPerDim0,
unsigned InBlockCopyThreadPerDim1,
unsigned WeiBlockCopyThreadPerDim0,
...
...
@@ -192,8 +192,8 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
false,
false,
CPerThread,
GemmThreadPerClusterRow,
GemmThreadPerClusterColumn,
GemmThreadPerColumnPerCluster,
GemmThreadPerRowPerCluster,
true>{};
// LDS
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline.cuh
View file @
42f4c7fd
...
...
@@ -20,8 +20,8 @@ template <unsigned GridSize,
unsigned BPerThread,
unsigned KPerThread,
unsigned CPerThread,
unsigned GemmThreadPerClusterRow,
unsigned GemmThreadPerClusterColumn,
unsigned GemmThreadPerColumnPerCluster,
unsigned GemmThreadPerRowPerCluster,
unsigned InBlockCopyThreadPerDim0,
unsigned InBlockCopyThreadPerDim1,
unsigned WeiBlockCopyThreadPerDim0,
...
...
@@ -192,8 +192,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline
false,
false,
CPerThread,
GemmThreadPerClusterRow,
GemmThreadPerClusterColumn,
GemmThreadPerColumnPerCluster,
GemmThreadPerRowPerCluster,
true>{};
// LDS
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
View file @
42f4c7fd
...
...
@@ -20,8 +20,8 @@ template <unsigned GridSize,
unsigned BPerThread,
unsigned KPerThread,
unsigned CPerThread,
unsigned GemmThreadPerClusterRow,
unsigned GemmThreadPerClusterColumn,
unsigned GemmThreadPerColumnPerCluster,
unsigned GemmThreadPerRowPerCluster,
unsigned InBlockCopyThreadPerDim0,
unsigned InBlockCopyThreadPerDim1>
__global__ void
...
...
@@ -159,8 +159,8 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
false,
false,
CPerThread,
GemmThreadPerClusterRow,
GemmThreadPerClusterColumn,
GemmThreadPerColumnPerCluster,
GemmThreadPerRowPerCluster,
true>{};
// LDS
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh
View file @
42f4c7fd
...
...
@@ -20,8 +20,8 @@ template <unsigned GridSize,
unsigned BPerThread,
unsigned KPerThread,
unsigned CPerThread,
unsigned GemmRowThreadPerCluster,
unsigned GemmColumnThreadPerCluster,
unsigned GemmThreadPerColumnPerCluster,
unsigned GemmThreadPerRowPerCluster,
unsigned InBlockCopyThreadPerDim0,
unsigned InBlockCopyThreadPerDim1>
__global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline(
...
...
@@ -175,8 +175,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
false,
false,
CPerThread,
GemmRowThreadPerCluster,
GemmColumnThreadPerCluster,
GemmThreadPerColumnPerCluster,
GemmThreadPerRowPerCluster,
true>{};
// LDS
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment