gaoqiong / composable_kernel

Commit 0741a8ab, authored Feb 04, 2019 by Chao Liu

working on reducing index calculation...

parent 9bbe9073
Showing 10 changed files with 112 additions and 102 deletions (+112 -102)
  driver/conv.cu  (+5 -2)
  driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh  (+2 -2)
  src/include/blockwise_gemm.cuh  (+1 -72)
  src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh  (+14 -11)
  src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh  (+1 -1)
  src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh  (+1 -1)
  src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh  (+1 -1)
  src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh  (+13 -11)
  src/include/gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw.cuh  (+1 -1)
  src/include/threadwise_gemm.cuh  (+73 -0)
driver/conv.cu

@@ -396,6 +396,9 @@ int main()
     constexpr unsigned K = 64;
     constexpr unsigned S = 3;
     constexpr unsigned R = 3;
+    constexpr unsigned HPad = 0;
+    constexpr unsigned WPad = 0;
 #elif 0
     // 3x3, 56x56
     constexpr unsigned N = 64;

@@ -586,7 +589,7 @@ int main()
 #endif
         (in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 1
+#elif 0
     device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(in_nchw_desc,
                                                                    in_nchw,
                                                                    wei_kcsr_desc,

@@ -598,7 +601,7 @@ int main()
                                                                    nrepeat);
 #endif
-#if 0
+#if 1
     if(S == 3 && R == 3)
     {
         host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh

@@ -87,7 +87,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
     constexpr unsigned WoPerThread = 1;
     constexpr unsigned BlockSize = 8;
-#elif 0
+#elif 1
     // for 3x3, 34x34 | 3x3 58x58, NKC = 64, 64, 256
     constexpr unsigned NPerBlock = 16;
     constexpr unsigned KPerBlock = 64;

@@ -162,7 +162,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
     constexpr unsigned WoPerThread = 1;
     constexpr unsigned BlockSize = 128;
-#elif 1
+#elif 0
     // for 1x1, 28x28
     constexpr unsigned NPerBlock = 16;
     constexpr unsigned KPerBlock = 128;
src/include/gemm.cuh → src/include/blockwise_gemm.cuh

 #pragma once
+#include "threadwise_gemm.cuh"

-template <class Float, class SrcMatrix, class DstMatrix, unsigned NRow, unsigned NCol>
-__device__ void threadwise_matrix_copy(
-    SrcMatrix, Float* const p_src, DstMatrix, Float* p_dst, Sequence<NRow, NCol>)
-{
-    const auto src_mtx = SrcMatrix{}; // constexpr doesn't compile
-    const auto dst_mtx = DstMatrix{}; // constexpr doesn't compile
-
-    for(unsigned i = 0; i < NRow; ++i)
-    {
-        for(unsigned j = 0; j < NCol; ++j)
-        {
-            const unsigned src_index = src_mtx.Get1dIndex(i, j);
-            const unsigned dst_index = dst_mtx.Get1dIndex(i, j);
-
-            p_dst[dst_index] = p_src[src_index];
-        }
-    }
-}
-
-template <class MatrixA,
-          class MatrixB,
-          class MatrixC,
-          bool TransA,
-          bool TransB,
-          bool TransC,
-          class FloatA,
-          class FloatB,
-          class FloatC,
-          class Accumulator>
-__device__ void threadwise_gemm(MatrixA,
-                                Constant<bool, TransA>,
-                                FloatA* const p_a_thread,
-                                MatrixB,
-                                Constant<bool, TransB>,
-                                FloatB* const p_b_thread,
-                                MatrixC,
-                                Constant<bool, TransC>,
-                                FloatC* p_c_thread,
-                                Accumulator f_accum)
-{
-    if(TransA && (!TransB) && (!TransC))
-    {
-        const auto a_mtx = MatrixA{}; // constexpr doesn't compile
-        const auto b_mtx = MatrixB{}; // constexpr doesn't compile
-        const auto c_mtx = MatrixC{}; // constexpr doesn't compile
-
-        constexpr unsigned M = c_mtx.NRow();
-        constexpr unsigned N = c_mtx.NCol();
-        constexpr unsigned K = a_mtx.NRow(); // A is transposed
-
-        for(unsigned i = 0; i < M; ++i)
-        {
-            for(unsigned j = 0; j < N; ++j)
-            {
-                for(unsigned k = 0; k < K; ++k)
-                {
-                    const unsigned aindex = a_mtx.Get1dIndex(k, i); // A is transposed
-                    const unsigned bindex = b_mtx.Get1dIndex(k, j);
-                    const unsigned cindex = c_mtx.Get1dIndex(i, j);
-
-                    f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]);
-                }
-            }
-        }
-    }
-    else
-    {
-        // not implemented
-        assert(false);
-    }
-}
-
 template <unsigned BlockSize,
           class BlockMatrixA,
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh

@@ -4,7 +4,7 @@
 #include "ConstantMatrixDescriptor.cuh"
 #include "blockwise_4d_tensor_op.cuh"
 #include "threadwise_4d_tensor_op.cuh"
-#include "gemm.cuh"
+#include "blockwise_gemm.cuh"

 template <unsigned GridSize,
           unsigned BlockSize,

@@ -169,23 +169,26 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(InGlobalDesc,
     // set threadwise output tensor to 0
     threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread);

-    for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
-        c_block_data_begin += CPerBlock, __syncthreads())
+    Float* p_in_global_block_begin =
+        p_in_global + in_chwn_global_desc.Get1dIndex(
+                          0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);
+
+    Float* p_wei_global_block_begin =
+        p_wei_global + wei_csrk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
+
+    for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
+        c_block_data_begin += CPerBlock,
+        p_in_global_block_begin += CPerBlock * in_chwn_global_desc.GetStride(I0),
+        p_wei_global_block_begin += CPerBlock * wei_csrk_global_desc.GetStride(I0),
+        __syncthreads())
     {
 #if 1
         // input: global mem to LDS,
-        blockwise_in_copy.run(p_in_global + in_chwn_global_desc.Get1dIndex(c_block_data_begin,
-                                                                           hi_block_data_begin,
-                                                                           wi_block_data_begin,
-                                                                           n_block_data_begin),
-                              p_in_block);
+        blockwise_in_copy.run(p_in_global_block_begin, p_in_block);
 #endif

 #if 1
         // weight: global mem to LDS,
-        blockwise_wei_copy.run(p_wei_global + wei_csrk_global_desc.Get1dIndex(
-                                   c_block_data_begin, 0, 0, k_block_data_begin),
-                               p_wei_block);
+        blockwise_wei_copy.run(p_wei_global_block_begin, p_wei_block);
 #endif

         __syncthreads();
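This hunk is the heart of "reducing index calculation": instead of calling Get1dIndex on the global descriptors in every iteration of the C loop, the block-begin pointers are computed once for c = 0 and then advanced by CPerBlock times the stride of the C dimension on each iteration. A minimal standalone sketch of the same pattern, with hypothetical names and plain CUDA C++ in place of the project's descriptor types:

    // Sketch only: hoist the multi-dimensional offset computation out of the loop
    // and replace it with a constant-stride pointer increment.
    __device__ void copy_c_blocks(const float* p_src_global, // global tensor
                                  float* p_dst_block,        // LDS staging buffer
                                  unsigned C,
                                  unsigned CPerBlock,
                                  unsigned c_stride,    // stride of the C dimension
                                  unsigned base_offset, // offset of the c == 0 block, computed once
                                  unsigned block_len)
    {
        const float* p_src_block = p_src_global + base_offset;

        for(unsigned c = 0; c < C; c += CPerBlock, p_src_block += CPerBlock * c_stride)
        {
            for(unsigned i = 0; i < block_len; ++i)
            {
                p_dst_block[i] = p_src_block[i]; // stand-in for blockwise_in_copy.run(...)
            }
        }
    }

The same rewrite is applied to gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh further down; there the weight pointer advances by GetStride(I2) because C is the third dimension of the [S,R,C,K] weight layout.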
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh

@@ -4,7 +4,7 @@
 #include "ConstantMatrixDescriptor.cuh"
 #include "blockwise_4d_tensor_op.cuh"
 #include "threadwise_4d_tensor_op.cuh"
-#include "gemm.cuh"
+#include "blockwise_gemm.cuh"

 template <unsigned GridSize,
           unsigned BlockSize,
src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh

@@ -4,7 +4,7 @@
 #include "ConstantMatrixDescriptor.cuh"
 #include "blockwise_4d_tensor_op.cuh"
 #include "threadwise_4d_tensor_op.cuh"
-#include "gemm.cuh"
+#include "blockwise_gemm.cuh"

 template <unsigned GridSize,
           unsigned BlockSize,
src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh

@@ -4,7 +4,7 @@
 #include "ConstantMatrixDescriptor.cuh"
 #include "blockwise_4d_tensor_op.cuh"
 #include "threadwise_4d_tensor_op.cuh"
-#include "gemm.cuh"
+#include "blockwise_gemm.cuh"

 template <unsigned GridSize,
           unsigned BlockSize,
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh

@@ -5,7 +5,7 @@
 #include "blockwise_4d_tensor_op.cuh"
 #include "blockwise_2d_tensor_op.cuh"
 #include "threadwise_2d_tensor_op.cuh"
-#include "gemm.cuh"
+#include "blockwise_gemm.cuh"

 // define B = flatten(N, Hi, Wi)
 template <unsigned GridSize,

@@ -128,14 +128,12 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
     // blockwise wei copy
     // format is [S,R,CPerBlock,KPerBlock]
-#if 1
     const auto blockwise_wei_copy =
         blockwise_4d_tensor_copy_1<BlockSize,
                                    Float,
                                    decltype(wei_srck_global_desc),
                                    decltype(wei_srck_block_desc),
                                    decltype(wei_srck_block_desc.GetLengths())>{};
-#endif

     // a series of blockwise GEMM
     // c_mtx += transpose(a_mtx) * b_mtx

@@ -180,21 +178,25 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
     // set threadwise output tensor to 0
     threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread);

-    for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
-        c_block_data_begin += CPerBlock, __syncthreads())
+    Float* p_in_global_block_offset =
+        p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
+
+    Float* p_wei_global_block_offset =
+        p_wei_global + wei_srck_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
+
+    for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
+        c_block_data_begin += CPerBlock,
+        p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0),
+        p_wei_global_block_offset += CPerBlock * wei_srck_global_desc.GetStride(I2),
+        __syncthreads())
     {
 #if 1
         // input: global mem to LDS,
-        blockwise_in_copy.run(
-            p_in_global + in_cb_global_desc.Get1dIndex(c_block_data_begin, b_block_data_begin),
-            p_in_block);
+        blockwise_in_copy.run(p_in_global_block_offset, p_in_block);
 #endif

 #if 1
         // weight: global mem to LDS,
-        blockwise_wei_copy.run(p_wei_global + wei_srck_global_desc.Get1dIndex(
-                                   0, 0, c_block_data_begin, k_block_data_begin),
-                               p_wei_block);
+        blockwise_wei_copy.run(p_wei_global_block_offset, p_wei_block);
 #endif

         __syncthreads();
src/include/gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw.cuh

@@ -5,7 +5,7 @@
 #include "blockwise_4d_tensor_op.cuh"
 #include "blockwise_2d_tensor_op.cuh"
 #include "threadwise_2d_tensor_op.cuh"
-#include "gemm.cuh"
+#include "blockwise_gemm.cuh"

 // define B = N*Hi*Wi
 template <unsigned GridSize,
src/include/threadwise_gemm.cuh (new file, mode 100644)

#pragma once

template <class Float, class SrcMatrix, class DstMatrix, unsigned NRow, unsigned NCol>
__device__ void threadwise_matrix_copy(
    SrcMatrix, Float* const p_src, DstMatrix, Float* p_dst, Sequence<NRow, NCol>)
{
    const auto src_mtx = SrcMatrix{}; // constexpr doesn't compile
    const auto dst_mtx = DstMatrix{}; // constexpr doesn't compile

    for(unsigned i = 0; i < NRow; ++i)
    {
        for(unsigned j = 0; j < NCol; ++j)
        {
            const unsigned src_index = src_mtx.Get1dIndex(i, j);
            const unsigned dst_index = dst_mtx.Get1dIndex(i, j);

            p_dst[dst_index] = p_src[src_index];
        }
    }
}

template <class MatrixA,
          class MatrixB,
          class MatrixC,
          bool TransA,
          bool TransB,
          bool TransC,
          class FloatA,
          class FloatB,
          class FloatC,
          class Accumulator>
__device__ void threadwise_gemm(MatrixA,
                                Constant<bool, TransA>,
                                FloatA* const p_a_thread,
                                MatrixB,
                                Constant<bool, TransB>,
                                FloatB* const p_b_thread,
                                MatrixC,
                                Constant<bool, TransC>,
                                FloatC* p_c_thread,
                                Accumulator f_accum)
{
    if(TransA && (!TransB) && (!TransC))
    {
        const auto a_mtx = MatrixA{}; // constexpr doesn't compile
        const auto b_mtx = MatrixB{}; // constexpr doesn't compile
        const auto c_mtx = MatrixC{}; // constexpr doesn't compile

        constexpr unsigned M = c_mtx.NRow();
        constexpr unsigned N = c_mtx.NCol();
        constexpr unsigned K = a_mtx.NRow(); // A is transposed

        for(unsigned i = 0; i < M; ++i)
        {
            for(unsigned j = 0; j < N; ++j)
            {
                for(unsigned k = 0; k < K; ++k)
                {
                    const unsigned aindex = a_mtx.Get1dIndex(k, i); // A is transposed
                    const unsigned bindex = b_mtx.Get1dIndex(k, j);
                    const unsigned cindex = c_mtx.Get1dIndex(i, j);

                    f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]);
                }
            }
        }
    }
    else
    {
        // not implemented
        assert(false);
    }
}
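For reference, the new threadwise_gemm implements only the TransA && !TransB && !TransC case, which matches the blockwise GEMM's "c_mtx += transpose(a_mtx) * b_mtx": each thread accumulates one scalar product at a time through the f_accum functor, and all other transpose combinations hit assert(false). A plain-array host-side analog of that accumulation, with hypothetical names and a fixed "+=" in place of f_accum and the project's matrix descriptors:

    // Sketch only: A is stored K x M (i.e. transposed), B is K x N, C is M x N.
    template <unsigned M, unsigned N, unsigned K>
    void gemm_a_transposed(const float (&a)[K][M], const float (&b)[K][N], float (&c)[M][N])
    {
        for(unsigned i = 0; i < M; ++i)
            for(unsigned j = 0; j < N; ++j)
                for(unsigned k = 0; k < K; ++k)
                    c[i][j] += a[k][i] * b[k][j]; // f_accum(c, a * b) with f_accum = "+="
    }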