gaoqiong / composable_kernel / Commits

Commit c9af4dec, authored Jan 24, 2019 by Chao Liu
Parent: 1f3870ca

    implicit gemm: LDS double buffer

Showing 11 changed files with 589 additions and 138 deletions
driver/conv.cu                                                         +4    -4
driver/device_direct_convolution_1.cuh                                 +48   -37
driver/device_direct_convolution_2.cuh                                 +48   -37
driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh           +1    -0
src/include/blockwise_2d_tensor_op.cuh                                 +47   -0
src/include/blockwise_4d_tensor_op.cuh                                 +37   -6
src/include/blockwise_direct_convolution.cuh                           +2    -2
src/include/gridwise_direct_convolution_1.cuh                          +33   -22
src/include/gridwise_direct_convolution_2.cuh                          +23   -16
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh    +23   -14
src/include/gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw.cuh    +323  -0
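The commit title refers to ping-pong ("double") buffering in LDS/shared memory: two shared-memory tiles alternate, so the blockwise GEMM can consume the tile loaded in the previous iteration while the copy for the next iteration is already in flight, hiding global-memory latency behind compute. The new gridwise_implicit_gemm_convolution_3 kernel below does exactly this with p_in_block_0/1 and p_wei_block_0/1. As a stand-alone illustration of the pattern only (not code from this repository; double_buffer_sketch, Tile, and the reduction body are hypothetical), a minimal CUDA sketch assuming num_tiles >= 1:

    #include <cuda_runtime.h>

    // Minimal ping-pong (double) buffering sketch; hypothetical names, not repo code.
    // Each iteration consumes the tile loaded in the previous iteration while
    // prefetching the next tile into the other shared-memory buffer.
    template <unsigned BlockSize, unsigned Tile>
    __global__ void double_buffer_sketch(const float* __restrict__ p_src,
                                         float* __restrict__ p_dst,
                                         unsigned num_tiles)
    {
        __shared__ float buf0[Tile];
        __shared__ float buf1[Tile];

        // prolog: load the first tile
        for(unsigned i = threadIdx.x; i < Tile; i += BlockSize)
            buf0[i] = p_src[i];

        float acc = 0;

        for(unsigned t = 0; t + 1 < num_tiles; ++t)
        {
            __syncthreads(); // previous iteration's prefetch is now complete

            float* now  = (t % 2 == 0) ? buf0 : buf1;
            float* next = (t % 2 == 0) ? buf1 : buf0;

            // prefetch the next tile while computing on the current one
            for(unsigned i = threadIdx.x; i < Tile; i += BlockSize)
                next[i] = p_src[(t + 1) * Tile + i];

            for(unsigned i = 0; i < Tile; ++i) // stand-in "compute" on the current tile
                acc += now[i];
        }

        { // epilog: the last tile has no successor to prefetch
            __syncthreads();
            float* now = ((num_tiles - 1) % 2 == 0) ? buf0 : buf1;
            for(unsigned i = 0; i < Tile; ++i)
                acc += now[i];
        }

        p_dst[blockIdx.x * BlockSize + threadIdx.x] = acc;
    }

The kernel added in this commit follows the same shape: a prolog load into buffer 0, a main loop that prefetches into the "next" buffer before running the blockwise GEMM on the "now" buffer, and a trailing block for the last CPerBlock slice.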
driver/conv.cu

@@ -8,8 +8,8 @@
 #include "conv_common.cuh"
 #include "device_direct_convolution_1.cuh"
 #include "device_direct_convolution_2.cuh"
-#include "device_implicit_gemm_convolution_1_nchw_kcsr.cuh"
-#include "device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh"
+//#include "device_implicit_gemm_convolution_1_nchw_kcsr.cuh"
+//#include "device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh"
 #include "device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh"
 //#include "device_winograd_convolution.cuh"

@@ -410,7 +410,7 @@ int main()
     wei_kcsr.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
 #endif

-    unsigned nrepeat = 100;
+    unsigned nrepeat = 50;

 #if 0
     device_direct_convolution_1

@@ -427,7 +427,7 @@ int main()
 #endif
         (in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device, nrepeat);

-#if 0
+#if 1
     host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host);

     check_error(out_nkhw_host, out_nkhw_device);
 #elif 0
driver/device_direct_convolution_1.cuh

 #pragma once
 #include "gridwise_direct_convolution_1.cuh"
+#include <unistd.h>

 template <class T, class InDesc, class WeiDesc, class OutDesc>
-void device_direct_convolution_1(
-    InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out)
+void device_direct_convolution_1(
+    InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out, unsigned nrepeat)
 {
     std::size_t data_sz = sizeof(T);
     DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());

@@ -65,6 +71,8 @@ void device_direct_convolution_1(
     printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

+    for(unsigned i = 0; i < nrepeat; ++i)
+    {
         cudaEvent_t start, stop;
         float elapsedTime;

@@ -101,6 +109,9 @@ void device_direct_convolution_1(
         cudaEventElapsedTime(&elapsedTime, start, stop);
         printf("Elapsed time : %f ms\n", elapsedTime);

+        usleep(10000);
+    }

     checkCudaErrors(cudaGetLastError());

     out_device_buf.FromDevice(out.mData.data());
 }
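The hunks above show only the edges of the change: a for(unsigned i = 0; i < nrepeat; ++i) loop is wrapped around the existing cudaEvent timing, and a usleep(10000) is added between repeats; the event creation, record calls, and the kernel launch sit in the elided context. A hedged sketch of how the full per-iteration timing pattern typically looks (some_kernel and time_kernel are placeholders, not this driver's code):

    #include <cstdio>
    #include <unistd.h>
    #include <cuda_runtime.h>

    // Placeholder kernel standing in for the gridwise convolution launch.
    __global__ void some_kernel(float* p) { p[threadIdx.x] = threadIdx.x; }

    void time_kernel(float* p_device, unsigned nrepeat, unsigned GridSize, unsigned BlockSize)
    {
        for(unsigned i = 0; i < nrepeat; ++i)
        {
            cudaEvent_t start, stop;
            float elapsedTime;

            cudaEventCreate(&start);
            cudaEventCreate(&stop);

            cudaEventRecord(start, 0);
            some_kernel<<<GridSize, BlockSize>>>(p_device);
            cudaEventRecord(stop, 0);
            cudaEventSynchronize(stop);

            cudaEventElapsedTime(&elapsedTime, start, stop);
            printf("Elapsed time : %f ms\n", elapsedTime);

            cudaEventDestroy(start);
            cudaEventDestroy(stop);

            usleep(10000); // pause between repeats, as the driver change does
        }
    }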
driver/device_direct_convolution_2.cuh

 #pragma once
 #include "gridwise_direct_convolution_2.cuh"
+#include <unistd.h>

 template <class T, class InDesc, class WeiDesc, class OutDesc>
-void device_direct_convolution_2(
-    InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out)
+void device_direct_convolution_2(
+    InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out, unsigned nrepeat)
 {
     std::size_t data_sz = sizeof(T);
     DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());

@@ -79,6 +85,8 @@ void device_direct_convolution_2(
     printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

+    for(unsigned i = 0; i < nrepeat; ++i)
+    {
         cudaEvent_t start, stop;
         float elapsedTime;

@@ -115,6 +123,9 @@ void device_direct_convolution_2(
         cudaEventElapsedTime(&elapsedTime, start, stop);
         printf("Elapsed time : %f ms\n", elapsedTime);

+        usleep(10000);
+    }

     checkCudaErrors(cudaGetLastError());

     out_device_buf.FromDevice(out.mData.data());
 }
driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh

 #pragma once
 #include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh"
+#include "gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw.cuh"
 #include <unistd.h>

 template <class T, class InDesc, class WeiDesc, class OutDesc>
src/include/blockwise_2d_tensor_op.cuh

@@ -347,3 +347,50 @@ struct blockwise_2d_tensor_copy_2
         }
     }
 };
+
+template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
+struct blockwise_2d_tensor_copy_dummy_1
+{
+    unsigned mBegin;
+
+    __device__ blockwise_2d_tensor_copy_dummy_1()
+    {
+        constexpr unsigned n_total = make_ConstantTensorDescriptor(SrcOpLengths{}).GetElementSpace();
+        constexpr unsigned n_per_thread = n_total / BlockSize;
+
+        mBegin = n_per_thread * get_thread_local_1d_id();
+    }
+
+    __device__ void run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const
+    {
+        constexpr unsigned n_total = make_ConstantTensorDescriptor(SrcOpLengths{}).GetElementSpace();
+        constexpr unsigned n_per_thread = n_total / BlockSize;
+
+        for(unsigned i = 0; i < n_per_thread; ++i)
+        {
+            p_dst[mBegin + i] = p_src[mBegin + i];
+        }
+    }
+};
+
+template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
+struct blockwise_2d_tensor_copy_dummy_2
+{
+    __device__ void run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const
+    {
+        constexpr unsigned n_total = make_ConstantTensorDescriptor(SrcOpLengths{}).GetElementSpace();
+        constexpr unsigned n_per_thread = n_total / BlockSize;
+
+        for(unsigned i = 0; i < n_per_thread; ++i)
+        {
+            unsigned index = get_thread_local_1d_id() + BlockSize * i;
+
+            p_dst[index] = p_src[index];
+        }
+    }
+};
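The two _dummy copies added above partition the same flat copy differently: blockwise_2d_tensor_copy_dummy_1 assigns each thread a contiguous chunk of n_per_thread elements starting at mBegin, while blockwise_2d_tensor_copy_dummy_2 strides by BlockSize, so within one iteration consecutive threads touch consecutive addresses, the layout that coalesces global-memory accesses. A stand-alone sketch of just the two index patterns (hypothetical constants and function names, not repo code):

    // Index patterns of the two dummy copies, extracted into a standalone sketch.
    // BLOCK_SIZE and N_TOTAL are hypothetical compile-time constants.
    constexpr unsigned BLOCK_SIZE   = 128;
    constexpr unsigned N_TOTAL      = 1024;
    constexpr unsigned N_PER_THREAD = N_TOTAL / BLOCK_SIZE;

    // dummy_1 pattern: thread tid owns the contiguous range
    // [tid * N_PER_THREAD, (tid + 1) * N_PER_THREAD)
    __device__ void copy_chunked(const float* __restrict__ p_src, float* __restrict__ p_dst, unsigned tid)
    {
        const unsigned begin = tid * N_PER_THREAD;
        for(unsigned i = 0; i < N_PER_THREAD; ++i)
            p_dst[begin + i] = p_src[begin + i];
    }

    // dummy_2 pattern: on iteration i the whole block touches the contiguous range
    // [i * BLOCK_SIZE, (i + 1) * BLOCK_SIZE), so accesses coalesce across threads
    __device__ void copy_strided(const float* __restrict__ p_src, float* __restrict__ p_dst, unsigned tid)
    {
        for(unsigned i = 0; i < N_PER_THREAD; ++i)
        {
            const unsigned index = tid + BLOCK_SIZE * i;
            p_dst[index] = p_src[index];
        }
    }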
src/include/blockwise_4d_tensor_op.cuh

@@ -200,11 +200,42 @@ blockwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
 }

 template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
-__device__ void blockwise_4d_tensor_copy(
-    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, SrcOpLengths)
-{
+struct blockwise_4d_tensor_copy_1
+{
+    __device__ void run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const
+    {
         constexpr auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{};

         blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
             SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, dst_from_src_reorder);
-}
+    }
+};
+
+template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
+struct blockwise_4d_tensor_copy_dummy
+{
+    unsigned mBegin;
+
+    __device__ blockwise_4d_tensor_copy_dummy()
+    {
+        constexpr unsigned n_total = make_ConstantTensorDescriptor(SrcOpLengths{}).GetElementSpace();
+        constexpr unsigned n_per_thread = n_total / BlockSize;
+
+        mBegin = n_per_thread * get_thread_local_1d_id();
+    }
+
+    __device__ void run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const
+    {
+        constexpr unsigned n_total = make_ConstantTensorDescriptor(SrcOpLengths{}).GetElementSpace();
+        constexpr unsigned n_per_thread = n_total / BlockSize;
+
+        for(unsigned i = 0; i < n_per_thread; ++i)
+        {
+            p_dst[mBegin + i] = p_src[mBegin + i];
+        }
+    }
+};
src/include/blockwise_direct_convolution.cuh

@@ -101,7 +101,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
                                              wo_thread_data_begin),
         out_thread_desc,
         p_out_thread,
-        out_thread_desc);
+        out_thread_desc.GetLengths());

     for(unsigned c_thread_data_begin = 0; c_thread_data_begin < in_block_desc.GetLength(I1);
         c_thread_data_begin += CPerThread)

@@ -128,6 +128,6 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
                                              k_thread_data_begin,
                                              ho_thread_data_begin,
                                              wo_thread_data_begin),
-            out_thread_desc);
+            out_thread_desc.GetLengths());
     }
 }
src/include/gridwise_direct_convolution_1.cuh

@@ -121,6 +121,27 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
     }
 #endif

+    constexpr auto blockwise_in_copy =
+        blockwise_4d_tensor_copy_1<BlockSize,
+                                   Float,
+                                   decltype(in_block_global_desc),
+                                   decltype(in_block_desc),
+                                   decltype(in_block_desc.GetLengths())>{};
+
+    constexpr auto blockwise_wei_copy =
+        blockwise_4d_tensor_copy_1<BlockSize,
+                                   Float,
+                                   decltype(wei_block_global_desc),
+                                   decltype(wei_block_desc),
+                                   decltype(wei_block_desc.GetLengths())>{};
+
+    constexpr auto blockwise_out_copy =
+        blockwise_4d_tensor_copy_1<BlockSize,
+                                   Float,
+                                   decltype(out_block_desc),
+                                   decltype(out_block_global_desc),
+                                   decltype(out_block_desc.GetLengths())>{};
+
     // set output tensor in LDS to 0
     blockwise_4d_tensor_set_zero<BlockSize>(out_block_desc, p_out_block);

@@ -128,23 +149,16 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
         c_block_work_begin += CPerBlock)
     {
         // copy input tensor to LDS
-        blockwise_4d_tensor_copy<BlockSize>(
-            in_block_global_desc,
-            p_in_global + in_global_desc.Get1dIndex(n_block_work_begin,
-                                                    c_block_work_begin,
-                                                    hi_block_work_begin,
-                                                    wi_block_work_begin),
-            in_block_desc,
-            p_in_block,
-            in_block_desc);
+        blockwise_in_copy.run(p_in_global + in_global_desc.Get1dIndex(n_block_work_begin,
+                                                                      c_block_work_begin,
+                                                                      hi_block_work_begin,
+                                                                      wi_block_work_begin),
+                              p_in_block);

         // copy weight tensor to LDS
-        blockwise_4d_tensor_copy<BlockSize>(
-            wei_block_global_desc,
-            p_wei_global + wei_global_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0),
-            wei_block_desc,
-            p_wei_block,
-            wei_block_desc);
+        blockwise_wei_copy.run(
+            p_wei_global + wei_global_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0),
+            p_wei_block);

         __syncthreads();

@@ -165,12 +179,9 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
     }

     // copy output tensor from LDS to device mem
-    blockwise_4d_tensor_copy<BlockSize>(
-        out_block_desc,
-        p_out_block,
-        out_block_global_desc,
-        p_out_global + out_global_desc.Get1dIndex(n_block_work_begin,
-                                                  k_block_work_begin,
-                                                  ho_block_work_begin,
-                                                  wo_block_work_begin),
-        out_block_desc);
+    blockwise_out_copy.run(p_out_block,
+                           p_out_global + out_global_desc.Get1dIndex(n_block_work_begin,
+                                                                     k_block_work_begin,
+                                                                     ho_block_work_begin,
+                                                                     wo_block_work_begin));
 }
src/include/gridwise_direct_convolution_2.cuh

@@ -144,6 +144,20 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
                                               wo_thread_data_begin);
 #endif

+    constexpr auto blockwise_in_copy =
+        blockwise_4d_tensor_copy_1<BlockSize,
+                                   Float,
+                                   decltype(in_global_desc),
+                                   decltype(in_block_desc),
+                                   decltype(in_block_desc.GetLengths())>{};
+
+    constexpr auto blockwise_wei_copy =
+        blockwise_4d_tensor_copy_1<BlockSize,
+                                   Float,
+                                   decltype(wei_global_desc),
+                                   decltype(wei_block_desc),
+                                   decltype(wei_block_desc.GetLengths())>{};
+
     // set threadwise output tensor to 0
     threadwise_4d_tensor_set_zero(out_thread_desc, p_out_thread);

@@ -151,30 +165,23 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
         c_block_data_begin += CPerBlock, __syncthreads())
     {
         // copy input tensor to LDS
-        blockwise_4d_tensor_copy<BlockSize>(
-            in_global_desc,
-            p_in_global + in_global_desc.Get1dIndex(n_block_data_begin,
-                                                    c_block_data_begin,
-                                                    hi_block_data_begin,
-                                                    wi_block_data_begin),
-            in_block_desc,
-            p_in_block,
-            in_block_desc.GetLengths());
+        blockwise_in_copy.run(p_in_global + in_global_desc.Get1dIndex(n_block_data_begin,
+                                                                      c_block_data_begin,
+                                                                      hi_block_data_begin,
+                                                                      wi_block_data_begin),
+                              p_in_block);

         // copy weight tensor to LDS
-        blockwise_4d_tensor_copy<BlockSize>(
-            wei_global_desc,
-            p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
-            wei_block_desc,
-            p_wei_block,
-            wei_block_desc.GetLengths());
+        blockwise_wei_copy.run(
+            p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
+            p_wei_block);

         __syncthreads();

         for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
         {
             // threadwise convolution
-#if 0
+#if 1
             threadwise_direct_convolution_2(
                 in_thread_block_desc,
                 p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,

@@ -185,7 +192,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
                 p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
                 out_thread_desc,
                 p_out_thread);
-#elif 1
+#elif 0
             threadwise_direct_convolution_3(
                 in_thread_block_desc,
                 p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh

@@ -106,17 +106,17 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
     }
 #endif

     // blockwise in copy
     // formmat is [CPerBlock,BPerBlock + BGhostRead]
 #if 1
     // blockwise 2d copy
-    const auto blockwise_2d_copy =
+    const auto blockwise_in_copy =
         blockwise_2d_tensor_copy_1<BlockSize,
                                    Float,
                                    decltype(in_cb_global_desc),
                                    decltype(in_cb_block_desc),
                                    decltype(in_cb_block_desc.GetLengths())>{};
 #elif 0
     // blockwise 2d copy
-    const auto blockwise_2d_copy =
+    const auto blockwise_in_copy =
         blockwise_2d_tensor_copy_2<BlockSize,
                                    Float,
                                    decltype(in_cb_global_desc),

@@ -126,6 +126,17 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
                                    InBlockCopyThreadPerDim1>{};
 #endif

+    // blockwise wei copy
+    // format is [S,R,CPerBlock,KPerBlock]
+#if 1
+    const auto blockwise_wei_copy =
+        blockwise_4d_tensor_copy_1<BlockSize,
+                                   Float,
+                                   decltype(wei_srck_global_desc),
+                                   decltype(wei_srck_block_desc),
+                                   decltype(wei_srck_block_desc.GetLengths())>{};
+#endif
+
     // a series of blockwise GEMM
     // c_mtx += transpose(a_mtx) * b_mtx
     // a_mtx and b_mtx saved in LDS, c_mtx saved in register

@@ -172,21 +183,19 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
     for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
         c_block_data_begin += CPerBlock, __syncthreads())
     {
 #if 1
         // input: global mem to LDS,
         // formmat is [CPerBlock,BPerBlock + BGhostRead]
-        blockwise_2d_copy.run(
+        blockwise_in_copy.run(
             p_in_global + in_cb_global_desc.Get1dIndex(c_block_data_begin, b_block_data_begin),
             p_in_block);
 #endif

 #if 1
         // weight: global mem to LDS,
         // format is [S,R,CPerBlock,KPerBlock]
-        blockwise_4d_tensor_copy<BlockSize>(
-            wei_srck_global_desc,
-            p_wei_global +
-                wei_srck_global_desc.Get1dIndex(0, 0, c_block_data_begin, k_block_data_begin),
-            wei_srck_block_desc,
-            p_wei_block,
-            wei_srck_block_desc.GetLengths());
+        blockwise_wei_copy.run(
+            p_wei_global +
+                wei_srck_global_desc.Get1dIndex(0, 0, c_block_data_begin, k_block_data_begin),
+            p_wei_block);
 #endif

         __syncthreads();
src/include/gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw.cuh (new file, mode 100644)

#pragma once
#include "common.cuh"
#include "ConstantTensorDescriptor.cuh"
#include "ConstantMatrixDescriptor.cuh"
#include "blockwise_4d_tensor_op.cuh"
#include "blockwise_2d_tensor_op.cuh"
#include "threadwise_2d_tensor_op.cuh"
#include "gemm.cuh"

// define B = N*Hi*Wi
template <unsigned GridSize,
          unsigned BlockSize,
          class Float,
          class InGlobalDesc,
          class WeiGlobalDesc,
          class OutGlobalDesc,
          unsigned BPerBlock,
          unsigned KPerBlock,
          unsigned CPerBlock,
          unsigned BPerThread,
          unsigned KPerThread,
          unsigned CPerThread,
          unsigned GemmThreadPerClusterRow,
          unsigned GemmThreadPerClusterColumn,
          unsigned InBlockCopyThreadPerDim0,
          unsigned InBlockCopyThreadPerDim1>
__global__ void
gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw(InGlobalDesc,
                                                    Float* const __restrict__ p_in_global,
                                                    WeiGlobalDesc,
                                                    Float* const __restrict__ p_wei_global,
                                                    OutGlobalDesc,
                                                    Float* __restrict__ p_out_global)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto in_cnhw_global_desc  = InGlobalDesc{};
    constexpr auto wei_srck_global_desc = WeiGlobalDesc{};
    constexpr auto out_knhw_global_desc = OutGlobalDesc{};

    constexpr unsigned C  = in_cnhw_global_desc.GetLength(I0);
    constexpr unsigned N  = in_cnhw_global_desc.GetLength(I1);
    constexpr unsigned Hi = in_cnhw_global_desc.GetLength(I2);
    constexpr unsigned Wi = in_cnhw_global_desc.GetLength(I3);

    constexpr unsigned K  = out_knhw_global_desc.GetLength(I0);
    constexpr unsigned Ho = out_knhw_global_desc.GetLength(I2);
    constexpr unsigned Wo = out_knhw_global_desc.GetLength(I3);

    constexpr unsigned S = wei_srck_global_desc.GetLength(I0);
    constexpr unsigned R = wei_srck_global_desc.GetLength(I1);

    constexpr unsigned B = N * Hi * Wi;

    constexpr unsigned BGhostRead = (S - 1) * Wi + (R - 1);

    // divide block work by 2d: [K, B]
    constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock;
    constexpr unsigned BBlockWork = (B + BPerBlock - 1) / BPerBlock;

    const unsigned k_block_work_id = get_block_1d_id() / BBlockWork;
    const unsigned b_block_work_id = get_block_1d_id() - k_block_work_id * BBlockWork;

    const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
    const unsigned b_block_data_begin = b_block_work_id * BPerBlock;

#if 0
    if(get_thread_local_1d_id() == 0)
    {
        printf("K %u B %u, BGhostRead %u\n", K, B, BGhostRead);
        printf("%u %u, KBlockWork %u BBlockWork %u, k_block_data_begin %u b_block_data_begin %u\n",
               get_block_1d_id(),
               get_thread_local_1d_id(),
               KBlockWork,
               BBlockWork,
               k_block_data_begin,
               b_block_data_begin);
    }
#endif

    // flattend (2d) tensor view of gridwise input
    constexpr auto in_cb_global_desc = make_ConstantTensorDescriptor(Sequence<C, B>{});

    // tensor view of blockwise input and weight
    constexpr auto in_cb_block_desc =
        make_ConstantTensorDescriptor(Sequence<CPerBlock, BPerBlock + BGhostRead>{});

    constexpr auto wei_srck_block_desc =
        make_ConstantTensorDescriptor(Sequence<S, R, CPerBlock, KPerBlock>{});

    // tensor view of threadwise output in register
    constexpr auto out_kb_thread_desc =
        make_ConstantTensorDescriptor(Sequence<KPerThread, BPerThread>{});

#if 0
    if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
    {
        print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc");
        print_ConstantTensorDescriptor(wei_srck_block_desc, "wei_srck_block_desc");
        print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc");

        printf("KPerBlock %u\n", KPerBlock);
    }
#endif

    // in: global mem to LDS
    // formmat is [CPerBlock,BPerBlock + BGhostRead]
#if 1
    const auto blockwise_in_copy =
        blockwise_2d_tensor_copy_1<BlockSize,
                                   Float,
                                   decltype(in_cb_global_desc),
                                   decltype(in_cb_block_desc),
                                   decltype(in_cb_block_desc.GetLengths())>{};
#elif 0
    const auto blockwise_in_copy =
        blockwise_2d_tensor_copy_2<BlockSize,
                                   Float,
                                   decltype(in_cb_global_desc),
                                   decltype(in_cb_block_desc),
                                   decltype(in_cb_block_desc.GetLengths()),
                                   InBlockCopyThreadPerDim0,
                                   InBlockCopyThreadPerDim1>{};
#elif 0
    const auto blockwise_in_copy =
        blockwise_2d_tensor_copy_dummy_2<BlockSize,
                                         Float,
                                         decltype(in_cb_global_desc),
                                         decltype(in_cb_block_desc),
                                         decltype(in_cb_block_desc.GetLengths())>{};
#endif

    // weight: global mem to LDS,
    // format is [S,R,CPerBlock,KPerBlock]
#if 1
    const auto blockwise_wei_copy =
        blockwise_4d_tensor_copy_1<BlockSize,
                                   Float,
                                   decltype(wei_srck_global_desc),
                                   decltype(wei_srck_block_desc),
                                   decltype(wei_srck_block_desc.GetLengths())>{};
#else
    const auto blockwise_wei_copy =
        blockwise_4d_tensor_copy_dummy<BlockSize,
                                       Float,
                                       decltype(wei_srck_global_desc),
                                       decltype(wei_srck_block_desc),
                                       decltype(wei_srck_block_desc.GetLengths())>{};
#endif

    // a series of blockwise GEMM
    // c_mtx += transpose(a_mtx) * b_mtx
    // a_mtx and b_mtx saved in LDS, c_mtx saved in register
    // a_mtx[C,K] is a sub-matrix of wei_block[S,R,C,K]
    // b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
    // c_mtx[K,B] is out_block[K,B]
    const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
        Number<CPerBlock>{}, Number<KPerBlock>{}); // constexpr doesn't compile

    const auto b_cxb_block_mtx_desc =
        make_ConstantMatrixDescriptor(Number<CPerBlock>{},
                                      Number<BPerBlock>{},
                                      Number<in_cb_block_desc.GetStride(I0)>{}); // constexpr doesn't compile

    const auto c_kxb_thread_mtx_desc = make_ConstantMatrixDescriptor(
        Number<KPerThread>{}, Number<BPerThread>{}); // constexpr doesn't compile

    const auto blockwise_gemm =
        blockwise_gemm_block_a_block_b_thread_c<BlockSize,
                                                decltype(a_cxk_block_mtx_desc),
                                                decltype(b_cxb_block_mtx_desc),
                                                decltype(c_kxb_thread_mtx_desc),
                                                true,
                                                false,
                                                false,
                                                CPerThread,
                                                GemmThreadPerClusterRow,
                                                GemmThreadPerClusterColumn,
                                                true>{};

    // LDS
    constexpr unsigned in_block_size  = in_cb_block_desc.GetElementSpace();
    constexpr unsigned wei_block_size = wei_srck_block_desc.GetElementSpace();

    // double buffer
    __shared__ Float p_in_block_0[in_block_size];
    __shared__ Float p_wei_block_0[wei_block_size];

    __shared__ Float p_in_block_1[in_block_size];
    __shared__ Float p_wei_block_1[wei_block_size];

    // register
    Float p_out_thread[out_kb_thread_desc.GetElementSpace()];

    // set threadwise output tensor to 0
    threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread);

    // prelog: load data
#if 1
    // input: global mem to LDS,
    blockwise_in_copy.run(p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin),
                          p_in_block_0);
#endif

#if 1
    // weight: global mem to LDS,
    blockwise_wei_copy.run(p_wei_global +
                               wei_srck_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin),
                           p_wei_block_0);
#endif

    unsigned cloop = 0;

    for(unsigned c_block_data_begin = 0; c_block_data_begin + CPerBlock < C;
        c_block_data_begin += CPerBlock, ++cloop)
    {
        __syncthreads();

        Float* p_in_block_now  = (cloop % 2 == 0) ? p_in_block_0 : p_in_block_1;
        Float* p_wei_block_now = (cloop % 2 == 0) ? p_wei_block_0 : p_wei_block_1;

        Float* p_in_block_next  = (cloop % 2 == 0) ? p_in_block_1 : p_in_block_0;
        Float* p_wei_block_next = (cloop % 2 == 0) ? p_wei_block_1 : p_wei_block_0;

#if 1
        // preload next data
        // input: global mem to LDS,
        blockwise_in_copy.run(p_in_global +
                                  in_cb_global_desc.Get1dIndex(c_block_data_begin + CPerBlock,
                                                               b_block_data_begin),
                              p_in_block_next);
#endif

#if 1
        // weight: global mem to LDS,
        blockwise_wei_copy.run(p_wei_global +
                                   wei_srck_global_desc.Get1dIndex(
                                       0, 0, c_block_data_begin + CPerBlock, k_block_data_begin),
                               p_wei_block_next);
#endif

        // a series of GEMM
        for(unsigned s = 0; s < S; ++s)
        {
            for(unsigned r = 0; r < R; ++r)
            {
                auto f_accum = [](auto& c, const auto&& ab) { c += ab; };

                blockwise_gemm.run(p_wei_block_now + wei_srck_block_desc.Get1dIndex(s, r, 0, 0),
                                   p_in_block_now + s * Wi + r,
                                   p_out_thread,
                                   f_accum);
            }
        }
    }

    { // last cloop
        __syncthreads();

        Float* p_in_block_now  = (cloop % 2 == 0) ? p_in_block_0 : p_in_block_1;
        Float* p_wei_block_now = (cloop % 2 == 0) ? p_wei_block_0 : p_wei_block_1;

        // a series of GEMM
        for(unsigned s = 0; s < S; ++s)
        {
            for(unsigned r = 0; r < R; ++r)
            {
                auto f_accum = [](auto& c, const auto&& ab) { c += ab; };

                blockwise_gemm.run(p_wei_block_now + wei_srck_block_desc.Get1dIndex(s, r, 0, 0),
                                   p_in_block_now + s * Wi + r,
                                   p_out_thread,
                                   f_accum);
            }
        }
    }

    // output: register to global mem,
    const auto matrix_c_index =
        blockwise_gemm.CalculateThreadMatrixCIndex(get_thread_local_1d_id());

    const unsigned k_thread_data_begin = matrix_c_index.row_begin;
    const unsigned b_thread_data_begin = matrix_c_index.col_begin;

    const unsigned k_data_begin = k_block_data_begin + k_thread_data_begin;
    const unsigned b_data_begin = b_block_data_begin + b_thread_data_begin;

#if 0
    if(get_block_1d_id() == 0)
    {
        printf("%u %u, row_begin %u col_begin %u, k_data_begin %u b_data_begin %u, %f %f %f %f\n",
               get_block_1d_id(),
               get_thread_local_1d_id(),
               matrix_c_index.row_begin,
               matrix_c_index.col_begin,
               k_data_begin,
               b_data_begin,
               p_out_thread[0], p_out_thread[1], p_out_thread[2], p_out_thread[3]);
    }
#endif

    for(unsigned k = 0; k < out_kb_thread_desc.GetLength(I0); ++k)
    {
        for(unsigned b = 0; b < out_kb_thread_desc.GetLength(I1); ++b)
        {
            unsigned k_data = k_data_begin + k;
            unsigned b_data = b_data_begin + b;

            unsigned n_data = b_data / (Hi * Wi);
            unsigned itmp   = b_data - n_data * (Hi * Wi);
            unsigned h_data = itmp / Wi;
            unsigned w_data = itmp - h_data * Wi;

            if(n_data < N && h_data < Ho && w_data < Wo)
            {
                p_out_global[out_knhw_global_desc.Get1dIndex(k_data, n_data, h_data, w_data)] =
                    p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)];
            }
        }
    }
}
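As the comments in the new kernel state, each (s, r) tap contributes one GEMM: a_mtx[C,K] is the weight slice wei[s][r][:][:], b_mtx[C,B] is the input window shifted by s*Wi + r inside the [CPerBlock, BPerBlock + BGhostRead] LDS tile, and c_mtx[K,B] accumulates in registers as transpose(a_mtx) * b_mtx. A scalar reference sketch of that accumulation, with plain loops and illustrative buffer names rather than the repository's descriptor and blockwise-GEMM machinery, just to make the indexing concrete:

    // Scalar reference for the per-block implicit GEMM accumulation (illustrative only).
    // wei_block is laid out [S][R][CPerBlock][KPerBlock], in_block is
    // [CPerBlock][BPerBlock + BGhostRead] with BGhostRead = (S-1)*Wi + (R-1),
    // and out_block accumulates [KPerBlock][BPerBlock].
    template <unsigned S, unsigned R, unsigned CPerBlock, unsigned KPerBlock, unsigned BPerBlock, unsigned Wi>
    void implicit_gemm_reference(const float* wei_block, const float* in_block, float* out_block)
    {
        constexpr unsigned BGhostRead = (S - 1) * Wi + (R - 1);

        for(unsigned s = 0; s < S; ++s)
            for(unsigned r = 0; r < R; ++r)
                for(unsigned c = 0; c < CPerBlock; ++c)
                    for(unsigned k = 0; k < KPerBlock; ++k)
                        for(unsigned b = 0; b < BPerBlock; ++b)
                        {
                            // a_mtx[c][k] = wei[s][r][c][k]
                            const float a = wei_block[((s * R + r) * CPerBlock + c) * KPerBlock + k];
                            // b_mtx[c][b] = in[c][b + s*Wi + r]  (the shifted window)
                            const float x = in_block[c * (BPerBlock + BGhostRead) + b + s * Wi + r];
                            // c_mtx[k][b] += a_mtx[c][k] * b_mtx[c][b]  (transpose(a) * b)
                            out_block[k * BPerBlock + b] += a * x;
                        }
    }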