gaoqiong / composable_kernel · Commits

Commit b7d05245, authored May 15, 2019 by Chao Liu
adding implicit gemm v3
parent 4957d5a3
Showing 9 changed files with 397 additions and 57 deletions
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp                  +17  -17
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp   +1   -1
src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp                   +309   -0
src/include/tensor.hpp                                                                       +3   -3
src/include/threadwise_2d_tensor_op.hip.hpp                                                  +1   -1
src/include/threadwise_direct_convolution.hip.hpp                                            +3   -3
src/include/threadwise_gemm.hip.hpp                                                         +17   -2
src/include/threadwise_tensor_slice_op.hip.hpp                                              +45  -29
src/tensor.cpp                                                                               +1   -1
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp

@@ -3,8 +3,8 @@
 #include "ConstantTensorDescriptor.hip.hpp"
 #include "ConstantMatrixDescriptor.hip.hpp"
 #include "blockwise_2d_tensor_op.hip.hpp"
-#include "blockwise_nd_tensor_op.hip.hpp"
-#include "threadwise_nd_tensor_op.hip.hpp"
+#include "blockwise_tensor_slice_op.hip.hpp"
+#include "threadwise_tensor_slice_op.hip.hpp"
 #include "threadwise_4d_tensor_op.hip.hpp"
 #include "blockwise_batched_gemm.hip.hpp"

@@ -73,7 +73,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
     constexpr index_t Y = wei_c_y_x_k_global_desc.GetLength(I1);
     constexpr index_t X = wei_c_y_x_k_global_desc.GetLength(I2);

-    // divide block work: [K, Ho, Wo, N]
+    // divide block work: [N, K, Ho, Wo]
     static_assert(N % NPerBlock == 0 && K % KPerBlock == 0 && C % CPerBlock == 0 &&
                       Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0,
                   "wrong! cannot evenly divide work for workgroup ");

@@ -128,18 +128,18 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
     // input: format is [N, C, Hi, Wi] to [C, Hi, Wi, N]
     constexpr auto map_chwn2nchw = Sequence<1, 2, 3, 0>{};

     const auto blockwise_in_copy_reorder =
-        BlockwiseTensorSliceReorderCopy_v3<BlockSize,
-                                           Float,
-                                           decltype(in_n_c_h_w_global_desc),
-                                           decltype(in_c_h_w_n_block_desc),
-                                           Sequence<NPerBlock, CPerBlock, HoPerBlock, WoPerBlock>,
-                                           InBlockReorderSrcSubLengths_NCHW,
-                                           InBlockReorderSrcClusterLengths_NCHW,
-                                           decltype(map_chwn2nchw),
-                                           InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
-                                           InBlockReorderDataPerRead_W,
-                                           InBlockReorderDataPerWrite_N>{};
+        BlockwiseNdTensorCopyReorder_v3<BlockSize,
+                                        Float,
+                                        decltype(in_n_c_h_w_global_desc),
+                                        decltype(in_c_h_w_n_block_desc),
+                                        Sequence<NPerBlock, CPerBlock, HoPerBlock, WoPerBlock>,
+                                        InBlockReorderSrcSubLengths_NCHW,
+                                        InBlockReorderSrcClusterLengths_NCHW,
+                                        decltype(map_chwn2nchw),
+                                        InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
+                                        InBlockReorderDataPerRead_W,
+                                        InBlockReorderDataPerWrite_N>{};

     // blockwise wei copy
     //   format is [CPerBlock, KPerBlock]

@@ -390,7 +390,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
     constexpr auto map_out_global2thread = Sequence<7, 8, 9, 0, 1, 2, 3, 4, 5, 6>{};

-    threadwise_nd_tensor_copy_reorder_given_dst2src_v2(out_10d_thread_desc,
+    threadwise_tensor_slice_copy_reorder_given_dst2src_v2(out_10d_thread_desc,
                                                        p_out_thread,
                                                        out_10d_global_desc,

@@ -439,7 +439,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
     constexpr auto map_out_global2thread = Sequence<8, 9, 0, 1, 2, 3, 4, 5, 6, 7>{};

-    threadwise_nd_tensor_copy_reorder_given_dst2src_v2(out_10d_thread_desc,
+    threadwise_tensor_slice_copy_reorder_given_dst2src_v2(out_10d_thread_desc,
                                                        p_out_thread,
                                                        out_10d_global_desc,
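The Sequence maps in this file (map_chwn2nchw, map_out_global2thread) list, for each destination dimension, the source dimension it is taken from. A minimal standalone sketch of that convention (plain C++ with std::array stand-ins; reorder_given_new2old is a hypothetical helper, not the repo's API):

#include <array>
#include <cstdio>

// Hypothetical stand-in for the repo's Sequence-driven reorder:
// dst[i] = src[map[i]], i.e. "given new index, pick old dimension".
template <std::size_t N>
std::array<int, N> reorder_given_new2old(const std::array<int, N>& src,
                                         const std::array<std::size_t, N>& map)
{
    std::array<int, N> dst{};
    for(std::size_t i = 0; i < N; ++i)
        dst[i] = src[map[i]];
    return dst;
}

int main()
{
    // NCHW lengths [N, C, H, W]; map_chwn2nchw = {1, 2, 3, 0} picks [C, H, W, N]
    std::array<int, 4> nchw{128, 8, 28, 28};
    auto chwn = reorder_given_new2old(nchw, {1, 2, 3, 0});
    std::printf("CHWN = [%d, %d, %d, %d]\n", chwn[0], chwn[1], chwn[2], chwn[3]);
}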
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp

@@ -5,7 +5,7 @@
 #include "blockwise_4d_tensor_op.hip.hpp"
 #include "blockwise_2d_tensor_op.hip.hpp"
 #include "threadwise_2d_tensor_op.hip.hpp"
-#include "threadwise_nd_tensor_op.hip.hpp"
+#include "threadwise_tensor_slice_op.hip.hpp"
 #include "blockwise_gemm.hip.hpp"

 // define B = flatten(N, Hi, Wi)
src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp  (new file, mode 100644)

#pragma once
#include "common.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp"
#include "ConstantMatrixDescriptor.hip.hpp"
#include "blockwise_2d_tensor_op.hip.hpp"
#include "blockwise_tensor_slice_op.hip.hpp"
#include "threadwise_tensor_slice_op.hip.hpp"
#include "threadwise_gemm.hip.hpp"
#include "blockwise_gemm.hip.hpp"

// define B = merge(N, Ho, Wo)
template <index_t GridSize,
          index_t BlockSize,
          class Float,
          class InGlobalDesc,
          class WeiGlobalDesc,
          class OutGlobalDesc,
          index_t BPerBlock,
          index_t KPerBlock,
          index_t CPerBlock,
          index_t N1,
          index_t N2,
          class InBlockCopySubLengths_N1_N2_C_B,     // tuning parameters for the
          class InBlockCopyClusterLengths_N1_N2_C_B, // blockwise copies
          index_t InBlockCopyDstDataPerWrite_N2,
          index_t WeiBlockCopyDataPerRead_K,
          index_t GemmMPerThreadSubC,
          index_t GemmNPerThreadSubC,
          index_t GemmMLevel0Cluster,
          index_t GemmNLevel0Cluster,
          index_t GemmMLevel1Cluster,
          index_t GemmNLevel1Cluster,
          index_t GemmKPerThreadLoop,
          index_t GemmDataPerReadA,
          index_t GemmDataPerReadB>
struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
{
    __device__ void Run(const Float* const __restrict__ p_in_global,
                        const Float* const __restrict__ p_wei_global,
                        Float* const __restrict__ p_out_global) const
    {
        // this is a mess
        // TODO: more elegant way of specifying (or calculating) performance variables
        static_assert(N2 == GemmNPerThreadSubC, "wrong!");

        static_assert(KPerBlock ==
                          N1 * GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster,
                      "wrong!");

        static_assert(KPerBlock %
                              (N1 * GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster) ==
                          0,
                      "wrong!");

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};
        constexpr auto I4 = Number<4>{};
        constexpr auto I5 = Number<5>{};
        constexpr auto I6 = Number<6>{};
        constexpr auto I7 = Number<7>{};

        constexpr auto in_n_c_h_w_global_desc  = InGlobalDesc{};
        constexpr auto wei_c_y_x_k_global_desc = WeiGlobalDesc{};
        constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{};

        constexpr index_t N  = in_n_c_h_w_global_desc.GetLength(I0);
        constexpr index_t C  = in_n_c_h_w_global_desc.GetLength(I1);
        constexpr index_t Hi = in_n_c_h_w_global_desc.GetLength(I2);
        constexpr index_t Wi = in_n_c_h_w_global_desc.GetLength(I3);

        constexpr index_t K  = out_n_k_h_w_global_desc.GetLength(I1);
        constexpr index_t Ho = out_n_k_h_w_global_desc.GetLength(I2);
        constexpr index_t Wo = out_n_k_h_w_global_desc.GetLength(I3);

        constexpr index_t Y = wei_c_y_x_k_global_desc.GetLength(I1);
        constexpr index_t X = wei_c_y_x_k_global_desc.GetLength(I2);

        static_assert(N % (N1 * N2) == 0, "wrong! cannot divide N evenly among thread");

        constexpr index_t N0 = N / (N1 * N2);

        constexpr index_t B = N0 * Ho * Wo;

        // divide block work by [K, B]
        static_assert(K % KPerBlock == 0 && B % BPerBlock == 0 && C % CPerBlock == 0,
                      "wrong! cannot divide work evenly among block");

        constexpr index_t KBlockWork = K / KPerBlock;
        constexpr index_t BBlockWork = B / BPerBlock;

        constexpr auto block_work_desc =
            make_ConstantTensorDescriptor(Sequence<KBlockWork, BBlockWork>{});

        const auto block_work_multi_id = block_work_desc.GetMultiIndex(get_block_1d_id());

        const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock;
        const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock;

        // input tensor
        //   memory layout descriptor in device memory [N0, N1, N2, C, H, W]
        constexpr auto in_n0_n1_n2_c_h_w_global_mem_desc =
            in_n_c_h_w_global_desc.Fold(I0, Sequence<N1, N2>{});

        //   merged tensor descriptor in device memory [N1, N2, C, B], src of blockwise copy
        constexpr auto in_n1_n2_c_b_global_merged_desc =
            in_n0_n1_n2_c_h_w_global_mem_desc.ReorderGivenNew2Old(Sequence<1, 2, 3, 0, 4, 5>{})
                .Slice(I4, Number<Ho>{})
                .Slice(I5, Number<Wo>{})
                .Merge(I3, I5);

        //   memory layout descriptor in LDS [C, N1, B, N2]
        //   be careful of LDS alignment
        constexpr auto in_c_n1_b_n2_block_mem_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, N1, BPerBlock, N2>{}, Number<InBlockCopyDstDataPerWrite_N2>{});

        //   tensor descriptor in LDS [N1, N2, C, B], dst of blockwise copy
        constexpr auto in_n1_n2_c_b_block_desc =
            in_c_n1_b_n2_block_mem_desc.ReorderGivenNew2Old(Sequence<1, 3, 0, 2>{});

        // this check is ad-hoc
        // TODO: need to properly implement tensor descriptor with alignment
        static_assert(in_c_n1_b_n2_block_mem_desc.GetStride(I1) % GemmDataPerReadB == 0,
                      "GemmDataPerReadB alignment requirement is not satisfied");

        // input blockwise copy
        //   slice a merged tensor, reorder and copy to a normal tensor
        //   this copy operator already has blockwise offset built-in
        const auto blockwise_in_copy =
            BlockwiseTensorSliceCopy_generic_v1<BlockSize,
                                                Float,
                                                decltype(in_n1_n2_c_b_global_merged_desc),
                                                decltype(in_n1_n2_c_b_block_desc),
                                                Sequence<N1, N2, CPerBlock, BPerBlock>,
                                                InBlockCopySubLengths_N1_N2_C_B,
                                                InBlockCopyClusterLengths_N1_N2_C_B,
                                                Sequence<2, 0, 1, 3>, // thread_arrange_order [C, N1, N2, B]
                                                Sequence<0, 1, 2, 3>, // src_access_order [N1, N2, C, B]
                                                Sequence<2, 0, 3, 1>  // dst_access_order [C, N1, B, N2]
                                                >({0, 0, 0, b_block_data_on_global}, {0, 0, 0, 0});

        // weight tensor
        //   tensor descriptor in device memory, src of blockwise copy
        constexpr auto wei_c_k_global_desc = wei_c_y_x_k_global_desc.Extract(Sequence<0, 3>{});

        //   tensor descriptor in LDS, dst of blockwise copy
        //   be careful of LDS alignment
        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, KPerBlock>{},
            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

        // operator for blockwise copy of weight into LDS
        //   slicing a tensor
        //   this copy operator already has tensor offset built-in
        const auto blockwise_wei_copy =
            Blockwise2dTensorCopy3<BlockSize,
                                   Float,
                                   decltype(wei_c_k_global_desc),
                                   decltype(wei_c_k_block_desc),
                                   decltype(wei_c_k_block_desc.GetLengths()),
                                   WeiBlockCopyDataPerRead_K>({0, k_block_data_on_global}, {0, 0});

        // GEMM definition
        // c_mtx += transpose(a_mtx) * b_mtx
        //   a_mtx[CPerBlock, KPerBlock] is in LDS
        //   b_mtx[CPerBlock, N1 * BPerBlock * N2] is in LDS
        //   c_mtx[KPerBlock, N1 * BPerBlock * N2] is distributed among threads, and saved in
        //     register
        constexpr auto a_c_k_block_mtx_desc = make_ConstantMatrixDescriptor(
            Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_c_k_block_desc.GetStride(I0)>{});

        constexpr auto b_c_n1bn2_block_mtx_desc =
            make_ConstantMatrixDescriptor(Number<CPerBlock>{},
                                          Number<N1 * BPerBlock * N2>{},
                                          Number<in_c_n1_b_n2_block_mem_desc.GetStride(I0)>{});

        // sanity check
        static_assert(
            KPerBlock % (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster) == 0,
            "wrong!");

        constexpr index_t GemmMRepeat =
            KPerBlock / (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster);

        // c_thread_mtx definition: this is a mess
        // TODO: more elegant way of defining c_thread_mtx
        constexpr auto c_k0k2_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor(
            Number<GemmMRepeat * GemmMPerThreadSubC>{}, Number<N1 * N2>{});

        const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2<
            BlockSize,
            decltype(a_c_k_block_mtx_desc),
            decltype(b_c_n1bn2_block_mtx_desc),
            decltype(c_k0k2_n1n2_thread_mtx_desc),
            GemmMPerThreadSubC,
            GemmNPerThreadSubC,
            GemmMLevel0Cluster,
            GemmNLevel0Cluster,
            GemmMLevel1Cluster,
            GemmNLevel1Cluster,
            GemmKPerThreadLoop,
            GemmDataPerReadA,
            GemmDataPerReadB>{};

        // LDS allocation for input and weight: be careful of alignment
        constexpr index_t max_align = mod_conv::max(InBlockCopyDstDataPerWrite_N2,
                                                    WeiBlockCopyDataPerRead_K,
                                                    GemmDataPerReadA,
                                                    GemmDataPerReadB);

        constexpr index_t in_block_space =
            in_c_n1_b_n2_block_mem_desc.GetElementSpace(Number<max_align>{});

        constexpr index_t wei_block_space = wei_c_k_block_desc.GetElementSpace(Number<max_align>{});

        __shared__ Float p_in_block[in_block_space];
        __shared__ Float p_wei_block[wei_block_space];

        // register allocation for output
        Float p_out_thread[c_k0k2_n1n2_thread_mtx_desc.GetElementSpace()];

        // zero out threadwise output
        threadwise_matrix_set_zero(c_k0k2_n1n2_thread_mtx_desc, p_out_thread);

        // do work
        for(index_t y = 0; y < Y; ++y)
        {
            for(index_t x = 0; x < X; ++x)
            {
                // calculate origin of block input and weight tensor on global memory
                const Float* p_in_block_on_global =
                    p_in_global + in_n_c_h_w_global_desc.Get1dIndex(0, 0, y, x);

                const Float* p_wei_block_on_global =
                    p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, 0);

                for(index_t c_block_data_on_global = 0; c_block_data_on_global < C;
                    c_block_data_on_global += CPerBlock,
                            p_in_block_on_global +=
                            CPerBlock * in_n_c_h_w_global_desc.GetStride(I1),
                            p_wei_block_on_global +=
                            CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0))
                {
                    blockwise_in_copy.run(p_in_block_on_global, p_in_block);
                    blockwise_wei_copy.run(p_wei_block_on_global, p_wei_block);

                    __syncthreads();

                    blockwise_gemm.run(p_wei_block, p_in_block, p_out_thread);

                    __syncthreads();
                }
            }
        }

        // copy output: register to global memory
        {
            constexpr index_t K2 = GemmMPerThreadSubC;
            constexpr index_t K1 = GemmMLevel0Cluster * GemmMLevel1Cluster;
            constexpr index_t K0 = K / (K1 * K2);

            // define tensor descriptor for threadwise copy
            //   output tensor (also, memory layout) descriptor in register, src of threadwise
            //   copy
            constexpr auto out_k0_k1_k2_n1_b_n2_thread_mem_desc = make_ConstantTensorDescriptor(
                Sequence<KPerBlock / (K1 * K2), 1, K2, N1, 1, 1, 1, N2>{});

            //   same register data, viewed as merged tensor [K0, K1, K2, N1, B, N2]
            constexpr auto out_k0_k1_k2_n1_b_n2_thread_desc =
                out_k0_k1_k2_n1_b_n2_thread_mem_desc.Merge(I4, I6);

            //   output memory layout descriptor in device memory
            constexpr auto out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc =
                out_n_k_h_w_global_desc.Fold(I1, Sequence<K1, K2>{}).Fold(I0, Sequence<N1, N2>{});

            //   output merged tensor descriptor in device memory, dst of threadwise copy
            constexpr auto out_k0_k1_k2_n1_b_n2_global_merged_desc =
                out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
                    .ReorderGivenNew2Old(Sequence<3, 4, 5, 1, 0, 6, 7, 2>{})
                    .Merge(I4, I6);

            // calculate origin of thread output tensor on global memory
            //   blockwise GEMM c matrix starting index
            const auto c_thread_mtx_on_block =
                blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());

            //   origin of thread tensor on global
            const index_t k_thread_data_on_global =
                k_block_data_on_global + c_thread_mtx_on_block.row;

            const index_t b_thread_data_on_global =
                b_block_data_on_global + c_thread_mtx_on_block.col;

            // output merged global tensor descriptor, for calculating origin of thread tensor
            //   in global memory
            constexpr auto out_k_n1_b_n2_global_merged_desc =
                out_k0_k1_k2_n1_b_n2_global_merged_desc.Unfold(I0, I2);

            // origin of thread tensor in global memory
            Float* const p_out_thread_on_global =
                p_out_global +
                out_k_n1_b_n2_global_merged_desc.Get1dIndex(k_thread_data_on_global, 0, 0, 0);

            // copy
            threadwise_tensor_slice_copy_generic(
                out_k0_k1_k2_n1_b_n2_thread_desc, // src thread tensor (in register) descriptor
                p_out_thread,                     // origin of src
                {0, 0, 0, 0, 0, 0},               // starting point of slice, w.r.t. origin of src
                out_k0_k1_k2_n1_b_n2_global_merged_desc, // dst global merged tensor (in device
                                                         // mem) descriptor
                p_out_thread_on_global,                  // origin of dst
                {0, 0, 0, 0, b_thread_data_on_global, 0}, // starting point of slice, w.r.t.
                                                          // origin of dst
                out_k0_k1_k2_n1_b_n2_thread_desc.GetLengths(), // slice lengths
                Sequence<2, 3, 4, 0, 5, 1>{}                   // order of dimension access
                );
        }
    }
};
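The heart of v3 is the merged dimension B = N0 * Ho * Wo and the [KBlockWork, BBlockWork] block grid. A small host-side sketch of that arithmetic (standalone C++; all sizes are made-up examples, and the flat-id decode mirrors what block_work_desc.GetMultiIndex is expected to compute, rather than calling the repo's API):

#include <cstdio>

// Standalone illustration of the v3 work split: B = N0 * Ho * Wo and the
// [KBlockWork, BBlockWork] grid decoded from a flat workgroup id.
int main()
{
    const int N = 128, K = 256, Ho = 28, Wo = 28; // tensor sizes (hypothetical)
    const int N1 = 2, N2 = 4;                     // per-thread N split
    const int KPerBlock = 64, BPerBlock = 49;     // block tile (hypothetical)

    const int N0 = N / (N1 * N2); // 16
    const int B  = N0 * Ho * Wo;  // 12544

    const int KBlockWork = K / KPerBlock; // 4
    const int BBlockWork = B / BPerBlock; // 256

    // decode a flat block id the way GetMultiIndex would for a 2-d descriptor
    const int block_id = 261;
    const int k_block  = block_id / BBlockWork; // 1
    const int b_block  = block_id % BBlockWork; // 5

    std::printf("grid = %d x %d, block %d -> (k=%d, b=%d), k offset %d, b offset %d\n",
                KBlockWork, BBlockWork, block_id, k_block, b_block,
                k_block * KPerBlock, b_block * BPerBlock);
}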
src/include/tensor.hpp

@@ -85,7 +85,7 @@ struct TensorDescriptor
     {
     }

-    std::size_t GetDimension() const;
+    std::size_t GetNumOfDimension() const;
     std::size_t GetElementSize() const;
     std::size_t GetElementSpace() const;

@@ -95,7 +95,7 @@ struct TensorDescriptor
     template <class... Is>
     std::size_t Get1dIndex(Is... is) const
     {
-        assert(sizeof...(Is) == this->GetDimension());
+        assert(sizeof...(Is) == this->GetNumOfDimension());
         std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
         return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
     }

@@ -206,7 +206,7 @@ struct Tensor
     template <class G>
     void GenerateTensorValue(G g, std::size_t num_thread = 1)
     {
-        switch(mDesc.GetDimension())
+        switch(mDesc.GetNumOfDimension())
         {
         case 1:
         {
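The renamed GetNumOfDimension feeds the assert in Get1dIndex, whose body is a dot product of the multi-index with the strides. The same arithmetic, standalone (plain C++; get_1d_index is a stand-in, not the repo's TensorDescriptor):

#include <cassert>
#include <cstdio>
#include <initializer_list>
#include <numeric>
#include <vector>

// Same arithmetic as TensorDescriptor::Get1dIndex: offset = sum(index[d] * stride[d]).
std::size_t get_1d_index(std::initializer_list<std::size_t> is,
                         const std::vector<std::size_t>& strides)
{
    assert(is.size() == strides.size());
    return std::inner_product(is.begin(), is.end(), strides.begin(), std::size_t{0});
}

int main()
{
    // packed [2, 3, 4] tensor -> strides [12, 4, 1]
    std::vector<std::size_t> strides{12, 4, 1};
    std::printf("offset of (1, 2, 3) = %zu\n", get_1d_index({1, 2, 3}, strides)); // 12+8+3 = 23
}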
src/include/threadwise_2d_tensor_op.hip.hpp

@@ -88,7 +88,7 @@ threadwise_2d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
         SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, MapDst2Src{}, f_copy);
 }

-#if 0 // replaced threadwise_nd_tensor_copy
+#if 0 // replaced threadwise_tensor_slice_copy
 template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
 __device__ void threadwise_2d_tensor_copy(
     SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, SrcOpLengths)
src/include/threadwise_direct_convolution.hip.hpp

 #pragma once
 #include "ConstantTensorDescriptor.hip.hpp"
-#include "threadwise_nd_tensor_op.hip.hpp"
+#include "threadwise_tensor_slice_op.hip.hpp"

 // optimized for scenario where p_in, p_wei, p_out are in register
 template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>

@@ -85,11 +85,11 @@ __device__ void threadwise_direct_convolution_2(InDesc,
     TInWei p_wei_reg[wei_reg_desc.GetElementSpace()];

     // copy input tensor into register
-    threadwise_nd_tensor_copy(
+    threadwise_tensor_slice_copy(
         in_desc, p_in, in_reg_desc, p_in_reg, in_reg_desc.GetLengths(), Number<1>{});

     // copy weight tensor into register
-    threadwise_nd_tensor_copy(
+    threadwise_tensor_slice_copy(
         wei_desc, p_wei, wei_reg_desc, p_wei_reg, wei_reg_desc.GetLengths(), Number<1>{});

     // do convolution
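threadwise_tensor_slice_copy here stages small input and weight slices into registers before the convolution. A standalone sketch of what such a strided slice copy computes (plain C++; the repo's version is descriptor-driven and vectorized, so slice_copy_2d is only an illustration):

#include <cstdio>

// Plain-C++ sketch of a threadwise slice copy: walk a 2-d slice of lengths
// [l0, l1] and move data between two strided layouts. The repo's version does
// this with compile-time descriptors and vector_type loads.
void slice_copy_2d(const float* src, int src_s0, int src_s1,
                   float* dst, int dst_s0, int dst_s1,
                   int l0, int l1)
{
    for(int i = 0; i < l0; ++i)
        for(int j = 0; j < l1; ++j)
            dst[i * dst_s0 + j * dst_s1] = src[i * src_s0 + j * src_s1];
}

int main()
{
    float in[4 * 8]; // [4, 8] row-major "global" tile
    for(int i = 0; i < 32; ++i)
        in[i] = float(i);

    float reg[3 * 3]; // packed [3, 3] "register" buffer
    slice_copy_2d(in, 8, 1, reg, 3, 1, 3, 3);

    std::printf("reg[2][2] = %g\n", reg[2 * 3 + 2]); // element (2,2) of the slice = in[2*8+2] = 18
}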
src/include/threadwise_gemm.hip.hpp

 #pragma once
+#include "common.hip.hpp"
+#include "ConstantMatrixDescriptor.hip.hpp"
+
+template <class Float, class Matrix>
+__device__ void threadwise_matrix_set_zero(Matrix, Float* __restrict__ p_thread)
+{
+    for(index_t i = 0; i < Matrix::NRow(); ++i)
+    {
+        for(index_t j = 0; j < Matrix::NCol(); ++j)
+        {
+            const index_t id = Matrix::Get1dIndex(i, j);
+            p_thread[id] = 0;
+        }
+    }
+}

 template <class Float,
           class SrcMatrix,

@@ -64,9 +79,9 @@ __device__ void threadwise_gemm(MatrixA,
     for(index_t k = 0; k < K; ++k)
     {
-        for(index_t i = 0; i < M; i++)
+        for(index_t i = 0; i < M; ++i)
         {
-            for(index_t j = 0; j < N; j++)
+            for(index_t j = 0; j < N; ++j)
             {
                 const index_t aindex = a_mtx.Get1dIndex(k, i); // A is transposed
                 const index_t bindex = b_mtx.Get1dIndex(k, j);
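The new threadwise_matrix_set_zero clears the accumulator that threadwise_gemm then updates as c += transpose(a) * b, with a stored K-major (hence a_mtx.Get1dIndex(k, i)). A standalone numeric check of that loop nest (plain C++ with row-major arrays in place of matrix descriptors):

#include <cstdio>

// c[M][N] += transpose(a)[M][K] * b[K][N], with a stored as [K][M] ("A is transposed").
void threadwise_gemm_ref(const float* a, const float* b, float* c, int M, int N, int K)
{
    for(int k = 0; k < K; ++k)
        for(int i = 0; i < M; ++i)
            for(int j = 0; j < N; ++j)
                c[i * N + j] += a[k * M + i] * b[k * N + j];
}

int main()
{
    // 2x2 check: A^T = [[1,3],[2,4]] times B = [[5,6],[7,8]]
    const float a[4] = {1, 2, 3, 4}; // stored [K=2][M=2]
    const float b[4] = {5, 6, 7, 8}; // stored [K=2][N=2]
    float c[4]       = {};
    threadwise_gemm_ref(a, b, c, 2, 2, 2);
    std::printf("c = [%g %g; %g %g]\n", c[0], c[1], c[2], c[3]); // [26 30; 38 44]
}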
src/include/threadwise_nd_tensor_op.hip.hpp → src/include/threadwise_tensor_slice_op.hip.hpp  (renamed)

@@ -3,18 +3,18 @@
 // need to assume src and dst is aligned
 template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, index_t DataPerRead>
-__device__ void threadwise_nd_tensor_copy(SrcDesc,
-                                          const Float* __restrict__ p_src,
-                                          DstDesc,
-                                          Float* __restrict__ p_dst,
-                                          SrcOpLengths,
-                                          Number<DataPerRead>)
+__device__ void threadwise_tensor_slice_copy(SrcDesc,
+                                             const Float* __restrict__ p_src,
+                                             DstDesc,
+                                             Float* __restrict__ p_dst,
+                                             SrcOpLengths,
+                                             Number<DataPerRead>)
 {
     using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;

     constexpr index_t nDim = SrcOpLengths::GetSize();

-    static_assert(SrcDesc{}.GetDimension() == nDim && DstDesc{}.GetDimension() == nDim,
+    static_assert(SrcDesc{}.GetNumOfDimension() == nDim && DstDesc{}.GetNumOfDimension() == nDim,
                   "wrong! dimension not consistent");

     constexpr auto src_desc = SrcDesc{};

@@ -63,7 +63,7 @@ __device__ void threadwise_nd_tensor_copy(SrcDesc,
     });
 }

-// write in order of src
+// access in order of src
 template <class SrcData,
           class DstData,
           class SrcDesc,

@@ -71,12 +71,12 @@ template <class SrcData,
           class SrcOpLengths,
           class MapDst2Src>
 __device__ void
-threadwise_nd_tensor_copy_reorder_given_dst2src_v1(SrcDesc,
-                                                   const SrcData* __restrict__ p_src,
-                                                   DstDesc,
-                                                   DstData* __restrict__ p_dst,
-                                                   SrcOpLengths,
-                                                   MapDst2Src)
+threadwise_tensor_slice_copy_reorder_given_dst2src_v1(SrcDesc,
+                                                      const SrcData* __restrict__ p_src,
+                                                      DstDesc,
+                                                      DstData* __restrict__ p_dst,
+                                                      SrcOpLengths,
+                                                      MapDst2Src)
 {
     constexpr auto src_desc = SrcDesc{};
     constexpr auto dst_desc = DstDesc{};

@@ -92,7 +92,7 @@ threadwise_nd_tensor_copy_reorder_given_dst2src_v1(SrcDesc,
     });
 }

-// write in order of dst
+// access in order of dst
 template <class SrcData,
           class DstData,
           class SrcDesc,

@@ -100,12 +100,12 @@ template <class SrcData,
           class SrcOpLengths,
           class MapDst2Src>
 __device__ void
-threadwise_nd_tensor_copy_reorder_given_dst2src_v2(SrcDesc,
-                                                   const SrcData* __restrict__ p_src,
-                                                   DstDesc,
-                                                   DstData* __restrict__ p_dst,
-                                                   SrcOpLengths,
-                                                   MapDst2Src)
+threadwise_tensor_slice_copy_reorder_given_dst2src_v2(SrcDesc,
+                                                      const SrcData* __restrict__ p_src,
+                                                      DstDesc,
+                                                      DstData* __restrict__ p_dst,
+                                                      SrcOpLengths,
+                                                      MapDst2Src)
 {
     constexpr auto src_desc = SrcDesc{};
     constexpr auto dst_desc = DstDesc{};

@@ -123,20 +123,22 @@ threadwise_nd_tensor_copy_reorder_given_dst2src_v2(SrcDesc,
     });
 }

-// write in order of dst
+// access in order of dst
 // manually pack data into vector before write
 template <class Float,
           class SrcDesc,
           class DstDesc,
           class SrcOpLengths,
           class MapDst2Src,
           index_t DstDataPerWrite>
-__device__ void threadwise_nd_tensor_copy_reorder_given_dst2src_v3(SrcDesc,
-                                                                   const Float* __restrict__ p_src,
-                                                                   DstDesc,
-                                                                   Float* __restrict__ p_dst,
-                                                                   SrcOpLengths,
-                                                                   MapDst2Src,
-                                                                   Number<DstDataPerWrite>)
+__device__ void
+threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc,
+                                                      const Float* __restrict__ p_src,
+                                                      DstDesc,
+                                                      Float* __restrict__ p_dst,
+                                                      SrcOpLengths,
+                                                      MapDst2Src,
+                                                      Number<DstDataPerWrite>)
 {
     using vector_t = typename vector_type<Float, DstDataPerWrite>::MemoryType;

@@ -190,3 +192,17 @@ __device__ void threadwise_nd_tensor_copy_reorder_given_dst2src_v3(SrcDesc,
         });
     });
 }
+
+template <class Float, class SrcDesc, class DstDesc, class SliceLengths, class DimAccessOrder>
+__device__ void
+threadwise_tensor_slice_copy_generic(SrcDesc,
+                                     const Float* __restrict__ p_src,
+                                     Array<index_t, SrcDesc::GetNumOfDimension()> src_multi_offset,
+                                     DstDesc,
+                                     Float* __restrict__ p_dst,
+                                     Array<index_t, DstDesc::GetNumOfDimension()> dst_multi_offset,
+                                     SliceLengths,
+                                     DimAccessOrder)
+{
+    // not implemented
+}
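threadwise_tensor_slice_copy_generic is stubbed out ("not implemented") in this commit. One plausible scalar semantics for its arguments, sketched standalone (plain C++; this is an assumption about intent inferred from the parameter names, not the repo's eventual implementation):

#include <array>
#include <cstdio>

// Hypothetical reference semantics for a generic N-d slice copy: visit every
// point of `lengths` following `order` (outermost loop first), and move
// src[src_off + idx] -> dst[dst_off + idx] through each side's strides.
template <std::size_t NDim>
void slice_copy_generic(const float* src, const std::array<int, NDim>& src_strides,
                        const std::array<int, NDim>& src_off,
                        float* dst, const std::array<int, NDim>& dst_strides,
                        const std::array<int, NDim>& dst_off,
                        const std::array<int, NDim>& lengths,
                        const std::array<int, NDim>& order,
                        std::array<int, NDim>& idx, std::size_t depth = 0)
{
    if(depth == NDim)
    {
        int s = 0, d = 0;
        for(std::size_t t = 0; t < NDim; ++t)
        {
            s += (src_off[t] + idx[t]) * src_strides[t];
            d += (dst_off[t] + idx[t]) * dst_strides[t];
        }
        dst[d] = src[s];
        return;
    }
    const int dim = order[depth]; // dimension visited at this nesting level
    for(idx[dim] = 0; idx[dim] < lengths[dim]; ++idx[dim])
        slice_copy_generic(src, src_strides, src_off, dst, dst_strides, dst_off,
                           lengths, order, idx, depth + 1);
}

int main()
{
    float src[24], dst[24] = {};
    for(int i = 0; i < 24; ++i)
        src[i] = float(i);

    std::array<int, 2> idx{};
    // copy a [2, 3] slice starting at (1, 1) of a [4, 6] row-major tensor
    slice_copy_generic<2>(src, {6, 1}, {1, 1}, dst, {6, 1}, {1, 1},
                          {2, 3}, {1, 0}, idx);
    std::printf("dst[1*6+2] = %g\n", dst[8]); // 8
}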
src/tensor.cpp

@@ -25,7 +25,7 @@ void TensorDescriptor::CalculateStrides()
         mLens.rbegin(), mLens.rend() - 1, mStrides.rbegin() + 1, std::multiplies<std::size_t>());
 }

-std::size_t TensorDescriptor::GetDimension() const { return mLens.size(); }
+std::size_t TensorDescriptor::GetNumOfDimension() const { return mLens.size(); }

 std::size_t TensorDescriptor::GetElementSize() const
 {
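CalculateStrides is a running product over the reversed lengths: the innermost stride is 1 and each outer stride multiplies in the next inner length. The same arithmetic, standalone (plain C++; the strides.back() = 1 initialization is assumed here, since it sits outside the hunk shown):

#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

int main()
{
    // lengths [2, 3, 4] of a packed tensor
    std::vector<std::size_t> lens{2, 3, 4};
    std::vector<std::size_t> strides(lens.size());

    strides.back() = 1; // innermost dimension is contiguous (assumed initialization)
    std::partial_sum(lens.rbegin(), lens.rend() - 1, strides.rbegin() + 1,
                     std::multiplies<std::size_t>());

    // expect strides [12, 4, 1]
    std::printf("strides = [%zu, %zu, %zu]\n", strides[0], strides[1], strides[2]);
}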