Commit 917d7a2b
Authored Jun 03, 2019 by Chao Liu
Parent 33c38e2a

use vectorized read and write for threadwise generic tensor copy
Showing 11 changed files with 177 additions and 123 deletions:
  src/include/blockwise_generic_tensor_slice_op.hip.hpp (+19 −17)
  src/include/common.hip.hpp (+9 −0)
  src/include/conv_common.hip.hpp (+7 −5)
  src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp (+1 −1)
  src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp (+1 −1)
  src/include/gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp (+9 −8)
  src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp (+9 −8)
  src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp (+13 −19)
  src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp (+10 −9)
  src/include/threadwise_generic_tensor_slice_op.hip.hpp (new file, +99 −0)
  src/include/threadwise_tensor_slice_op.hip.hpp (+0 −55)
src/include/blockwise_generic_tensor_slice_op.hip.hpp (+19 −17)

```diff
@@ -15,7 +15,7 @@ template <index_t BlockSize,
           class SrcAccessOrder,
           class DstAccessOrder,
           index_t SrcDataPerRead,
-          index_t DstDataPerRead>
+          index_t DstDataPerWrite>
 struct BlockwiseGenericTensorSliceCopy_v1
 {
     static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
```

```diff
@@ -217,14 +217,15 @@ struct BlockwiseGenericTensorSliceCopy_v1
             const index_t clipboard_offset =
                 thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
 
             // cannot not constexpr, why?
-            threadwise_generic_tensor_slice_copy(SrcDesc{},
-                                                 p_src + src_offset + mThreadSrcOffset,
-                                                 make_zero_array<index_t, nDim>(),
-                                                 thread_tensor_desc,
-                                                 p_clipboard + clipboard_offset,
-                                                 make_zero_array<index_t, nDim>(),
-                                                 thread_sub_tensor_lengths,
-                                                 SrcAccessOrder{});
+            threadwise_generic_tensor_slice_copy_v1(SrcDesc{},
+                                                    p_src + src_offset + mThreadSrcOffset,
+                                                    make_zero_array<index_t, nDim>(),
+                                                    thread_tensor_desc,
+                                                    p_clipboard + clipboard_offset,
+                                                    make_zero_array<index_t, nDim>(),
+                                                    thread_sub_tensor_lengths,
+                                                    SrcAccessOrder{},
+                                                    Number<SrcDataPerRead>{});
         });
     }
```

```diff
@@ -255,14 +256,15 @@ struct BlockwiseGenericTensorSliceCopy_v1
             const index_t dst_offset =
                 DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin);
 
             // cannot not constexpr, why?
-            threadwise_generic_tensor_slice_copy(thread_tensor_desc,
-                                                 p_clipboard + clipboard_offset,
-                                                 make_zero_array<index_t, nDim>(),
-                                                 DstDesc{},
-                                                 p_dst + dst_offset + mThreadDstOffset,
-                                                 make_zero_array<index_t, nDim>(),
-                                                 thread_sub_tensor_lengths,
-                                                 DstAccessOrder{});
+            threadwise_generic_tensor_slice_copy_v1(thread_tensor_desc,
+                                                    p_clipboard + clipboard_offset,
+                                                    make_zero_array<index_t, nDim>(),
+                                                    DstDesc{},
+                                                    p_dst + dst_offset + mThreadDstOffset,
+                                                    make_zero_array<index_t, nDim>(),
+                                                    thread_sub_tensor_lengths,
+                                                    DstAccessOrder{},
+                                                    Number<DstDataPerWrite>{});
         });
     }
```
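The only interface change in this file is the extra trailing argument: the blockwise copy now hands its vector width to the threadwise copy as an empty `Number<>` tag, so the width is a compile-time constant inside the callee. A minimal standalone sketch of that pattern (hypothetical `Number<>` and `copy_with_width`, not the repo's actual types):

```cpp
// Sketch: passing a compile-time access width through an interface as an
// empty tag type, the pattern this commit adds via Number<SrcDataPerRead>
// and Number<DstDataPerWrite>. Names here are hypothetical.
#include <cstddef>

template <std::size_t N>
struct Number {}; // stand-in for the repo's Number<> tag

template <std::size_t DataPerAccess>
void copy_with_width(const float* src, float* dst, std::size_t n, Number<DataPerAccess>)
{
    // DataPerAccess is a compile-time constant, so the compiler can unroll
    // the inner loop and emit vector loads/stores for it.
    for (std::size_t i = 0; i < n; i += DataPerAccess)
        for (std::size_t j = 0; j < DataPerAccess; ++j)
            dst[i + j] = src[i + j];
}

int main()
{
    float a[8] = {0, 1, 2, 3, 4, 5, 6, 7}, b[8] = {};
    copy_with_width(a, b, 8, Number<4>{}); // width chosen at compile time
}
```

Passing the width as a tag type rather than a runtime argument is what lets the callee pick a vector memory type at compile time, as the new threadwise copy below does.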
src/include/common.hip.hpp (+9 −0)

```diff
@@ -105,4 +105,13 @@ __host__ __device__ constexpr T min(T x, Ts... xs)
     return x < y ? x : y;
 }
 
+// this is wrong
+// TODO: implement correct least common multiple, instead of calling max()
+template <class T, class... Ts>
+__host__ __device__ constexpr T least_common_multiple(T x, Ts... xs)
+{
+    return max(x, xs...);
+}
+
 } // namespace mod_conv
```
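As its own comment admits, this `least_common_multiple` is a placeholder that just forwards to `max()`, which happens to give the right answer whenever every argument divides the largest one (e.g. power-of-two vector widths). A correct constexpr version via the Euclidean GCD is straightforward; this is a sketch of one possible replacement, not part of the commit:

```cpp
// Hypothetical correct replacement for the placeholder above.
template <class T>
__host__ __device__ constexpr T gcd(T a, T b)
{
    // Euclidean algorithm, expressed recursively to stay C++11-constexpr.
    return b == 0 ? a : gcd(b, a % b);
}

template <class T>
__host__ __device__ constexpr T least_common_multiple(T x, T y)
{
    return (x / gcd(x, y)) * y; // divide first to reduce overflow risk
}

template <class T, class... Ts>
__host__ __device__ constexpr T least_common_multiple(T x, T y, Ts... zs)
{
    // fold the variadic case onto the binary one
    return least_common_multiple(least_common_multiple(x, y), zs...);
}
```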
src/include/conv_common.hip.hpp (+7 −5)

All three hunks in this file are formatting-only re-wraps; the tokens are unchanged.

```diff
@@ -3,8 +3,7 @@
 // this is ugly, only for 4d
 template <class InDesc, class WeiDesc>
-constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc,
-                                                                   WeiDesc)
+constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDesc)
 {
     constexpr auto in_desc  = InDesc{};
     constexpr auto wei_desc = WeiDesc{};
```

```diff
@@ -34,8 +33,10 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc,
 }
 
 template <class InDesc, class WeiDesc, class LowerPads, class UpperPads>
-constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor(InDesc, WeiDesc, LowerPads, UpperPads)
+constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor(
+    InDesc, WeiDesc, LowerPads, UpperPads)
 {
     constexpr auto in_desc  = InDesc{};
     constexpr auto wei_desc = WeiDesc{};
```

```diff
@@ -113,5 +114,6 @@ constexpr std::size_t calculate_convolution_memory_size(Float, InDesc, WeiDesc,
     constexpr index_t Y = wei_desc.GetLength(I2);
     constexpr index_t X = wei_desc.GetLength(I3);
 
-    return sizeof(Float) * (InDesc::GetElementSpace() + WeiDesc::GetElementSpace() + OutDesc::GetElementSpace());
+    return sizeof(Float) *
+           (InDesc::GetElementSpace() + WeiDesc::GetElementSpace() + OutDesc::GetElementSpace());
 }
```
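For reference, the formula in the last hunk is simply element count times element size, summed over the three tensors. A tiny arithmetic sketch with made-up shapes (not from the commit):

```cpp
// Sketch of calculate_convolution_memory_size's formula with hypothetical
// element counts: size = sizeof(Float) * (in + wei + out element spaces).
#include <cstddef>
#include <cstdio>

int main()
{
    const std::size_t in_space  = 1 * 8 * 6 * 6;  // hypothetical NCHW input
    const std::size_t wei_space = 16 * 8 * 3 * 3; // hypothetical KCYX weights
    const std::size_t out_space = 1 * 16 * 4 * 4; // hypothetical NKHW output

    // (288 + 1152 + 256) * 4 = 6784 bytes for float
    std::printf("%zu bytes\n", sizeof(float) * (in_space + wei_space + out_space));
}
```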
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp (+1 −1)

```diff
@@ -474,7 +474,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
                                       map_out_global2thread,
                                       Number<OutThreadCopyDataPerWrite_W>{});
 #else
-        threadwise_generic_tensor_slice_copy(
+        threadwise_generic_tensor_slice_copy_v1(
             out_10d_thread_desc.ReorderGivenNew2Old(map_out_global2thread),
             p_out_thread,
             make_zero_array<index_t, 10>(),
 ...
```
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp (+1 −1)

```diff
@@ -423,7 +423,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
                                       map_out_global2thread,
                                       Number<OutThreadCopyDataPerWrite_W>{});
 #else
-        threadwise_generic_tensor_slice_copy(
+        threadwise_generic_tensor_slice_copy_v1(
             out_10d_thread_desc.ReorderGivenNew2Old(map_out_global2thread),
             p_out_thread,
             make_zero_array<index_t, 10>(),
 ...
```
src/include/gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp (+9 −8)

```diff
@@ -405,7 +405,8 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
             out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                 k_thread_data_on_global, 0, b_thread_data_on_global, 0);
 
-        threadwise_generic_tensor_slice_copy(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
+        threadwise_generic_tensor_slice_copy_v1(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
                                              p_out_thread,
                                              {0, 0, 0, 0, 0, 0, 0, 0},
                                              out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
 ...
```
src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp (+9 −8)

```diff
@@ -358,7 +358,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
             out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                 k_thread_data_on_global, 0, b_thread_data_on_global, 0);
 
-        threadwise_generic_tensor_slice_copy(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
+        threadwise_generic_tensor_slice_copy_v1(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
                                              p_out_thread,
                                              {0, 0, 0, 0, 0, 0, 0, 0},
                                              out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
 ...
```
src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp (+13 −19)

```diff
@@ -5,7 +5,7 @@
 #include "ConstantMatrixDescriptor.hip.hpp"
 #include "blockwise_generic_tensor_slice_op.hip.hpp"
 #include "blockwise_gemm.hip.hpp"
-#include "threadwise_tensor_slice_op.hip.hpp"
+#include "threadwise_generic_tensor_slice_op.hip.hpp"
 
 // define B = merge(N, Ho, Wo)
 template <index_t GridSize,
 ...
```

```diff
@@ -170,7 +170,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
         // operator for blockwise copy of weight into LDS
         // slice a tensor, and copy it into another tensor
         // this copy operator already have blockwise offset built-in
-#if 0
+#if 1
         auto blockwise_wei_copy =
             BlockwiseGenericTensorSliceCopy_v1<BlockSize,
                                                Float,
 ...
```

```diff
@@ -188,7 +188,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
 #else
         constexpr auto map_k_e_2_e_k = Sequence<1, 0>{};
 
-        const auto blockwise_wei_copy = BlockwiseTensorSliceReorderCopy_v3<
+        auto blockwise_wei_copy = BlockwiseTensorSliceReorderCopy_v3<
             BlockSize,
             Float,
             decltype(wei_e_k_global_desc.ReorderGivenNew2Old(map_k_e_2_e_k)),
 ...
```

```diff
@@ -324,11 +324,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
         Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
 
             blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
-#if 0
-            blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
-#else
             p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
-#endif
 
             __syncthreads();
 ...
```

```diff
@@ -355,11 +351,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
             // even iteration
             blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
-#if 0
-            blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
-#else
             p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
-#endif
 
             __syncthreads();
 ...
```

```diff
@@ -434,14 +426,16 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
             out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                 k_thread_data_on_global, 0, b_thread_data_on_global, 0);
 
-        threadwise_generic_tensor_slice_copy(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
-                                             p_out_thread,
-                                             {0, 0, 0, 0, 0, 0, 0, 0},
-                                             out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
-                                             p_out_thread_on_global,
-                                             {0, 0, 0, 0, 0, 0, 0, 0},
-                                             out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
-                                             arithmetic_sequence_gen<0, 8, 1>::SeqType{});
+        threadwise_generic_tensor_slice_copy_v1(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
+                                                p_out_thread,
+                                                {0, 0, 0, 0, 0, 0, 0, 0},
+                                                out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
+                                                p_out_thread_on_global,
+                                                {0, 0, 0, 0, 0, 0, 0, 0},
+                                                out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
+                                                arithmetic_sequence_gen<0, 8, 1>::SeqType{},
+                                                Number<1>{});
         }
     }
 };
```
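The two `#if 0` hunks above drop the disabled `MoveSlicingWindowOnSourceTensor` call for the weight tensor and keep the direct pointer bump. For a plain (non-merged) descriptor the two are equivalent: moving the window by EPerBlock along dimension 0 changes every element's linear offset by the same `EPerBlock * stride(0)`. A standalone sketch with a hypothetical 2-D strided descriptor (not the repo's class):

```cpp
// Sketch: why advancing the raw pointer by EPerBlock * stride(0) equals
// moving a slicing window by EPerBlock along dim 0 of a non-merged tensor.
#include <cstddef>

struct Strides2d { std::size_t s0, s1; }; // hypothetical E x K descriptor

// linear offset of multi-index (i0, i1) in a 2-D strided tensor
std::size_t offset(Strides2d s, std::size_t i0, std::size_t i1)
{
    return i0 * s.s0 + i1 * s.s1;
}

int main()
{
    const Strides2d wei{128, 1}; // e.g. row-major E x K, K = 128
    const std::size_t EPerBlock = 8;

    // offset(e + EPerBlock, k) - offset(e, k) == EPerBlock * s0 for any
    // (e, k), so a single pointer bump moves the whole window.
    return offset(wei, 10 + EPerBlock, 3) - offset(wei, 10, 3) ==
                   EPerBlock * wei.s0
               ? 0
               : 1;
}
```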
src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp (+10 −9)

```diff
@@ -5,7 +5,7 @@
 #include "ConstantMatrixDescriptor.hip.hpp"
 #include "blockwise_generic_tensor_slice_op.hip.hpp"
 #include "blockwise_gemm.hip.hpp"
-#include "threadwise_tensor_slice_op.hip.hpp"
+#include "threadwise_generic_tensor_slice_op.hip.hpp"
 
 // define B = merge(N, Ho, Wo)
 template <index_t GridSize,
 ...
```

```diff
@@ -327,7 +327,8 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
             out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                 k_thread_data_on_global, 0, b_thread_data_on_global, 0);
 
-        threadwise_generic_tensor_slice_copy(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
+        threadwise_generic_tensor_slice_copy_v1(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
                                              p_out_thread,
                                              {0, 0, 0, 0, 0, 0, 0, 0},
                                              out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
 ...
```
src/include/threadwise_generic_tensor_slice_op.hip.hpp (new file, mode 100644, +99 −0)

```cpp
#pragma once
#include "ConstantTensorDescriptor.hip.hpp"
#include "ConstantMergedTensorDescriptor.hip.hpp"

template <class Float,
          class SrcDesc,
          class DstDesc,
          class SliceLengths,
          class DimAccessOrder,
          index_t DataPerAccess>
__device__ void
threadwise_generic_tensor_slice_copy_v1(SrcDesc,
                                        const Float* __restrict__ p_src,
                                        Array<index_t, SrcDesc::GetNumOfDimension()> src_multi_id_begin,
                                        DstDesc,
                                        Float* __restrict__ p_dst,
                                        Array<index_t, DstDesc::GetNumOfDimension()> dst_multi_id_begin,
                                        SliceLengths,
                                        DimAccessOrder,
                                        Number<DataPerAccess>)
{
    constexpr index_t nDim = SrcDesc::GetNumOfDimension();

    static_assert(nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
                      nDim == SliceLengths::GetSize() && nDim == DimAccessOrder::GetSize(),
                  "wrong! # of dimensions not the same");

    static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid");

#if 0
    // doesn't compile, because merged-tensor reordering is not implemented
    // TODO: implement tensor desc ops for merged-tensor
    constexpr auto src_strides_in_access_order =
        SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number<nDim-1>{});

    constexpr auto dst_strides_in_access_order =
        SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number<nDim-1>{});

    // check src/dst stride on the lowest access dimension
    static_assert((DataPerAccess == 1 || src_strides_in_access_order.Back() == 1) &&
                      (DataPerAccess == 1 || dst_strides_in_access_order.Back() == 1),
                  "wrong! src/dst stride on the lowest access dimension needs to be 1 for "
                  "vectorized read/write");
#endif

    constexpr auto slice_lengths_in_access_order =
        SliceLengths::ReorderGivenNew2Old(DimAccessOrder{});

    // check slice length on the lowest access dimension
    static_assert(slice_lengths_in_access_order.Back() % DataPerAccess == 0,
                  "wrong! slice length on the lowest access dimension should be evenly divided by "
                  "DataPerAccess");

    constexpr index_t num_access_on_lowest_access_dimension =
        slice_lengths_in_access_order.Back() / DataPerAccess;

    constexpr auto access_lengths = slice_lengths_in_access_order.Modify(
        Number<nDim - 1>{}, Number<num_access_on_lowest_access_dimension>{});

    using vector_t = typename vector_type<Float, DataPerAccess>::MemoryType;

#if 1
    ford<decltype(access_lengths)>{}([&](auto access_multi_id) {
        auto data_multi_id_in_access_order = access_multi_id;

        data_multi_id_in_access_order[nDim - 1] = access_multi_id[nDim - 1] * DataPerAccess;

        const auto data_multi_id =
            reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});

        const index_t src_index = SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);

        const index_t dst_index = DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);

        *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
            *reinterpret_cast<const vector_t*>(&p_src[src_index]);
    });
#else
    static_ford<decltype(access_lengths)>{}([&](auto access_multi_id_) {
        const auto access_multi_id = sequence2array(access_multi_id_);

        auto data_multi_id_in_access_order = access_multi_id;

        data_multi_id_in_access_order[nDim - 1] = access_multi_id[nDim - 1] * DataPerAccess;

        const auto data_multi_id =
            reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});

        const index_t src_index = SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);

        const index_t dst_index = DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);

        *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
            *reinterpret_cast<const vector_t*>(&p_src[src_index]);
    });
#endif
}
```
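The core of the new function is the `reinterpret_cast<vector_t*>` read/write: the lowest access dimension is divided into groups of `DataPerAccess` elements, and each group moves as one vector load/store. What that amounts to in standalone form (plain C++, with a hypothetical `Vector4` standing in for `vector_type<Float, 4>::MemoryType`):

```cpp
// Sketch of the vectorized-access idea above: copy 8 floats as two 4-wide
// accesses instead of eight scalar ones. Vector4 is a made-up stand-in.
#include <cstdio>

struct Vector4 { float data[4]; };

void copy_vectorized(const float* __restrict__ src, float* __restrict__ dst, int n)
{
    constexpr int DataPerAccess = 4;
    // Preconditions (the real function encodes them as static_asserts):
    // n divisible by DataPerAccess, and both buffers contiguous and
    // suitably aligned on the lowest access dimension.
    for (int i = 0; i < n / DataPerAccess; ++i)
        reinterpret_cast<Vector4*>(dst)[i] = reinterpret_cast<const Vector4*>(src)[i];
}

int main()
{
    float a[8] = {0, 1, 2, 3, 4, 5, 6, 7}, b[8] = {};
    copy_vectorized(a, b, 8);
    std::printf("%f %f\n", b[0], b[7]); // 0.000000 7.000000
}
```

Note that the stride check that would guarantee the contiguity precondition is still under `#if 0` in the new file, because reordering a merged-tensor descriptor is not implemented yet; callers are trusted to pass `DataPerAccess == 1` otherwise.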
src/include/threadwise_tensor_slice_op.hip.hpp (+0 −55)

The old scalar-only copy is deleted, superseded by `threadwise_generic_tensor_slice_copy_v1` in the new header:

```diff
@@ -192,58 +192,3 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc,
         });
     });
 }
-
-template <class Float,
-          class SrcDesc,
-          class DstDesc,
-          class SliceLengths,
-          class DimAccessOrder>
-__device__ void
-threadwise_generic_tensor_slice_copy(SrcDesc,
-                                     const Float* __restrict__ p_src,
-                                     Array<index_t, SrcDesc::GetNumOfDimension()> src_multi_id_begin,
-                                     DstDesc,
-                                     Float* __restrict__ p_dst,
-                                     Array<index_t, DstDesc::GetNumOfDimension()> dst_multi_id_begin,
-                                     SliceLengths,
-                                     DimAccessOrder)
-{
-    constexpr index_t nDim = SrcDesc::GetNumOfDimension();
-
-    static_assert(nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
-                      nDim == SliceLengths::GetSize() && nDim == DimAccessOrder::GetSize(),
-                  "wrong! # of dimensions not the same");
-
-    static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid");
-
-    constexpr auto slice_lengths_in_access_order =
-        SliceLengths::ReorderGivenNew2Old(DimAccessOrder{});
-
-#if 1
-    ford<decltype(slice_lengths_in_access_order)>{}([&](auto data_multi_id_in_access_order) {
-        const auto data_multi_id =
-            reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
-
-        const index_t src_index = SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
-
-        const index_t dst_index = DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
-
-        p_dst[dst_index] = p_src[src_index];
-    });
-#else
-    static_ford<decltype(slice_lengths_in_access_order)>{}(
-        [&](auto data_multi_id_in_access_order_) {
-            constexpr auto data_multi_id_in_access_order =
-                sequence2array(decltype(data_multi_id_in_access_order_){});
-
-            const auto data_multi_id =
-                reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
-
-            const index_t src_index = SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
-
-            const index_t dst_index = DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
-
-            p_dst[dst_index] = p_src[src_index];
-        });
-#endif
-}
```