gaoqiong / composable_kernel

Commit 724e984b, authored Sep 11, 2019 by Chao Liu

    enabling padding for chwn format

parent ca42e910

Showing 12 changed files with 810 additions and 326 deletions (+810 -326)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp  +169 -200
composable_kernel/include/tensor_description/multi_index_transform.hpp  +56 -9
composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp  +98 -44
composable_kernel/include/tensor_description/tensor_descriptor.hpp  +69 -9
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp  +143 -20
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp  +160 -7
composable_kernel/include/utility/array.hpp  +3 -14
composable_kernel/include/utility/array_helper.hpp  +86 -2
composable_kernel/include/utility/config_nvidia.hpp.in  +9 -2
composable_kernel/include/utility/sequence.hpp  +2 -4
driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp  +1 -1
driver/src/driver.cpp  +14 -14
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp

...
@@ -18,8 +18,8 @@ template <index_t GridSize,
          class InGlobalDesc,
          class WeiGlobalDesc,
          class OutGlobalDesc,
          class LowerPads,
          class UpperPads,
          class LeftPads,
          class RightPads,
          index_t NPerBlock,
          index_t KPerBlock,
          index_t CPerBlock,
...
@@ -60,7 +60,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
    static constexpr auto I10 = Number<10>{};
    static constexpr auto I11 = Number<11>{};

#if 0
    static constexpr auto True  = integral_constant<bool, true>{};
    static constexpr auto False = integral_constant<bool, false>{};

#if 1
    __device__ void Run(const Float* const __restrict__ p_in_global,
                        const Float* const __restrict__ p_wei_global,
                        Float* const __restrict__ p_out_global) const
...
@@ -73,14 +76,22 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
                          GemmNPerThreadSubC % NPerThread == 0)),
                      "wrong!");

        constexpr auto True  = integral_constant<bool, true>{};
        constexpr auto False = integral_constant<bool, false>{};

        constexpr auto in_c_h_w_n_global_desc_old  = InGlobalDesc{};
        constexpr auto wei_c_y_x_k_global_desc_old = WeiGlobalDesc{};
        constexpr auto out_k_h_w_n_global_desc_old = OutGlobalDesc{};

        constexpr auto in_c_h_w_n_global_desc = make_native_tensor_descriptor(
            in_c_h_w_n_global_desc_old.GetLengths(), in_c_h_w_n_global_desc_old.GetStrides());
        constexpr auto in_c_h_w_n_global_desc  = InGlobalDesc{};
        constexpr auto wei_c_y_x_k_global_desc = WeiGlobalDesc{};
        constexpr auto out_k_h_w_n_global_desc = OutGlobalDesc{};

        constexpr auto wei_c_y_x_k_global_desc = make_native_tensor_descriptor(
            wei_c_y_x_k_global_desc_old.GetLengths(), wei_c_y_x_k_global_desc_old.GetStrides());
        constexpr index_t C = in_c_h_w_n_global_desc.GetLength(I0);

        constexpr auto out_k_h_w_n_global_desc = make_native_tensor_descriptor(
            out_k_h_w_n_global_desc_old.GetLengths(), out_k_h_w_n_global_desc_old.GetStrides());

        constexpr index_t C  = in_c_h_w_n_global_desc.GetLength(I0);
        constexpr index_t Hi = in_c_h_w_n_global_desc.GetLength(I1);
        constexpr index_t Wi = in_c_h_w_n_global_desc.GetLength(I2);

        constexpr index_t K  = out_k_h_w_n_global_desc.GetLength(I0);
        constexpr index_t Ho = out_k_h_w_n_global_desc.GetLength(I1);
...
@@ -111,11 +122,22 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
        const index_t wo_block_data_begin = block_work_multi_id[2] * WoPerBlock;
        const index_t n_block_data_begin  = block_work_multi_id[3] * NPerBlock;

        const index_t hi_block_data_begin = ho_block_data_begin;
        const index_t wi_block_data_begin = wo_block_data_begin;
        const index_t hi_block_data_begin = ho_block_data_begin - LeftPads{}[0];
        const index_t wi_block_data_begin = wo_block_data_begin - LeftPads{}[1];

        // input global tensor view
        constexpr auto in_c_hp_wp_n_global_desc = transform_tensor_descriptor(
            in_c_h_w_n_global_desc,
            make_tuple(PassThrough<C>{}, Pad<Sequence<Hi, Wi>, LeftPads, RightPads>{}, PassThrough<N>{}),
            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));

        // global tensor view
        constexpr auto wei_c_k_global_desc = wei_c_y_x_k_global_desc.Extract(I0, I3);
        constexpr auto wei_c_k_global_desc_old = wei_c_y_x_k_global_desc_old.Extract(I0, I3);
        constexpr auto wei_c_k_global_desc = make_native_tensor_descriptor(
            wei_c_k_global_desc_old.GetLengths(), wei_c_k_global_desc_old.GetStrides());

        // LDS tensor view
        //   be careful of alignment
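A note on the padded window arithmetic above: block work is assigned in the padded output space, so the matching input window origin is shifted up/left by the left pads and can be negative for boundary blocks. A minimal standalone sketch with made-up sizes (the numbers are illustrative assumptions, not values from this commit):

    #include <cstdio>

    int main()
    {
        const int left_pad_h   = 1; // plays the role of LeftPads{}[0]
        const int ho_per_block = 2; // plays the role of HoPerBlock

        for(int block_h = 0; block_h < 3; ++block_h)
        {
            const int ho_block_data_begin = block_h * ho_per_block;
            // same shift as in the hunk above; the first block starts at -1
            const int hi_block_data_begin = ho_block_data_begin - left_pad_h;
            std::printf("block %d: ho begin %d -> hi begin %d\n",
                        block_h, ho_block_data_begin, hi_block_data_begin);
        }
    }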
...
@@ -124,122 +146,81 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
            GemmDataPerReadA, GemmDataPerReadB);

        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
        constexpr auto in_c_h_w_n_block_desc_old = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{}, Number<max_align>{});

        // hack
        constexpr auto in_c_h_w_n_block_desc = make_native_tensor_descriptor(
            in_c_h_w_n_block_desc_old.GetLengths(), in_c_h_w_n_block_desc_old.GetStrides());

        // this check is ad-hoc
        // TODO: need to properly implement tensor descriptor with alignment
        static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                      "GemmDataPerReadB alignment requirement is not meet");

        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
        constexpr auto wei_c_k_block_desc_old = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, KPerBlock>{}, Number<max_align>{});

        constexpr auto wei_c_1_1_k_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, 1, 1, KPerBlock>{}, Number<max_align>{});

        constexpr auto wei_c_k_block_desc = make_native_tensor_descriptor(
            wei_c_k_block_desc_old.GetLengths(), wei_c_k_block_desc_old.GetStrides());

        // LDS: be careful of alignment
        constexpr index_t in_block_space  = in_c_h_w_n_block_desc.GetElementSpace();
        constexpr index_t wei_block_space = wei_c_k_block_desc.GetElementSpace();
        constexpr index_t in_block_space  = in_c_h_w_n_block_desc_old.GetElementSpace();
        constexpr index_t wei_block_space = wei_c_k_block_desc_old.GetElementSpace();

        __shared__ Float p_in_block[in_block_space];
        __shared__ Float p_wei_block[wei_block_space];

        // tensor view of threadwise output in register
        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
        constexpr auto out_k_h_w_n_thread_desc_old = make_ConstantTensorDescriptor_packed(
            Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});

#if 1
        constexpr auto out_k_h_w_n_thread_desc = make_native_tensor_descriptor(
            out_k_h_w_n_thread_desc_old.GetLengths(), out_k_h_w_n_thread_desc_old.GetStrides());

        // blockwise input copy
        //   format is [C, Hi, Wi, N]
        auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v2<
            BlockSize, decltype(in_c_h_w_n_global_desc), decltype(in_c_h_w_n_block_desc),
            decltype(in_c_h_w_n_block_desc.GetLengths()), InBlockCopySubLengths_CHWN,
            InBlockCopyClusterLengths_CHWN, Sequence<0, 1, 2, 3>, Sequence<0, 1, 2, 3>,
            Sequence<0, 1, 2, 3>, 3, 3, InBlockCopyDataPerAccess_N,
            InBlockCopyDataPerAccess_N>({0, 0, 0, 0}, {0, 0, 0, 0});
#if 0
            BlockwiseGenericTensorSliceCopy_v2
#else
        auto in_c_h_w_n_global = make_TensorView(in_c_h_w_n_global_desc, p_in_global);
        auto in_c_h_w_n_block  = make_TensorView(in_c_h_w_n_block_desc, p_in_block);

        auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v3<
            BlockSize, decltype(in_c_h_w_n_global), decltype(in_c_h_w_n_block),
            decltype(in_c_h_w_n_block.GetLengths()), InBlockCopySubLengths_CHWN,
            InBlockCopyClusterLengths_CHWN, Sequence<0, 1, 2, 3>, Sequence<0, 1, 2, 3>,
            Sequence<0, 1, 2, 3>, 3, 3, InBlockCopyDataPerAccess_N, InBlockCopyDataPerAccess_N>(
            in_c_h_w_n_global,
            {0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin},
            in_c_h_w_n_block,
            {0, 0, 0, 0});
            BlockwiseGenericTensorSliceCopy_v4
#endif
            <BlockSize, decltype(in_c_hp_wp_n_global_desc), decltype(in_c_h_w_n_block_desc),
             decltype(in_c_h_w_n_block_desc.GetLengths()), InBlockCopySubLengths_CHWN,
             InBlockCopyClusterLengths_CHWN, Sequence<0, 1, 2, 3>, Sequence<0, 1, 2, 3>,
             Sequence<0, 1, 2, 3>, 3, 3, InBlockCopyDataPerAccess_N,
             InBlockCopyDataPerAccess_N>({0, 0, 0, 0}, {0, 0, 0, 0});

#if 1
        // blockwise wei copy
        //   format is [CPerBlock, KPerBlock]
        const auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v2<
            BlockSize, decltype(wei_c_k_global_desc), decltype(wei_c_k_block_desc),
            decltype(wei_c_k_block_desc.GetLengths()), WeiBlockCopySubLengths_CK,
            WeiBlockCopyClusterLengths_CK, Sequence<0, 1>, Sequence<0, 1>, Sequence<0, 1>, 1, 1,
            WeiBlockCopyDataPerAccess_K, WeiBlockCopyDataPerAccess_K>({0, 0}, {0, 0});
#if 0
            BlockwiseGenericTensorSliceCopy_v2
#else
        auto wei_c_y_x_k_global = make_TensorView(wei_c_y_x_k_global_desc, p_wei_global);
        auto wei_c_1_1_k_block  = make_TensorView(wei_c_1_1_k_block_desc, p_wei_block);

        constexpr index_t WeiBlockCopySubLengths_C = WeiBlockCopySubLengths_CK{}[0];
        constexpr index_t WeiBlockCopySubLengths_K = WeiBlockCopySubLengths_CK{}[1];
        using WeiBlockCopySubLengths_CYXK =
            Sequence<WeiBlockCopySubLengths_C, 1, 1, WeiBlockCopySubLengths_K>;

        constexpr index_t WeiBlockCopyClusterLengths_C = WeiBlockCopyClusterLengths_CK{}[0];
        constexpr index_t WeiBlockCopyClusterLengths_K = WeiBlockCopyClusterLengths_CK{}[1];
        using WeiBlockCopyClusterLengths_CYXK =
            Sequence<WeiBlockCopyClusterLengths_C, 1, 1, WeiBlockCopyClusterLengths_K>;

        auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v3<
            BlockSize, decltype(wei_c_y_x_k_global), decltype(wei_c_1_1_k_block),
            decltype(wei_c_1_1_k_block.GetLengths()), WeiBlockCopySubLengths_CYXK,
            WeiBlockCopyClusterLengths_CYXK, Sequence<0, 1, 2, 3>, Sequence<0, 1, 2, 3>,
            Sequence<0, 1, 2, 3>, 3, 3, WeiBlockCopyDataPerAccess_K, WeiBlockCopyDataPerAccess_K>(
            wei_c_y_x_k_global,
            {0, 0, 0, k_block_data_begin},
            wei_c_1_1_k_block,
            {0, 0, 0, 0});
            BlockwiseGenericTensorSliceCopy_v4
#endif
            <BlockSize, decltype(wei_c_k_global_desc), decltype(wei_c_k_block_desc),
             decltype(wei_c_k_block_desc.GetLengths()), WeiBlockCopySubLengths_CK,
             WeiBlockCopyClusterLengths_CK, Sequence<0, 1>, Sequence<0, 1>, Sequence<0, 1>, 1, 1,
             WeiBlockCopyDataPerAccess_K, WeiBlockCopyDataPerAccess_K>({0, 0}, {0, 0});

        // a series of blockwise batched GEMM
        // C_matrix += transpose(A_matrix) * B_matrix
...
@@ -283,7 +264,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
        // register
        // C++ lambda doesn't capture array, use pointer instead
        Float p_out_thread_data[out_k_h_w_n_thread_desc.GetElementSpace()];
        Float p_out_thread_data[out_k_h_w_n_thread_desc_old.GetElementSpace()];
        Float* const p_out_thread = p_out_thread_data;

        // set threadwise output tensor to 0
...
@@ -296,12 +277,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
        {
            const Float* p_in_global_block_offset =
                p_in_global +
                in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(
                    0, hi_block_data_begin + y, wi_block_data_begin + x, n_block_data_begin);
                in_c_h_w_n_global_desc.CalculateOffset(
                    {0, hi_block_data_begin + y, wi_block_data_begin + x, n_block_data_begin});

            const Float* p_wei_global_block_offset =
                p_wei_global +
                wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin);
                wei_c_y_x_k_global_desc.CalculateOffset({0, y, x, k_block_data_begin});

            for(index_t c_block_data_begin = 0; c_block_data_begin < C;
                c_block_data_begin += CPerBlock,
...
@@ -390,46 +371,38 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
            constexpr index_t K2 = GemmMPerThreadSubC;
            constexpr index_t K1 = KPerBlock / KPerThread;

            constexpr auto out_10d_global_desc =
                fwd(out_k_h_w_n_global_desc)
                    .Fold(I3, Number<N1>{}, Number<N2>{})
                    .Fold(I2, Number<W1>{}, Number<W2>{})
                    .Fold(I0, Number<K1>{}, Number<K2>{});
            constexpr auto out_10d_global_desc_old =
                fwd(out_k_h_w_n_global_desc_old)
                    .Fold(I3, Number<N1>{}, Number<N2>{})
                    .Fold(I2, Number<W1>{}, Number<W2>{})
                    .Fold(I0, Number<K1>{}, Number<K2>{});

            constexpr auto out_10d_thread_desc =
                fwd(out_k_h_w_n_thread_desc)
                    .Fold(I3, Number<1>{}, Number<N2>{})
                    .Fold(I2, Number<W1>{}, Number<1>{})
                    .Fold(I0, Number<1>{}, Number<K2>{});

            constexpr auto out_10d_global_desc = make_native_tensor_descriptor(
                out_10d_global_desc_old.GetLengths(), out_10d_global_desc_old.GetStrides());

            Float* p_out_thread_on_global =
                p_out_global + out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
                                   k_block_data_begin + k_thread_data_begin,
                                   ho_block_data_begin + ho_thread_data_begin,
                                   wo_block_data_begin + wo_thread_data_begin,
                                   n_block_data_begin + n_thread_data_begin);

            constexpr auto out_10d_thread_desc_old =
                fwd(out_k_h_w_n_thread_desc_old)
                    .Fold(I3, Number<1>{}, Number<N2>{})
                    .Fold(I2, Number<W1>{}, Number<1>{})
                    .Fold(I0, Number<1>{}, Number<K2>{});

#if 1
            ThreadwiseGenericTensorSliceCopy_v1r2<
                decltype(out_10d_thread_desc), decltype(out_10d_global_desc),
                decltype(out_10d_thread_desc.GetLengths()),
                arithmetic_sequence_gen<0, 10, 1>::type, 9, OutThreadCopyDataPerAccess_N,
                OutThreadCopyDataPerAccess_N>(make_zero_array<index_t, 10>(),
                                              make_zero_array<index_t, 10>())
                .Run(p_out_thread, p_out_thread_on_global);
#elif 0
            ThreadwiseGenericTensorSliceCopy_v1r1<decltype(out_10d_thread_desc),
            constexpr auto out_10d_thread_desc = make_native_tensor_descriptor(
                out_10d_thread_desc_old.GetLengths(), out_10d_thread_desc_old.GetStrides());

            Float* p_out_thread_on_global =
                p_out_global + out_k_h_w_n_global_desc.CalculateOffset(
                                   {k_block_data_begin + k_thread_data_begin,
                                    ho_block_data_begin + ho_thread_data_begin,
                                    wo_block_data_begin + wo_thread_data_begin,
                                    n_block_data_begin + n_thread_data_begin});

            ThreadwiseGenericTensorSliceCopy_v4r2<
                decltype(out_10d_thread_desc), decltype(out_10d_global_desc),
                decltype(out_10d_thread_desc.GetLengths()),
                arithmetic_sequence_gen<0, 10, 1>::type, arithmetic_sequence_gen<0, 10, 1>::type,
                9, 9, OutThreadCopyDataPerAccess_N, OutThreadCopyDataPerAccess_N>(
                make_zero_array<index_t, 10>(), make_zero_array<index_t, 10>())
                .Run(p_out_thread, p_out_thread_on_global);
#endif
        }).Else([&](auto fwd) {
            static_assert(fwd(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
                              GemmNPerThreadSubC % NPerThread == 0,
...
@@ -445,87 +418,47 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
            constexpr index_t K2 = GemmMPerThreadSubC;
            constexpr index_t K1 = KPerBlock / KPerThread;

            constexpr auto out_10d_global_desc =
                fwd(out_k_h_w_n_global_desc)
            constexpr auto out_10d_global_desc_old =
                fwd(out_k_h_w_n_global_desc_old)
                    .Fold(I3, Number<N1>{})
                    .Fold(I2, Number<W1>{}, Number<W2>{}, Number<W3>{})
                    .Fold(I0, Number<K1>{}, Number<K2>{});

            constexpr auto out_10d_thread_desc =
                fwd(out_k_h_w_n_thread_desc)
            constexpr auto out_10d_global_desc = make_native_tensor_descriptor(
                out_10d_global_desc_old.GetLengths(), out_10d_global_desc_old.GetStrides());

            constexpr auto out_10d_thread_desc_old =
                fwd(out_k_h_w_n_thread_desc_old)
                    .Fold(I3, Number<N1>{})
                    .Fold(I2, Number<W1>{}, Number<1>{}, Number<W3>{})
                    .Fold(I0, Number<1>{}, Number<K2>{});

            Float* p_out_thread_on_global =
                p_out_global + out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
                                   k_block_data_begin + k_thread_data_begin,
                                   ho_block_data_begin + ho_thread_data_begin,
                                   wo_block_data_begin + wo_thread_data_begin,
                                   n_block_data_begin + n_thread_data_begin);

            constexpr auto out_10d_thread_desc = make_native_tensor_descriptor(
                out_10d_thread_desc_old.GetLengths(), out_10d_thread_desc_old.GetStrides());

#if 1
            ThreadwiseGenericTensorSliceCopy_v1r2<
                decltype(out_10d_thread_desc), decltype(out_10d_global_desc),
                decltype(out_10d_thread_desc.GetLengths()),
                arithmetic_sequence_gen<0, 10, 1>::type, 9, OutThreadCopyDataPerAccess_N,
                OutThreadCopyDataPerAccess_N>(make_zero_array<index_t, 10>(),
                                              make_zero_array<index_t, 10>())
                .Run(p_out_thread, p_out_thread_on_global);
#elif 0
            ThreadwiseGenericTensorSliceCopy_v1r1<decltype(out_10d_thread_desc),
            Float* p_out_thread_on_global =
                p_out_global + out_k_h_w_n_global_desc.CalculateOffset(
                                   {k_block_data_begin + k_thread_data_begin,
                                    ho_block_data_begin + ho_thread_data_begin,
                                    wo_block_data_begin + wo_thread_data_begin,
                                    n_block_data_begin + n_thread_data_begin});

            ThreadwiseGenericTensorSliceCopy_v4r2<
                decltype(out_10d_thread_desc), decltype(out_10d_global_desc),
                decltype(out_10d_thread_desc.GetLengths()),
                arithmetic_sequence_gen<0, 10, 1>::type, arithmetic_sequence_gen<0, 10, 1>::type,
                9, 9, OutThreadCopyDataPerAccess_N, OutThreadCopyDataPerAccess_N>(
                make_zero_array<index_t, 10>(), make_zero_array<index_t, 10>())
                .Run(p_out_thread, p_out_thread_on_global);
#endif
        });
    }
#else
#elif 0
    __device__ void Run(const Float* const __restrict__ p_in_global,
                        const Float* const __restrict__ p_wei_global,
                        Float* const __restrict__ p_out_global) const
    {
#if 0
        constexpr auto a = make_tuple(true, Sequence<1>{}, index_t(99));

        if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
        {
            printf("[0] %d\n", a.At(I0));
            print_Sequence("[1]", a.At(I1));
            printf("[2] %lu\n", a.At(I2));
        }

        bool flag = true;
        auto b = make_tuple(flag, Sequence<1>{}, 99);
        b.At(I0) = false;

        if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
        {
            printf("[0] %d\n", b.At(I0));
            print_Sequence("[1]", b.At(I1));
            printf("[2] %lu\n", b.At(I2));
            printf("flag %d\n", flag);
        }

        if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
        {
            printf("[0] %d\n", make_tuple(true, Sequence<1>(), index_t(99)).At(I0));
            print_Sequence("[1]", make_tuple(true, Sequence<1>(), index_t(99)).At(I1));
            printf("[2] %d\n", make_tuple(true, Sequence<1>(), index_t(99)).At(I2));
        }
#elif 1
        // create a native tensor descriptor
        constexpr auto in_c_h_w_n_global_desc =
            make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides());
...
@@ -540,11 +473,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
        constexpr auto in_n_c_hp_wp_global_desc = transform_tensor_descriptor(
            in_c_h_w_n_global_desc,
            make_tuple(Pad<Sequence<Hi, Wi>, LowerPads, UpperPads>{}, PassThrough<C>{}, PassThrough<N>{}),
            make_tuple(Pad<Sequence<Hi, Wi>, LeftPads, RightPads>{}, PassThrough<C>{}, PassThrough<N>{}),
            make_tuple(Sequence<1, 2>{}, Sequence<0>{}, Sequence<3>{}),
            make_tuple(Sequence<2, 3>{}, Sequence<1>{}, Sequence<0>{}));

#if 1
        // transformation: {n, c, hp, wp} --> {c, b}
        //   {n, hp, wp} --> {b}, {c} --> {c}
        constexpr auto in_c_b_global_desc = transform_tensor_descriptor(
...
@@ -553,9 +485,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
                       PassThrough<in_n_c_hp_wp_global_desc.GetLength(I1)>{}),
            make_tuple(Sequence<0, 2, 3>{}, Sequence<1>{}),
            make_tuple(Sequence<1>{}, Sequence<0>{}));
#endif

#if 1
        if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
        {
            // 0
...
@@ -577,16 +507,55 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
            printf("in_c_b_global_desc offset: %lu\n", in_c_b_global_desc.CalculateOffset(idx2));
        }
    }
#else
    __device__ void Run(const Float* const __restrict__ p_in_global,
                        const Float* const __restrict__ p_wei_global,
                        Float* const __restrict__ p_out_global) const
    {
        // create a native tensor descriptor
        constexpr auto in_c_h_w_n_global_desc =
            make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides());

        constexpr index_t C  = in_c_h_w_n_global_desc.GetLength(I0);
        constexpr index_t Hi = in_c_h_w_n_global_desc.GetLength(I1);
        constexpr index_t Wi = in_c_h_w_n_global_desc.GetLength(I2);
        constexpr index_t N  = in_c_h_w_n_global_desc.GetLength(I3);

        // transformation: {c, h, w, n} --> {n, c, hp, wp}
        //   {h, w} --> {hp, wp}, {c} --> {c}, {n} --> {n}
        constexpr auto in_c_hp_wp_n_global_desc = transform_tensor_descriptor(
            in_c_h_w_n_global_desc,
            make_tuple(PassThrough<C>{}, Pad<Sequence<Hi, Wi>, LeftPads, RightPads>{}, PassThrough<N>{}),
            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));

        if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
        {
            index_t c = static_cast<index_t>(threadIdx.x);
            index_t h = static_cast<index_t>(threadIdx.y);
            index_t w = static_cast<index_t>(threadIdx.z);

            // 0
            print_tensor_descriptor("in_c_h_w_n_global_desc", in_c_h_w_n_global_desc);
            // 1
            print_tensor_descriptor("in_c_hp_wp_n_global_desc", in_c_hp_wp_n_global_desc);

            p_out_global[0] = in_n_c_h_w_padded_global_desc.CalculateOffset({1, c, h, w});

            constexpr auto idx1 = MultiIndex<4>{1, 2, 3, 4};
            auto idx0           = in_c_hp_wp_n_global_desc.CalculateLowerIndex(idx1);

            print_array("idx1: ", idx1);
            print_array("idx0: ", idx0);

            auto coord1 = make_tensor_coordinate_v2(in_c_hp_wp_n_global_desc, idx1);

            print_array("1: ", coord1.GetIndex());
            print_array("0: ", coord1.GetLowerCoordinate().GetIndex());

            printf("in_c_hp_wp_n_global_desc is_in_pad: %d\n", coord1.IsAnyLevelIndexInPaddingArea());
            printf("in_c_hp_wp_n_global_desc offset: %lu\n",
                   in_c_hp_wp_n_global_desc.CalculateOffset(idx1));
        }
#endif
#endif
    }
#endif
};
...
composable_kernel/include/tensor_description/multi_index_transform.hpp

...
@@ -22,17 +22,27 @@ struct PassThrough
    __host__ __device__ static constexpr auto GetUpperLengths() { return Sequence<Length>{}; }

    __host__ __device__ static constexpr auto CalculateLowerIndex(UpperIndex idx_up)
    __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
    {
        return idx_up;
    }

    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(UpperIndex idx_up_diff)
    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
                                                                      const UpperIndex& /* idx_up_old */,
                                                                      const LowerIndex& /* idx_low_old */)
    {
        return idx_up_diff;
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }

    // TODO: should this function be here? should it be specific for padding check?
    __host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */)
    {
        return false;
    }
};

// LowLengths: Sequence<...>
...
@@ -55,17 +65,39 @@ struct Pad
        return GetLowerLengths() + LeftPads{} + RightPads{};
    }

    __host__ __device__ static constexpr auto CalculateLowerIndex(UpperIndex idx_up)
    __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
    {
        return idx_up - LeftPads{};
    }

    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(UpperIndex idx_up_diff)
    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
                                                                      const UpperIndex& /* idx_up_old */,
                                                                      const LowerIndex& /* idx_low_old */)
    {
        return idx_up_diff;
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }

    // TODO: should this function be here? should it be specific for padding check?
    __host__ __device__ constexpr bool IsUpperIndexInPaddingArea(const UpperIndex& idx_up) const
    {
        bool flag = false;

        static_for<0, nDim, 1>{}([&](auto idim) {
            // only check if there is left-padding
            static_if<(LeftPads::At(idim) != 0)>{}(
                [&](auto) { flag = flag || idx_up[idim] < LeftPads::At(idim); });

            // only check if there is right-padding
            static_if<(RightPads::At(idim) != 0)>{}([&](auto) {
                flag = flag || idx_up[idim] >= LeftPads::At(idim) + LowLengths::At(idim);
            });
        });

        return flag;
    }
};

// LowLengths: Sequence<...>
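Spelling out the boundary test that Pad::IsUpperIndexInPaddingArea performs, for one dimension: an upper index lies in the padding area iff it is below the left pad or at/after left-pad + unpadded length. A small standalone illustration (hypothetical names and sizes, not the library's types):

    #include <cstdio>

    constexpr int kLowLength = 4, kLeftPad = 2, kRightPad = 1; // padded (upper) length is 7

    bool in_padding(int idx_up)
    {
        // same predicate as the Pad transform above, written for a single dimension
        return idx_up < kLeftPad || idx_up >= kLeftPad + kLowLength;
    }

    int main()
    {
        for(int i = 0; i < kLowLength + kLeftPad + kRightPad; ++i)
            std::printf("%d:%d ", i, in_padding(i)); // prints 0:1 1:1 2:0 3:0 4:0 5:0 6:1
        std::printf("\n");
    }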
...
@@ -124,7 +156,7 @@ struct Merge
            .PushBack(Number<1>{});

        // calculate index in each of the dimensions in the order of their dimension
#if 1
#if 1
        // would compile to same ISA?
        static_for<0, nDimLow - 1, 1>{}(
            lambda_CalculateLowerIndex<decltype(pseudo_low_strides)>(itmp, idx_low));
...
@@ -138,8 +170,10 @@ struct Merge
    }

    // idx_low_diff depends on idx_low_old, so idx_low need to be up-to-date
    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
                                                                      const LowerIndex& idx_low_old)
    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
                                                                      const UpperIndex& /* idx_up_old */,
                                                                      const LowerIndex& idx_low_old)
    {
        LowerIndex idx_low_diff;
...
@@ -149,6 +183,13 @@ struct Merge
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return false; }

    // TODO: should this function be here? should it be specific for padding check?
    __host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */)
    {
        return false;
    }
};

// UpLengths: Sequence<...>
...
@@ -189,7 +230,10 @@ struct Unmerge
        return idx_low;
    }

    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff)
    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
                                                                      const UpperIndex& /* idx_up_old */,
                                                                      const LowerIndex& /* idx_low_old */)
    {
        return CalculateLowerIndex(idx_up_diff);
    }
...
@@ -240,7 +284,10 @@ struct Embed
        return idx_low;
    }

    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff)
    __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
                                                                      const UpperIndex& /* idx_up_old */,
                                                                      const LowerIndex& /* idx_low_old */)
    {
        LowerIndex idx_low_diff{0};
...
composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp

...
@@ -3,26 +3,28 @@
#include "common_header.hpp"
#include "dimension.hpp"
#include "dimension_transform.hpp"
#include "multi_index_transform.hpp"
#include "tensor_descriptor.hpp"

namespace ck {

template <class NativeTensorDesc>
template <typename TensorDesc>
struct TensorCoordinate_v2;

template <typename NativeTensorDesc>
struct NativeTensorCoordinate
{
    using type             = NativeTensorCoordinate;
    using tensor_desc_type = NativeTensorDesc;
    using Index            = tensor_desc_type::Index;

    static constexpr index_t nDim = Index::GetSize();
    using type             = NativeTensorCoordinate;
    using tensor_desc_type = NativeTensorDesc;

    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();

    using Index = MultiIndex<nDim>;

    __host__ __device__ constexpr NativeTensorCoordinate(Index idx)
        : mOffset{GetTensorDesriptor().GetOffset(idx)}
        : mIndex(idx), mOffset(tensor_desc_type::CalculateOffset(idx))
    {
    }

    template <class... Xs>
    template <typename... Xs>
    __host__ __device__ constexpr NativeTensorCoordinate(Xs... xs)
        : NativeTensorCoordinate(Index{xs...})
    {
...
@@ -36,82 +38,103 @@ struct NativeTensorCoordinate
    __host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; }

    __host__ __device__ constexpr index_t GetOffset() const { return mOffset; }
    __host__ __device__ constexpr const Index& GetIndex() const { return mIndex; }

    __host__ __device__ constexpr const index_t& GetOffset() const { return mOffset; }

    __host__ __device__ type operator+=(Index idx_diff)
    __host__ __device__ constexpr type operator+=(const Index& idx_diff)
    {
        mOffset += tensor_desc_type::GetOffsetDiff(idx_diff);
        // mIndex is updated here, but some (or all) of its entries may never be used
        mIndex += idx_diff;

        mOffset += tensor_desc_type::CalculateOffset(idx_diff);

        return *this;
    }

    __host__ __device__ type operator-=(Index idx_diff)
    __host__ __device__ constexpr type operator-=(const Index& idx_diff)
    {
        mOffset -= tensor_desc_type::GetOffsetFromMultiIndex(idx_diff);
        // mIndex is updated here, but some (or all) of its entries may never be used
        mIndex -= idx_diff;

        mOffset -= tensor_desc_type::CalculateOffset(idx_diff);

        return *this;
    }

    __host__ __device__ constexpr type operator+(Index idx_diff) const
    __host__ __device__ constexpr type operator+(const Index& idx_diff) const
    {
        type coord = *this;
        coord += idx_diff;
        return coord;
    }

    __host__ __device__ constexpr type operator-(Index idx_diff) const
    __host__ __device__ constexpr type operator-(const Index& idx_diff) const
    {
        type coord = *this;
        coord -= idx_diff;
        return coord;
    }

    // TODO: should this function be here? should it be specific for padding check?
    __host__ __device__ static constexpr bool IsAnyLevelIndexInPaddingArea() { return false; }

    private:
    // mIndex may be saved and update, however, the value of some (or all) of its entries may
    // never be used. Compiler should be able to remove these entries as well as its calculation
    // as dead code.
    // TODO: make sure compiler indeed remove these dead code
    Index mIndex;
    index_t mOffset;
};

template <class TransformedTensorDesc>
template <typename TransformedTensorDesc>
struct TransformedTensorCoordinate
{
    using type             = TransformedTensorCoordinate;
    using tensor_desc_type = TransformedTensorDesc;
    using Index            = tensor_desc_type::UpperIndex;
    using lower_coordinate_type =
        TensorCoordiante_v2<decltype(GetTensorDescriptor().GetLowerTensorDescriptor())>::type;

    static constexpr index_t nDim = Index::GetSize();

    __host__ __device__ constexpr TransformedTensorCoordinate(Index idx)
        : mIndex{idx}, mCoordLow{GetTensorDescriptor().GetLowerIndex(idx)}
    using LowerCoord =
        typename TensorCoordinate_v2<decltype(tensor_desc_type::GetLowerTensorDescriptor())>::type;
    using UpperCoord = TransformedTensorCoordinate;

    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();

    using UpperIndex = MultiIndex<nDim>;

    __host__ __device__ constexpr TransformedTensorCoordinate(UpperIndex idx)
        : mIndexUp{idx}, mCoordLow{tensor_desc_type::CalculateLowerIndex(idx)}
    {
    }

    template <class... Xs>
    template <typename... Xs>
    __host__ __device__ constexpr TransformedTensorCoordinate(Xs... xs)
        : TransformedTensorCoordinate(Index{xs...})
        : TransformedTensorCoordinate(UpperIndex{xs...})
    {
    }

    template <index_t... Xs>
    __host__ __device__ constexpr TransformedTensorCoordinate(Sequence<Xs...>)
        : TransformedTensorCoordinate(Index{Xs...})
        : TransformedTensorCoordinate(UpperIndex{Xs...})
    {
    }

    __host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; }

    __host__ __device__ constexpr index_t GetOffset() const { return mCoordLow.GetOffset(); }
    __host__ __device__ constexpr const LowerCoord& GetLowerCoordinate() const { return mCoordLow; }

    __host__ __device__ constexpr const UpperIndex& GetUpperIndex() const { return mIndexUp; }

    __host__ __device__ constexpr Index GetIndex() const { return mIndex; }
    __host__ __device__ constexpr const UpperIndex& GetIndex() const { return GetUpperIndex(); }

    __host__ __device__ type operator+=(Index idx_up_diff)
    __host__ __device__ constexpr const index_t& GetOffset() const
    {
        return GetLowerCoordinate().GetOffset();
    }

    __host__ __device__ constexpr UpperCoord operator+=(const UpperIndex& idx_up_diff)
    {
        // For transformation of multi-index difference, not all transformation functions need to
        // know the old lower-index or the old upper-index. We pass both of them to the
        // transformation function. The transformation function itself decides to use them or not.
        mCoordLow += tensor_desc_type::GetLowerIndexDiff(idx_up_diff, mIndexUp, mCoordLow.GetIndex());
        mCoordLow += tensor_desc_type::CalculateLowerIndexDiff(
            idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());

        // mIndexUp is updated here, but some (or all) of its entries may never be used
        mIndexUp += idx_up_diff;
...
@@ -119,11 +142,35 @@ struct TransformedTensorCoordinate
        return *this;
    }

    __host__ __device__ constexpr type operator+(Index idx_up_diff) const
    __host__ __device__ constexpr UpperCoord operator-=(const UpperIndex& idx_up_diff)
    {
        type coord = *this;
        coord += idx_diff;
        return coord;
        mCoordLow -= tensor_desc_type::CalculateLowerIndexDiff(
            idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());

        mIndexUp -= idx_up_diff;

        return *this;
    }

    __host__ __device__ constexpr UpperCoord operator+(const UpperIndex& idx_up_diff) const
    {
        UpperCoord coord_up = *this;
        coord_up += idx_up_diff;
        return coord_up;
    }

    __host__ __device__ constexpr UpperCoord operator-(const UpperIndex& idx_up_diff) const
    {
        UpperCoord coord_up = *this;
        coord_up -= idx_up_diff;
        return coord_up;
    }

    // TODO: should this function be here? should it be specific for padding check?
    __host__ __device__ constexpr bool IsAnyLevelIndexInPaddingArea() const
    {
        return tensor_desc_type::IsUpperIndexInPaddingArea(GetIndex()) ||
               mCoordLow.IsAnyLevelIndexInPaddingArea();
    }

    private:
...
@@ -131,22 +178,22 @@ struct TransformedTensorCoordinate
    // never be used. Compiler should be able to remove these entries as well as its calculation
    // as dead code.
    // TODO: make sure compiler indeed remove these dead code
    Index mIndexUp;
    lower_coordinate_type mCoordLow;
    UpperIndex mIndexUp;
    LowerCoord mCoordLow;
};

template <class TensorDesc>
template <typename TensorDesc>
struct TensorCoordinate_v2
{
    private:
    template <class... Ts>
    template <typename... Ts>
    __host__ __device__ static constexpr auto MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
    {
        return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>();
    }

    template <class... Ts>
    template <typename... Ts>
    __host__ __device__ static constexpr auto MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
    {
...
@@ -156,5 +203,12 @@ struct TensorCoordinate_v2
    public:
    using type = decltype(MakeDummyTensorCoordinate(TensorDesc{}));
};

template <typename TensorDesc>
__host__ __device__ constexpr auto
make_tensor_coordinate_v2(TensorDesc, MultiIndex<TensorDesc::GetNumOfDimension()> idx)
{
    return typename TensorCoordinate_v2<TensorDesc>::type(idx);
}

}
#endif
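The coordinate classes above cache both a multi-index and a linear offset, and update the offset incrementally when the index is moved by a difference. A stripped-down sketch of that idea for a plain strided 2-D tensor (hypothetical standalone code, not TensorCoordinate_v2 itself):

    #include <array>
    #include <cstdio>

    struct Coord2d
    {
        std::array<int, 2> strides;
        std::array<int, 2> index;
        int offset;

        Coord2d(std::array<int, 2> s, std::array<int, 2> idx)
            : strides(s), index(idx), offset(idx[0] * s[0] + idx[1] * s[1])
        {
        }

        Coord2d& operator+=(std::array<int, 2> diff)
        {
            index[0] += diff[0];
            index[1] += diff[1];
            // the offset is updated from the diff alone; no full recomputation needed
            offset += diff[0] * strides[0] + diff[1] * strides[1];
            return *this;
        }
    };

    int main()
    {
        Coord2d c({8, 1}, {2, 3}); // row-major 2-D tensor with strides {8, 1}
        c += {1, -2};
        std::printf("index {%d, %d}, offset %d\n", c.index[0], c.index[1], c.offset); // {3, 1}, 25
    }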
composable_kernel/include/tensor_description/tensor_descriptor.hpp

...
@@ -64,6 +64,18 @@ struct NativeTensorDescriptor
        return GetStrides(typename arithmetic_sequence_gen<0, nDim, 1>::type{});
    }

    __host__ __device__ static constexpr index_t GetElementSize()
    {
        return accumulate_on_sequence(GetLengths(), math::multiplies<index_t>{}, Number<1>{});
    }

    __host__ __device__ static constexpr index_t GetElementSpace()
    {
        return accumulate_on_sequence(
            (GetLengths() - Number<1>{}) * GetStrides(), math::plus<index_t>{}, Number<1>{});
    }

    // TODO: this cannot return constepxr because of use of lambda
    __host__ __device__ static constexpr index_t CalculateOffset(const Index& idx)
    {
        index_t offset = 0;
...
@@ -73,6 +85,12 @@ struct NativeTensorDescriptor
        return offset;
    }

    // TODO: remove this
    __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(const Index& idx)
    {
        return CalculateOffset(idx);
    }

    __host__ __device__ static constexpr index_t CalculateOffsetDiff(const Index& idx_diff)
    {
        index_t offset_diff = 0;
...
@@ -100,6 +118,12 @@ struct NativeTensorDescriptor
    {
        return Tuple<>{};
    }

    // TODO: should this function be here? should it be specific for padding check?
    __host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const Index& /* idx */)
    {
        return false;
    }
};

// LowerTensorDescriptor
...
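For reference, the GetElementSpace() added above computes 1 + sum over dimensions of (length - 1) * stride, i.e. one past the offset of the last addressable element. A quick numerical check (the lengths and strides are assumed for the example):

    #include <cstdio>

    int main()
    {
        const int lengths[2] = {2, 3};
        const int strides[2] = {4, 1}; // an aligned (non-packed) layout

        // same formula as GetElementSpace(): 1 + sum_i (length_i - 1) * stride_i
        int space = 1;
        for(int i = 0; i < 2; ++i)
            space += (lengths[i] - 1) * strides[i];

        std::printf("%d\n", space); // 7: the last element sits at offset 6
    }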
@@ -248,6 +272,17 @@ struct TransformedTensorDescriptor
        return GetLengths(Sequence<IDim, IDims...>{});
    }

    __host__ __device__ static constexpr index_t GetElementSize()
    {
        return accumulate_on_sequence(GetLengths(), math::multiplies<index_t>{}, Number<1>{});
    }

    __host__ __device__ static constexpr index_t GetElementSpace()
    {
        // TODO: Is this the correct definition for transformed tensor?
        return GetLowerTensorDescriptor().GetElementSpace();
    }

    // TODO: right now return value is constexpr because use of non-constepxr lambda
    __host__ __device__ static constexpr LowerIndex CalculateLowerIndex(const UpperIndex& idx_up)
    {
...
@@ -256,8 +291,8 @@ struct TransformedTensorDescriptor
        static_for<0, nTransform, 1>{}([&](auto itran) {
            constexpr auto tran = Transforms{}.At(itran);

            auto idx_low_part = pick_array_element(idx_low, LowDimensionIds{}.At(itran));
            const auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran));
            auto idx_low_part      = pick_array_element(idx_low, LowDimensionIds{}.At(itran));

            // this assume each lower (single) index is only assocaited with one transformation,
            // which is required for index transformation, and has been checked during constructor
...
@@ -269,26 +304,29 @@ struct TransformedTensorDescriptor
    }

    // TODO: right now return value is constexpr because use of non-constepxr lambda
    __host__ __device__ static constexpr LowerIndex CalculateLowerIndexDiff(
        const UpperIndex& idx_up_diff, const LowerIndex& idx_low_old)
    __host__ __device__ static constexpr LowerIndex CalculateLowerIndexDiff(
        const UpperIndex& idx_up_diff, const UpperIndex& idx_up_old, const LowerIndex& idx_low_old)
    {
        LowerIndex idx_low_diff;

        static_for<0, nTransform, 1>{}([&](auto itran) {
            constexpr auto tran = Transforms::At(itran);
            constexpr auto tran = Transforms{}.At(itran);

            const auto idx_up_diff_part =
                pick_array_element(idx_up_diff, UpDimensionIds::At(itran));
                pick_array_element(idx_up_diff, UpDimensionIds{}.At(itran));

            auto idx_low_diff_part = pick_array_element(idx_low_diff, LowDimensionIds::At(itran));
            const auto idx_up_old_part = pick_array_element(idx_up_old, UpDimensionIds{}.At(itran));

            const auto idx_low_old_part =
                pick_array_element(idx_low_old, LowDimensionIds::At(itran));
                pick_array_element(idx_low_old, LowDimensionIds{}.At(itran));

            auto idx_low_diff_part = pick_array_element(idx_low_diff, LowDimensionIds{}.At(itran));

            // this assume each lower (single) index is associated with only one transformation,
            // which is required for index transformation, and has been checked during constructor
            // of TransformedTensorDescriptor
            idx_low_diff_part = tran.CalculateLowerIndex(idx_up_diff_part, idx_low_old_part);
            idx_low_diff_part = tran.CalculateLowerIndexDiff(
                to_array(idx_up_diff_part), to_array(idx_up_old_part), to_array(idx_low_old_part));
        });

        return idx_low_diff;
...
@@ -299,6 +337,12 @@ struct TransformedTensorDescriptor
        return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
    }

    // TODO: remove this
    __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(const UpperIndex& idx_up)
    {
        return CalculateOffset(idx_up);
    }

#if 0
    template <index_t IDim>
    __host__ __device__ static constexpr bool IsLinearDimension(Number<IDim>)
...
@@ -321,6 +365,22 @@ struct TransformedTensorDescriptor
        // not implemented
    }
#endif

    // TODO: should this function be here? should it be specific for padding check?
    __host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const UpperIndex& idx_up)
    {
        bool flag = false;

        static_for<0, nTransform, 1>{}([&](auto itran) {
            constexpr auto tran = Transforms{}.At(itran);

            const auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran));

            flag = flag || tran.IsUpperIndexInPaddingArea(to_array(idx_up_part));
        });

        return flag;
    }
};

template <index_t... Lengths, index_t... Strides>
...
@@ -337,7 +397,7 @@ __host__ __device__ constexpr auto make_native_tensor_descriptor_packed(Lengths)
        Lengths::PopFront(), math::multiplies<index_t>{}, Number<1>{})
            .PushBack(Number<1>{});

    return make_NativeTensorDescriptor(Lengths{}, strides);
    return make_native_tensor_descriptor(Lengths{}, strides);
}

template <typename LowTensorDescriptor,
...
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp

...
@@ -7,6 +7,8 @@
#include "tensor_coordinate.hpp"
#include "tensor_view.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_coordinate_v2.hpp"

#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
...
@@ -418,6 +420,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
    }
};

// This version use TensorCoordiante
// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
// memory layout (ordering of dimensions) can be different between src and dst.
template <index_t BlockSize,
...
@@ -518,7 +521,7 @@ struct BlockwiseGenericTensorSliceCopy_v2
    }

    private:
    using RegisterBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));
    using RegisterBufferDesc = decltype(make_native_tensor_descriptor_packed(SubLengths{}));

    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc,
                                                                 RegisterBufferDesc,
...
@@ -544,6 +547,7 @@ struct BlockwiseGenericTensorSliceCopy_v2
    ThreadwiseStore mThreadwiseStore;
};

// this version use TensorView and TensorCoordinate
template <index_t BlockSize,
          class SrcTensor,
          class DstTensor,
...
@@ -639,25 +643,25 @@ struct BlockwiseGenericTensorSliceCopy_v3
    using ThreadBufferDesc   = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));
    using ThreadBufferTensor = NormalTensorView<ThreadBufferDesc, data_type>;

    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v3<
        SrcTensor, ThreadBufferTensor, SubLengths, SrcDimAccessOrder, SrcDimAccessOrder,
        SrcVectorAccessDim, SrcVectorAccessDim, SrcDataPerAccess, 1>;

    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v3<
        ThreadBufferTensor, DstTensor, SubLengths, DstDimAccessOrder, DstDimAccessOrder,
        DstVectorAccessDim, DstVectorAccessDim, 1, DstDataPerAccess>;

    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v3r1<
        SrcTensor, ThreadBufferTensor, SubLengths, SrcDimAccessOrder, SrcDimAccessOrder,
        SrcVectorAccessDim, SrcVectorAccessDim, SrcDataPerAccess, 1>;

    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v3r1<
        ThreadBufferTensor, DstTensor, SubLengths, DstDimAccessOrder, DstDimAccessOrder,
        DstVectorAccessDim, DstVectorAccessDim, 1, DstDataPerAccess>;

    data_type mpBuffer[ThreadBufferDesc::GetElementSpace()];
...
@@ -667,6 +671,125 @@ struct BlockwiseGenericTensorSliceCopy_v3
    ThreadwiseStore mThreadwiseStore;
};

template <index_t BlockSize,
          class SrcDesc,
          class DstDesc,
          class SliceLengths,
          class SubLengths,
          class ThreadClusterLengths,
          class ThreadClusterArrangeOrder,
          class SrcDimAccessOrder,
          class DstDimAccessOrder,
          index_t SrcVectorAccessDim,
          index_t DstVectorAccessDim,
          index_t SrcDataPerAccess,
          index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v4
{
    static constexpr index_t nDim = SrcDesc::GetNumOfDimension();

    using SrcCoord = typename TensorCoordinate_v2<SrcDesc>::type;
    using DstCoord = typename TensorCoordinate_v2<DstDesc>::type;

    __device__ constexpr BlockwiseGenericTensorSliceCopy_v4(SrcCoord src_block_slice_origin,
                                                            DstCoord dst_block_slice_origin)
    {
        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
                          nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::Size() &&
                          nDim == SubLengths::Size() && nDim == ThreadClusterLengths::Size() &&
                          nDim == ThreadClusterArrangeOrder::Size() &&
                          nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
                      "wrong! nDim not consistent");

        static_assert(is_same<SliceLengths, decltype(SubLengths{} * ThreadClusterLengths{})>{},
                      "wrong! threads should be mapped to cover entire slicing window");

        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
            ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));

        static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
                      "wrong! BlockSize not consistent with ThreadClusterLengths");

        const auto thread_cluster_id =
            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());

        const auto data_cluster_id =
            reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});

        const auto thread_data_id_begin = data_cluster_id * SubLengths{};

        mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
        mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>());

        mThreadwiseStore.SetSrcSliceOrigin(make_zero_array<index_t, nDim>());
        mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin);
    }

    __device__ static constexpr index_t GetRegisterBufferSize()
    {
        return RegisterBufferDesc::GetElementSpace();
    }

    template <class TData>
    __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
    {
        mThreadwiseLoad.Run(p_src, p_buffer);
    }

    template <class TData>
    __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
    {
        mThreadwiseStore.Run(p_buffer, p_dst);
    }

    template <class TData>
    __device__ void Run(const TData* p_src, TData* p_dst) const
    {
        TData p_buffer[GetRegisterBufferSize()];

        mThreadwiseLoad.Run(p_src, p_buffer);
        mThreadwiseStore.Run(p_buffer, p_dst);
    }

    template <class T, bool PositiveDirection>
    __device__ void MoveSrcSlicingWindow(T step_sizes,
                                         integral_constant<bool, PositiveDirection> positive_direction)
    {
        mThreadwiseLoad.MoveSrcSlicingWindow(step_sizes, positive_direction);
    }

    template <class T, bool PositiveDirection>
    __device__ void MoveDstSlicingWindow(T step_sizes,
                                         integral_constant<bool, PositiveDirection> positive_direction)
    {
        mThreadwiseStore.MoveDstSlicingWindow(step_sizes, positive_direction);
    }

    private:
    using RegisterBufferDesc = decltype(make_native_tensor_descriptor_packed(SubLengths{}));

    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v4r2<
        SrcDesc, RegisterBufferDesc, SubLengths, SrcDimAccessOrder, SrcVectorAccessDim,
        SrcDataPerAccess, 1>;

    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2<
        RegisterBufferDesc, DstDesc, SubLengths, DstDimAccessOrder, DstVectorAccessDim, 1,
        DstDataPerAccess>;

    ThreadwiseLoad mThreadwiseLoad;
    ThreadwiseStore mThreadwiseStore;
};

}
// namespace ck
#endif
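The BlockwiseGenericTensorSliceCopy_v4 constructor above splits the block's slice among threads: each thread owns a SubLengths-sized tile whose origin is its cluster position times SubLengths. A simplified standalone sketch (row-major cluster, no ThreadClusterArrangeOrder; names and sizes are assumptions for illustration):

    #include <array>
    #include <cstdio>

    constexpr std::array<int, 2> kClusterLengths = {2, 4}; // 8 threads in the block
    constexpr std::array<int, 2> kSubLengths     = {2, 2}; // each thread copies a 2x2 tile

    std::array<int, 2> thread_data_begin(int tid)
    {
        // decompose the linear thread id into a cluster position, then scale by the tile size
        const std::array<int, 2> cluster_id = {tid / kClusterLengths[1], tid % kClusterLengths[1]};
        return {cluster_id[0] * kSubLengths[0], cluster_id[1] * kSubLengths[1]};
    }

    int main()
    {
        // The 8 tiles cover a 4x8 slice (ClusterLengths * SubLengths) with no overlap.
        for(int tid = 0; tid < 8; ++tid)
        {
            const auto b = thread_data_begin(tid);
            std::printf("thread %d -> tile origin {%d, %d}\n", tid, b[0], b[1]);
        }
    }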
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp

...
@@ -6,6 +6,8 @@
#include "ConstantMergedTensorDescriptor.hpp"
#include "tensor_coordinate.hpp"
#include "tensor_view.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_coordinate_v2.hpp"

#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
...
@@ -427,6 +429,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
    Array<index_t, nDim> mDstSliceOrigin;
};

// This version use TensorCoordinate
// This threadwise copy allow vector access of src and dst.
// It allows the dimensions of vector access to be different on src and dst.
// It also allows the vector size to be different on src and dst.
...
@@ -774,6 +777,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
    DstCoordinate mDstSliceOrigin;
};

// this version use TensorView and TensorCoordinate
template <class SrcTensor,
          class DstTensor,
          class SliceLengths,
...
@@ -783,7 +787,7 @@ template <class SrcTensor,
          index_t DstVectorAccessDim,
          index_t SrcDataPerAccess,
          index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v3
struct ThreadwiseGenericTensorSliceCopy_v3r1
{
    static constexpr index_t nDim = SrcTensor::GetNumOfDimension();

    using data_type = remove_cv_t<typename SrcTensor::data_type>;
...
@@ -791,10 +795,10 @@ struct ThreadwiseGenericTensorSliceCopy_v3
    using SrcCoordinate = typename SrcTensor::coordinate_type;
    using DstCoordinate = typename DstTensor::coordinate_type;

    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3(
        SrcTensor src, SrcCoordinate src_slice_origin, DstTensor dst, DstCoordinate dst_slice_origin)
    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3r1(
        SrcTensor src, SrcCoordinate src_slice_origin, DstTensor dst, DstCoordinate dst_slice_origin)
        : mSrc{src}, mDst{dst}, mSrcSlice{src.Slice(src_slice_origin, SliceLengths{})},
...
@@ -821,8 +825,8 @@ struct ThreadwiseGenericTensorSliceCopy_v3
                      "wrong! vectorized access is not allowed");
    }

    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3()
        : ThreadwiseGenericTensorSliceCopy_v3(
    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3r1()
        : ThreadwiseGenericTensorSliceCopy_v3r1(
              SrcTensor{}, SrcCoordinate{}, DstTensor{}, DstCoordinate{})
    {
    }
...
@@ -940,5 +944,154 @@ struct ThreadwiseGenericTensorSliceCopy_v3
    DstSlice mDstSlice;
};

// This version use multi-index transformation
// This threadwise copy allow vector access of src and dst.
// It allows the vector size to be different on src and dst.
// The dimensions of vector access should be the same on src and dst.
// The dimension access order should be the same on src and dst.
// It is designed for cases, where one of src and dst is register, and
// the other is device memory or LDS
template <class SrcDesc,
          class DstDesc,
          class SliceLengths,
          class DimAccessOrder,
          index_t VectorAccessDim,
          index_t SrcDataPerAccess,
          index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v4r2
{
    static constexpr index_t nDim = SliceLengths::Size();

    using Index = MultiIndex<nDim>;

    using SrcCoord = typename TensorCoordinate_v2<SrcDesc>::type;
    using DstCoord = typename TensorCoordinate_v2<DstDesc>::type;

    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2(SrcCoord src_slice_origin,
                                                               DstCoord dst_slice_origin)
        : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
    {
        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
                          nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::Size() &&
                          nDim == DimAccessOrder::Size(),
                      "wrong! # of dimensions not the same");

        static_assert(is_valid_sequence_map<DimAccessOrder>{}, "wrong! map is not valid");

        static_assert(
            SliceLengths{}[VectorAccessDim] % math::lcm(SrcDataPerAccess, DstDataPerAccess) == 0,
            "wrong! cannot evenly divide");

        // TODO:: sanity-check if vectorized memory access is allowed on src and dst
    }

    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2()
        : ThreadwiseGenericTensorSliceCopy_v4r2(make_zero_array<index_t, nDim>(),
                                                make_zero_array<index_t, nDim>())
    {
    }

    __device__ void SetSrcSliceOrigin(SrcCoord src_slice_origin)
    {
        mSrcSliceOrigin = src_slice_origin;
    }

    __device__ void SetDstSliceOrigin(DstCoord dst_slice_origin)
    {
        mDstSliceOrigin = dst_slice_origin;
    }

    template <class TData>
    __device__ void Run(const TData* p_src, TData* p_dst) const
    {
        using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
        using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;

        constexpr auto vector_access_dim = Number<VectorAccessDim>{};

        constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
        constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};

        constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};

        constexpr auto long_vector_access_lengths = SliceLengths::Modify(
            vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);

        ford<decltype(long_vector_access_lengths), DimAccessOrder>{}(
            [&](auto long_vector_access_id) {
                // data id w.r.t slicing-window
                auto long_vector_data_begin_id = long_vector_access_id;
                long_vector_data_begin_id(vector_access_dim) =
                    long_vector_size * long_vector_access_id[vector_access_dim];

                // buffer to hold a long-vector
                TData p_long_vector[long_vector_size];

                // set 0
                for(index_t i = 0; i < long_vector_size; ++i)
                {
                    p_long_vector[i] = 0;
                }

                // load data from src to the long-vector buffer
                for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
                {
                    auto scalar_id               = make_zero_array<index_t, nDim>();
                    scalar_id(vector_access_dim) = i * src_data_per_access;

                    const auto src_coord =
                        mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id);

                    // check for padding
                    // TODO: still kind of messy
                    if(!src_coord.IsAnyLevelIndexInPaddingArea())
                    {
                        const index_t src_offset =
                            (mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id)).GetOffset();

                        const index_t buffer_offset = i * src_data_per_access;

                        *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
                            *reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
                    }
                }

                // store data from the long-vector buffer to dst
                for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
                {
                    auto scalar_id               = make_zero_array<index_t, nDim>();
                    scalar_id(vector_access_dim) = i * dst_data_per_access;

                    const index_t buffer_offset = i * dst_data_per_access;

                    const index_t dst_offset =
                        (mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)).GetOffset();

                    *reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
                        *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
                }
            });
    }

    template <class T, bool PositiveDirection>
    __device__ void MoveSrcSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
    {
        static_if<PositiveDirection>{}([&](auto) { mSrcSliceOrigin += step_sizes; })
            .Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
    }

    template <class T, bool PositiveDirection>
    __device__ void MoveDstSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
    {
        static_if<PositiveDirection>{}([&](auto) { mDstSliceOrigin += step_sizes; })
            .Else([&](auto) { mDstSliceOrigin -= step_sizes; });
    }

    private:
    SrcCoord mSrcSliceOrigin;
    DstCoord mDstSliceOrigin;
};

}
// namespace ck
#endif
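The padding handling in ThreadwiseGenericTensorSliceCopy_v4r2::Run above works by zero-initializing the long-vector buffer, skipping loads whose source coordinate reports IsAnyLevelIndexInPaddingArea(), and storing the buffer unconditionally, so padded positions come out as zeros. A 1-D toy version of that behavior (hypothetical standalone code, not the library's copy class):

    #include <cstdio>

    constexpr int kLen = 4, kLeftPad = 1, kRightPad = 1; // padded length is 6

    void padded_copy(const float* src /* kLen elements */, float* dst /* kLen + pads elements */)
    {
        for(int i_up = 0; i_up < kLen + kLeftPad + kRightPad; ++i_up)
        {
            float buffer = 0; // zero-initialized, like p_long_vector above

            const int i_low       = i_up - kLeftPad;
            const bool in_padding = i_low < 0 || i_low >= kLen;

            if(!in_padding)
            {
                buffer = src[i_low]; // load only when the source index is valid
            }

            dst[i_up] = buffer; // store unconditionally
        }
    }

    int main()
    {
        const float src[kLen] = {1, 2, 3, 4};
        float dst[kLen + kLeftPad + kRightPad];

        padded_copy(src, dst);

        for(float v : dst)
            std::printf("%g ", v); // 0 1 2 3 4 0
        std::printf("\n");
    }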
composable_kernel/include/utility/array.hpp

...
@@ -23,20 +23,9 @@ struct Array
        static_assert(sizeof...(Xs) + 1 == NSize, "wrong! size");
    }

#if 0
    template <typename T>
    __host__ __device__ explicit constexpr Array(const T& x)
    {
        static_assert(T::Size() == NSize, "wrong! size");

        static_for<0, NSize, 1>{}([&](auto i){
            mData[i] = x.At(i);
        })
    }
#endif

    __host__ __device__ static constexpr index_t Size() { return NSize; }

    // TODO: remove
    __host__ __device__ static constexpr index_t GetSize() { return Size(); }

    template <index_t I>
...
@@ -265,8 +254,8 @@ __host__ __device__ constexpr auto extract_array(const Array<TData, NSize>& old_
    return new_array;
}

template <typename F, typename X, typename Y, typename Z>
// emulate constepxr lambda for array
// math
// emulate constepxr lambda for array
template <typename F, typename X, typename Y, typename Z>
struct lambda_array_math
{
    const F& f;
...
composable_kernel/include/utility/array_helper.hpp

...
@@ -5,8 +5,8 @@
namespace ck {

template <typename T, index_t NSize>
__host__ __device__ void print_array(const char* s, Array<T, NSize> a)
template <index_t NSize>
__host__ __device__ void print_array(const char* s, Array<unsigned_t, NSize> a)
{
    constexpr index_t nsize = a.GetSize();
...
@@ -89,5 +89,89 @@ __host__ __device__ void print_array(const char* s, Array<T, NSize> a)
    });
}

template <index_t NSize>
__host__ __device__ void print_array(const char* s, Array<signed_t, NSize> a)
{
    constexpr index_t nsize = a.GetSize();

    static_assert(nsize > 0 && nsize <= 10, "wrong!");

    static_if<nsize == 1>{}([&](auto) { printf("%s size %d, {%d}\n", s, nsize, a[0]); });

    static_if<nsize == 2>{}([&](auto) { printf("%s size %d, {%d %d}\n", s, nsize, a[0], a[1]); });

    static_if<nsize == 3>{}(
        [&](auto) { printf("%s size %d, {%d %d %d}\n", s, nsize, a[0], a[1], a[2]); });

    static_if<nsize == 4>{}(
        [&](auto) { printf("%s size %d, {%d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3]); });

    static_if<nsize == 5>{}([&](auto) {
        printf("%s size %d, {%d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]);
    });

    static_if<nsize == 6>{}([&](auto) {
        printf("%s size %d, {%d %d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]);
    });

    static_if<nsize == 7>{}([&](auto) {
        printf("%s size %d, {%d %d %d %d %d %d %d}\n",
               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6]);
    });

    static_if<nsize == 8>{}([&](auto) {
        printf("%s size %d, {%d %d %d %d %d %d %d %d}\n",
               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
    });

    static_if<nsize == 9>{}([&](auto) {
        printf("%s size %d, {%d %d %d %d %d %d %d %d %d}\n",
               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8]);
    });

    static_if<nsize == 10>{}([&](auto) {
        printf("%s size %d, {%d %d %d %d %d %d %d %d %d %d}\n",
               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9]);
    });
}

}
// namespace ck
#endif
composable_kernel/include/utility/config_nvidia.hpp.in

...
@@ -15,6 +15,15 @@
namespace ck {

using unsigned_t = uint32_t;
using signed_t   = int;

#if 0 // debug
using index_t = unsigned_t;
#else
using index_t = signed_t;
#endif

// For some reason, CUDA need this definition, otherwise
// compiler won't generate optimal load and store instruction, and
// kernel would produce wrong result, indicating the compiler fail to generate correct
...
@@ -22,8 +31,6 @@ namespace ck {
using float2_t = float2;
using float4_t = float4;

using index_t = uint32_t;

template <class T>
__device__ void fused_multiply_accumulate(T& d, const T& s0, const T& s1)
{
...
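The switch to a signed index_t above plausibly matters for padding, although the commit does not say so explicitly: origins such as hi_block_data_begin = ho_block_data_begin - LeftPads{}[0] can be negative, and boundary tests compare against 0. A small snippet showing how an unsigned index would wrap instead (the connection to this #if is an assumption):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const uint32_t u = 0u - 1u; // wraps to 4294967295; any "is this inside the tensor" test misfires
        const int32_t  s = 0 - 1;   // stays -1, so "s < 0" correctly flags the padding area
        std::printf("unsigned: %u, signed: %d\n", u, s);
    }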
composable_kernel/include/utility/sequence.hpp

...
@@ -537,11 +537,9 @@ struct sequence_unique_sort
};

template <typename SeqMap>
struct is_valid_sequence_map
struct is_valid_sequence_map
    : is_same<typename arithmetic_sequence_gen<0, SeqMap::Size(), 1>::type,
              typename sequence_sort<SeqMap, math::less<index_t>>::type>
{
    static constexpr bool value =
        is_same<typename arithmetic_sequence_gen<0, SeqMap::Size(), 1>::type,
                typename sequence_sort<SeqMap, math::less<index_t>>::type>{};
};

template <typename SeqMap>
...
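is_valid_sequence_map<SeqMap> now derives directly from the is_same check: SeqMap is valid when sorting it reproduces arithmetic_sequence_gen<0, SeqMap::Size(), 1>, i.e. the map is a permutation of 0..N-1. The same predicate written as plain runtime C++ for illustration (hypothetical code, not the ck templates):

    #include <algorithm>
    #include <array>
    #include <cstdio>

    template <std::size_t N>
    bool is_valid_map(std::array<int, N> m)
    {
        // a valid dimension map is a permutation of 0..N-1: sorted, it must be 0, 1, ..., N-1
        std::sort(m.begin(), m.end());
        for(std::size_t i = 0; i < N; ++i)
            if(m[i] != static_cast<int>(i))
                return false;
        return true;
    }

    int main()
    {
        std::printf("%d\n", is_valid_map<4>({0, 1, 2, 3})); // 1: identity order
        std::printf("%d\n", is_valid_map<4>({3, 0, 1, 2})); // 1: a valid reordering
        std::printf("%d\n", is_valid_map<4>({0, 0, 1, 2})); // 0: duplicated dimension
    }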
driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp

...
@@ -115,7 +115,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded(InDesc,
    constexpr index_t OutThreadCopyDataPerAccess_N = 4;
#endif

#if 0 // debug
#if 1 // debug
    constexpr index_t GridSize =
        (N / NPerBlock) * (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock);
#else
...
driver/src/driver.cpp

...
@@ -73,25 +73,25 @@ int main(int argc, char* argv[])
    using namespace ck;

#if 1
    constexpr index_t N  = 10;
    constexpr index_t C  = 10;
    constexpr index_t HI = 10;
    constexpr index_t WI = 10;
    constexpr index_t K  = 10;
    constexpr index_t Y  = 1;
    constexpr index_t X  = 1;
    constexpr index_t N  = 32;
    constexpr index_t C  = 8;
    constexpr index_t HI = 2;
    constexpr index_t WI = 2;
    constexpr index_t K  = 128;
    constexpr index_t Y  = 3;
    constexpr index_t X  = 3;

    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

    constexpr index_t HPad = 3;
    constexpr index_t WPad = 3;
    constexpr index_t HPad = 1;
    constexpr index_t WPad = 1;
#elif 1
    // 3x3, 34x34
    constexpr index_t N  = 64;
    constexpr index_t C  = 256;
    constexpr index_t HI = 34;
    constexpr index_t WI = 34;
    constexpr index_t HI = 32;
    constexpr index_t WI = 32;
    constexpr index_t K  = 128;
    constexpr index_t Y  = 3;
    constexpr index_t X  = 3;
...
@@ -99,8 +99,8 @@ int main(int argc, char* argv[])
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

    constexpr index_t HPad = 0;
    constexpr index_t WPad = 0;
    constexpr index_t HPad = 1;
    constexpr index_t WPad = 1;
#elif 0
    // 1x1 filter, 8x8 image
    // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
...
@@ -434,7 +434,7 @@ int main(int argc, char* argv[])
    if(do_verification)
    {
#if 1
#if 0
        if(Y == 3 && X == 3 && ConvStrides{}[0] == 1 && ConvStrides{}[1] == 1 &&
           ConvDilations{}[0] == 1 && ConvDilations{}[1] == 1)
        {
...