gaoqiong/composable_kernel · commit a97298a4
Authored Aug 22, 2023 by aska-0096
Parent: cc0ffeb7

Functional work

Showing 7 changed files with 123 additions and 51 deletions (+123 -51)
example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp                                        +16 -10
example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp                                       +1  -1
example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc   +4  -4
include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp                               +12 -12
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp            +8  -2
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp                  +6  -1
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp            +76 -21
example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp

@@ -12,6 +12,7 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/fill.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"

@@ -83,30 +84,30 @@ using DeviceOpInstance =
         1,
         128,
         64,
-        128,
+        64,
         64,
-        8,
+        4,
         16,
         16,
-        1,
+        2,
         4,
         S<4, 32, 1>,
         S<1, 0, 2>,
         S<1, 0, 2>,
         2,
-        4,
-        4,
-        true,
+        8,
+        8,
+        false,
         S<4, 32, 1>,
         S<1, 0, 2>,
         S<1, 0, 2>,
         2,
-        4,
-        4,
-        true,
+        8,
+        8,
+        false,
         1,
         1,
-        S<1, 64, 1, 2>,
+        S<1, 32, 1, 4>,
         8>;

 int main(int argc, char* argv[])

@@ -208,6 +209,11 @@ int main(int argc, char* argv[])
         b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
         d_m_n.GenerateTensorValue(GeneratorTensor_2<DDataType>{-5, 5});
         break;
+    case 2:
+        ck::utils::FillUniformDistributionIntegerValue<ADataType>{1.f, 1.f}(a_m_k);
+        ck::utils::FillUniformDistributionIntegerValue<BDataType>{1.f, 1.f}(b_k_n);
+        ck::utils::FillUniformDistributionIntegerValue<DDataType>{1.f, 1.f}(d_m_n);
+        break;
    default:
         a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
         b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
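For context only, not part of the commit: the new "case 2:" branch initializes A, B and D with the constant value 1 through FillUniformDistributionIntegerValue{1.f, 1.f}. A minimal standalone sketch (plain host C++, hypothetical sizes, not CK code) of why a constant integer fill is convenient when debugging a GEMM example: with all-ones inputs every element of C = A*B equals K exactly, even in fp16, so any mismatch points at an indexing or tiling bug rather than at rounding.

    // Standalone illustration: all-ones inputs make the GEMM result exact.
    #include <cstdio>
    #include <vector>

    int main()
    {
        const int M = 4, N = 4, K = 64;
        std::vector<float> a(M * K, 1.0f), b(K * N, 1.0f), c(M * N, 0.0f);

        for(int m = 0; m < M; ++m)
            for(int n = 0; n < N; ++n)
                for(int k = 0; k < K; ++k)
                    c[m * N + n] += a[m * K + k] * b[k * N + n];

        // every element should print exactly K (= 64)
        printf("c[0] = %.1f (expected %d)\n", c[0], K);
        return 0;
    }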
example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp

@@ -39,7 +39,7 @@ using S = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

 static constexpr auto ConvSpec =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;

 static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc

@@ -54,13 +54,13 @@ using DeviceConvFwdInstance =
         1,           // Prefetch stage
         128,         // BlockSize
         64,          // MPerBlock
-        64,          // NPerBlock
+        128,         // NPerBlock
         64,          // KPerBlock
         8,           // K1
         16,          // MPerWMMA
         16,          // NPerWMMA
-        4,           // MRepeat
-        1,           // NRepeat
+        2,           // MRepeat
+        4,           // NRepeat
         S<4, 32, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
         S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
         S<1, 0, 2>,  // ABlockTransferSrcAccessOrder

@@ -77,7 +77,7 @@ using DeviceConvFwdInstance =
         true,        // BBlockLdsExtraN
         1,
         1,
-        S<1, 16, 1, 8>,
+        S<1, 32, 1, 4>,
         8>;

 template <ck::index_t NDimSpatial>
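Aside, not from the diff: if the block tile follows the usual CK-style decomposition MPerBlock = MRepeat x MWaves x MPerWMMA and NPerBlock = NRepeat x NWaves x NPerWMMA (an assumption on my part), the new instance values still account for every thread of the 128-thread block under wave32. A small standalone check, plain host C++:

    // Sanity check of the assumed block-tile decomposition with the new values.
    #include <cstdio>

    int main()
    {
        constexpr int BlockSize = 128, WaveSize = 32;
        constexpr int MPerBlock = 64, NPerBlock = 128;
        constexpr int MPerWmma  = 16, NPerWmma  = 16;
        constexpr int MRepeat   = 2,  NRepeat   = 4;

        constexpr int MWaves = MPerBlock / (MRepeat * MPerWmma); // 2
        constexpr int NWaves = NPerBlock / (NRepeat * NPerWmma); // 2

        static_assert(MWaves * NWaves * WaveSize == BlockSize,
                      "wave grid must account for every thread in the block");

        printf("MWaves=%d NWaves=%d -> %d waves of %d threads\n",
               MWaves, NWaves, MWaves * NWaves, WaveSize);
        return 0;
    }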
include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp

@@ -362,18 +362,18 @@ struct BlockwiseGemmWMMA
         }
         else
         {
-            static_for<0, NRepeat, 1>{}([&](auto n0) {
-                static_for<0, MRepeat, 1>{}([&](auto m0) {
-                    static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of
-                                                                        // k=0,kpack*1, ..
-                        // read B
-                        b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1,
-                                           make_tuple(Number<k * WmmaK / B_K1 / B_KRow>{}, n0, I0, I0, I0, I0),
-                                           b_block_buf,
-                                           b_thread_desc_,
-                                           make_tuple(I0, n0, I0, I0, I0, I0),
-                                           b_thread_buf);
+            static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of
+                                                                // k=0,kpack*1, ..
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    // read B
+                    b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1,
+                                       make_tuple(Number<k * WmmaK / B_K1 / B_KRow>{}, n0, I0, I0, I0, I0),
+                                       b_block_buf,
+                                       b_thread_desc_,
+                                       make_tuple(I0, n0, I0, I0, I0, I0),
+                                       b_thread_buf);
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
                         // read A
                         a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1,
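A toy counter model of the reordering above, based on my reading of the hunk (the reconstructed nesting is not authoritative): moving the B-fragment read out of the MRepeat loop means each B tile is pulled from LDS once per (k, n0) instead of once per (n0, m0, k). Standalone host C++ with the example's repeat counts:

    // Counts one "read" per loop-body visit under the two loop orders.
    #include <cstdio>

    int main()
    {
        constexpr int MRepeat = 2, NRepeat = 4, KLoops = 4; // KLoops ~ KPerBlock / WmmaK

        int b_reads_before = 0, b_reads_after = 0;

        // before: for n0 { for m0 { for k { read B; read A; } } }
        for(int n0 = 0; n0 < NRepeat; ++n0)
            for(int m0 = 0; m0 < MRepeat; ++m0)
                for(int k = 0; k < KLoops; ++k)
                    ++b_reads_before;

        // after: for k { for n0 { read B; for m0 { read A; } } }
        for(int k = 0; k < KLoops; ++k)
            for(int n0 = 0; n0 < NRepeat; ++n0)
                ++b_reads_after;

        printf("B fragment reads: before=%d after=%d\n", b_reads_before, b_reads_after);
        return 0;
    }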
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp

@@ -8,6 +8,7 @@
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_description/cluster_descriptor.hpp"
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"

 namespace ck {

@@ -48,6 +49,9 @@ struct ThreadGroupTensorSliceTransfer_v4r1
     static constexpr auto thread_slice_lengths = BlockSliceLengths{} / ThreadClusterLengths{};

+    static constexpr auto src_scalar_per_access = generate_sequence(
+        detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
+
     using Index = MultiIndex<nDim>;

     __device__ constexpr ThreadGroupTensorSliceTransfer_v4r1(

@@ -85,7 +89,8 @@ struct ThreadGroupTensorSliceTransfer_v4r1
         const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex(
             make_multi_index(ThreadGroup::GetThreadId()));

-        const auto thread_data_idx_begin = thread_cluster_idx * thread_slice_lengths;
+        // This line result in non-packed reading.
+        const auto thread_data_idx_begin = thread_cluster_idx * src_scalar_per_access;

         threadwise_transfer_.SetSrcSliceOrigin(src_desc,
                                                src_block_slice_origin + thread_data_idx_begin);

@@ -152,7 +157,8 @@ struct ThreadGroupTensorSliceTransfer_v4r1
         make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});

     using ThreadwiseTransfer =
-        ThreadwiseTensorSliceTransfer_v3r1<decltype(thread_slice_lengths),
+        ThreadwiseTensorSliceTransfer_v3r1<ThreadClusterLengths,
+                                           decltype(thread_slice_lengths),
                                            SrcElementwiseOperation,
                                            DstElementwiseOperation,
                                            DstInMemOp,
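Standalone 1-D sketch, not CK code, of the two slice-origin conventions this hunk switches between. With the old origin (thread_cluster_idx * thread_slice_lengths) each thread walks a private, packed sub-slice; with the new origin (thread_cluster_idx * src_scalar_per_access), combined with the enlarged per-access steps added in threadwise_tensor_slice_transfer_v3r1.hpp, the threads of a cluster interleave their vector accesses. All sizes below are hypothetical:

    // Prints the element offsets a thread touches under both conventions.
    #include <cstdio>

    int main()
    {
        constexpr int cluster_size      = 4;  // threads cooperating on one dimension
        constexpr int scalar_per_vector = 2;  // elements moved per access
        constexpr int slice_per_thread  = 8;  // elements each thread moves in total
        constexpr int accesses          = slice_per_thread / scalar_per_vector;

        for(int tid = 0; tid < cluster_size; ++tid)
        {
            printf("tid %d packed     :", tid);
            for(int a = 0; a < accesses; ++a)
                printf(" %2d", tid * slice_per_thread + a * scalar_per_vector);

            printf("   interleaved:");
            for(int a = 0; a < accesses; ++a)
                printf(" %2d", tid * scalar_per_vector + a * cluster_size * scalar_per_vector);
            printf("\n");
        }
        return 0;
    }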
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp

@@ -1136,7 +1136,12 @@ struct ThreadwiseTensorSliceTransfer_v4
         auto src_data_coord = src_ref_coord_;

         move_tensor_coordinate(src_desc, src_data_coord, src_ref_to_data_disp_coord_step);
+#if 0
+        printf("Tid: %03d, LDS read bank: %ld, inele offset: %d, coord: (%d, %d, %d, %d, %d, %d)\n", get_thread_local_1d_id(),
+               (src_data_coord.GetOffset()*sizeof(SrcData)/4 )%32, src_data_coord.GetOffset(),
+               src_data_coord.GetIndex().At(Number<0>{}), src_data_coord.GetIndex().At(Number<1>{}), src_data_coord.GetIndex().At(Number<2>{}),
+               src_data_coord.GetIndex().At(Number<3>{}), src_data_coord.GetIndex().At(Number<4>{}), src_data_coord.GetIndex().At(Number<5>{}));
+#endif

         vector_type_maker_t<SrcData, SrcScalarPerVector> src_tmp_vector;

         using src_vector_t = typename decltype(src_tmp_vector)::type;
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp

@@ -49,7 +49,8 @@ struct lambda_scalar_per_access_for_src_and_dst
 // 2. SrcBuffer and DstBuffer are DynamicBuffer
 // 3. src_slice_origin and dst_slice_origin are not known at compile-time,
 // 4. Use thread buffer
-template <typename SliceLengths,
+template <typename ThreadClusterLengths,
+          typename SliceLengths,
           typename SrcElementwiseOperation,
           typename DstElementwiseOperation,
           InMemoryDataOperationEnum DstInMemOp,

@@ -134,13 +135,15 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         constexpr auto ordered_src_access_lengths =
             container_reorder_given_new2old(src_access_lengths, src_dim_access_order);

+        constexpr auto src_move_per_access = ThreadClusterLengths{} * src_scalar_per_access;
+
         // make forward steps
         const auto src_forward_steps = generate_tuple(
             [&](auto i) {
                 Index forward_step_idx;

                 static_for<0, nDim, 1>{}([&](auto j) {
-                    forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0;
+                    // Move on block-wise instead of thread-wise
+                    forward_step_idx(j) = (i.value == j.value) ? src_move_per_access[i] : 0;
                 });

                 return make_tensor_coordinate_step(src_desc, forward_step_idx);

@@ -153,7 +156,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                 Index backward_step_idx;

                 static_for<0, nDim, 1>{}([&](auto j) {
-                    backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0;
+                    backward_step_idx(j) = (i.value == j.value) ? -src_move_per_access[i] : 0;
                 });

                 return make_tensor_coordinate_step(src_desc, backward_step_idx);

@@ -194,7 +197,11 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                 return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
                        src_scalar_per_access;
             }();
+#if 0
+            printf("Tid: %03d, global buf offset inbyte: %ld, inele offset: %d, coord: (%d, %d, %d)\n", get_thread_local_1d_id(),
+                   src_coord_.GetOffset()*sizeof(DstData), src_coord_.GetOffset(),
+                   src_coord_.GetIndex().At(Number<0>{}), src_coord_.GetIndex().At(Number<1>{}), src_coord_.GetIndex().At(Number<2>{}));
+#endif

             constexpr auto src_data_idx_seq = generate_sequence_v2(
                 [&](auto i) { return Number<src_data_idx[i]>{}; }, Number<src_data_idx.Size()>{});

@@ -369,6 +376,8 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         constexpr auto dst_scalar_per_access = generate_sequence(
             detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});

+        constexpr auto dst_move_per_access = ThreadClusterLengths{} * dst_scalar_per_access;
+
         constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;

         constexpr auto dst_dim_access_order = DstDimAccessOrder{};

@@ -382,7 +391,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                 Index forward_step_idx;

                 static_for<0, nDim, 1>{}([&](auto j) {
-                    forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
+                    forward_step_idx(j) = (i.value == j.value) ? dst_move_per_access[i] : 0;
                 });

                 return make_tensor_coordinate_step(dst_desc, forward_step_idx);

@@ -395,7 +404,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                 Index backward_step_idx;

                 static_for<0, nDim, 1>{}([&](auto j) {
-                    backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
+                    backward_step_idx(j) = (i.value == j.value) ? -dst_move_per_access[i] : 0;
                 });

                 return make_tensor_coordinate_step(dst_desc, backward_step_idx);

@@ -439,7 +448,11 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             constexpr auto dst_data_idx_seq = generate_sequence_v2(
                 [&](auto i) { return Number<dst_data_idx[i]>{}; }, Number<dst_data_idx.Size()>{});
+#if 0
+            printf("Tid: %03d, LDS write bank: %ld, inele offset: %04d, coord (%d, %d, %d)\n", get_thread_local_1d_id(),
+                   (dst_coord_.GetOffset()*sizeof(DstData)/4 )%32, dst_coord_.GetOffset(),
+                   dst_coord_.GetIndex().At(Number<0>{}), dst_coord_.GetIndex().At(Number<1>{}), dst_coord_.GetIndex().At(Number<2>{}));
+#endif

             const bool is_dst_valid =
                 coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);

@@ -505,7 +518,16 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         {
             const auto dst_reset_step =
                 make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep());
+#if 0
+            const auto dst_reset_idx = GetDstCoordinateResetStep();
+            if (get_thread_local_1d_id()==0)
+            {
+                printf("dst_reset_step: %d, %d, %d\n",
+                       dst_reset_idx.At(Number<0>{}),
+                       dst_reset_idx.At(Number<1>{}),
+                       dst_reset_idx.At(Number<2>{}));
+            }
+#endif

             move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
         }

@@ -517,12 +539,17 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         constexpr auto src_scalar_per_access = generate_sequence(
             detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});

-        constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
+        constexpr auto src_access_unit = SliceLengths{} / src_scalar_per_access;
+
+        constexpr auto src_access_unit_helper = generate_sequence(
+            detail::lambda_scalar_per_access<SrcVectorDim, 1>{}, Number<nDim>{});
+
+        constexpr auto src_access_stride = ThreadClusterLengths{} * (src_access_unit - src_access_unit_helper);

         constexpr auto src_dim_access_order = SrcDimAccessOrder{};

-        constexpr auto ordered_src_access_lengths =
-            container_reorder_given_new2old(src_access_lengths, src_dim_access_order);
+        constexpr auto ordered_src_access_stride =
+            container_reorder_given_new2old(src_access_stride, src_dim_access_order);

         // judge move forward or move backward during the last iteration
         constexpr auto forward_sweep = [&]() {

@@ -531,10 +558,10 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             forward_sweep_(I0) = true;

             static_for<1, nDim, 1>{}([&](auto i) {
-                index_t tmp = ordered_src_access_lengths[I0] - 1;
+                index_t tmp = ordered_src_access_stride[I0] - 1;

                 static_for<1, i, 1>{}([&](auto j) {
-                    tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
+                    tmp = tmp * ordered_src_access_stride[j] + ordered_src_access_stride[j] - 1;
                 });

                 forward_sweep_(i) = tmp % 2 == 0;

@@ -549,7 +576,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             Index ordered_idx;

             static_for<0, nDim, 1>{}([&](auto i) {
-                ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0;
+                ordered_idx(i) = forward_sweep[i] ? ordered_src_access_stride[i] : 0;
             });

             return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *

@@ -564,7 +591,18 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             return reset_src_data_step_;
         }();
+#if 0
+        if (get_thread_local_1d_id()==0)
+        {
+            printf("ordered_src_access_stride: %d, %d, %d | src_data_idx: %d, %d, %d\n",
+                   ordered_src_access_stride.At(Number<0>{}).value,
+                   ordered_src_access_stride.At(Number<1>{}).value,
+                   ordered_src_access_stride.At(Number<2>{}).value,
+                   src_data_idx.At(Number<0>{}),
+                   src_data_idx.At(Number<1>{}),
+                   src_data_idx.At(Number<2>{}));
+        }
+#endif

         return reset_src_data_step;
     }

@@ -574,13 +612,18 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         // TODO: don't use lambda_scalar_per_access
         constexpr auto dst_scalar_per_access = generate_sequence(
             detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});

-        constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
+        constexpr auto dst_access_unit = SliceLengths{} / dst_scalar_per_access;
+
+        constexpr auto dst_access_unit_helper = generate_sequence(
+            detail::lambda_scalar_per_access<DstVectorDim, 1>{}, Number<nDim>{});
+
+        constexpr auto dst_access_strides = ThreadClusterLengths{} * (dst_access_unit - dst_access_unit_helper);

         constexpr auto dst_dim_access_order = DstDimAccessOrder{};

-        constexpr auto ordered_dst_access_lengths =
-            container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order);
+        constexpr auto ordered_dst_access_strides =
+            container_reorder_given_new2old(dst_access_strides, dst_dim_access_order);

         // judge move forward or move backward during the last iteration
         constexpr auto forward_sweep = [&]() {

@@ -589,10 +632,10 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             forward_sweep_(I0) = true;

             static_for<1, nDim, 1>{}([&](auto i) {
-                index_t tmp = ordered_dst_access_lengths[I0] - 1;
+                index_t tmp = ordered_dst_access_strides[I0] - 1;

                 static_for<1, i, 1>{}([&](auto j) {
-                    tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
+                    tmp = tmp * ordered_dst_access_strides[j] + ordered_dst_access_strides[j] - 1;
                 });

                 forward_sweep_(i) = tmp % 2 == 0;

@@ -607,7 +650,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             Index ordered_idx;

             static_for<0, nDim, 1>{}([&](auto i) {
-                ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0;
+                ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_strides[i] : 0;
             });

             return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) *

@@ -637,6 +680,18 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         // is it OK to construct a new step every time?
         const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx);
+#if 0
+        if (get_thread_local_1d_id()==0)
+        {
+            printf("InputSrcStep: %d, %d, %d | MoveSrcSliceWindowStep: %d, %d, %d\n",
+                   src_slice_origin_step_idx.At(Number<0>{}),
+                   src_slice_origin_step_idx.At(Number<1>{}),
+                   src_slice_origin_step_idx.At(Number<2>{}),
+                   adjusted_step_idx.At(Number<0>{}),
+                   adjusted_step_idx.At(Number<1>{}),
+                   adjusted_step_idx.At(Number<2>{}));
+        }
+#endif

         move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
     }
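A numeric cross-check of my reading of the new reset-step arithmetic above: when every forward step advances by ThreadClusterLength x ScalarPerVector instead of ScalarPerVector, the distance from the last access of a sweep back to the origin becomes ScalarPerVector x ThreadClusterLength x (SliceLength / ScalarPerVector - 1), which appears to be what ThreadClusterLengths{} * (src_access_unit - src_access_unit_helper) contributes on the vector dimension once it is scaled by src_scalar_per_access. Standalone host C++, hypothetical sizes:

    // Walks one sweep with block-wise steps and compares against the closed form.
    #include <cassert>
    #include <cstdio>

    int main()
    {
        constexpr int slice_len         = 8; // per-thread slice length on the vector dim
        constexpr int scalar_per_vector = 2;
        constexpr int cluster_len       = 4;
        constexpr int num_accesses      = slice_len / scalar_per_vector;

        int offset = 0;
        for(int a = 0; a + 1 < num_accesses; ++a)       // all forward moves of one sweep
            offset += cluster_len * scalar_per_vector;  // block-wise step, not thread-wise

        int expected = scalar_per_vector * cluster_len * (num_accesses - 1);
        assert(offset == expected);
        printf("offset after last access = %d, reset distance = %d\n", offset, expected);
        return 0;
    }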