gaoqiong / composable_kernel · Commits

Commit 9cc63115
Authored Aug 25, 2023 by aska-0096

    sanity

Parent: 5cfe67b1
Showing 6 changed files with 118 additions and 63 deletions (+118 −63).
example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp                               +2 −2
include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp                      +2 −2
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp  +8 −2
include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp    +1 −20
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp        +7 −15
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp   +98 −22
example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp

```diff
@@ -91,14 +91,14 @@ using DeviceOpInstance =
     16,
     2,
     4,
-    S<4, 32, 1>,
+    S<8, 16, 1>,
     S<1, 0, 2>,
     S<1, 0, 2>,
     2,
     8,
     8,
     false,
-    S<4, 32, 1>,
+    S<8, 16, 1>,
     S<1, 0, 2>,
     S<1, 0, 2>,
     2,
```
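Both changed arguments swap a thread-cluster shape of S<4, 32, 1> for S<8, 16, 1>. Assuming these are the A- and B-side block-transfer thread-cluster lengths over [K0, M-or-N, K1] (as in other CK WMMA examples), both shapes still describe 4 * 32 = 8 * 16 = 128 threads; only the split between the K0 and M/N dimensions changes, and with it the per-thread slice. A minimal host-side sketch, using a hypothetical 8 x 128 x 8 block tile:

```cpp
// Sketch only: the 8 x 128 x 8 tile is a hypothetical example, not a value from
// this file. thread_slice_lengths = BlockSliceLengths{} / ThreadClusterLengths{}.
#include <cstdio>

int main()
{
    const int block_slice[3] = {8, 128, 8}; // hypothetical [K0, M, K1] tile
    const int clusters[2][3] = {{4, 32, 1}, {8, 16, 1}};
    const char* names[2]     = {"S<4, 32, 1>", "S<8, 16, 1>"};

    for(int c = 0; c < 2; ++c)
    {
        const int* cl = clusters[c];
        std::printf("%s: %d threads, per-thread slice %d x %d x %d\n",
                    names[c],
                    cl[0] * cl[1] * cl[2],
                    block_slice[0] / cl[0],
                    block_slice[1] / cl[1],
                    block_slice[2] / cl[2]);
    }
    return 0;
}
```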
include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp

```diff
@@ -67,8 +67,8 @@ struct BlockwiseGemmWMMA
     // When use LDS, each Row(16 consecutive lanes) read whole data from source buffer
     // When not use LDS, each Row read half of whole data from source buffer, exchange the data via
     // permutation
-    static constexpr index_t A_KRow = AEnableLds ? 2 : 2;
-    static constexpr index_t B_KRow = BEnableLds ? 2 : 2;
+    static constexpr index_t A_KRow = 2;
+    static constexpr index_t B_KRow = 2;
     static constexpr index_t A_K1   = ABlockDesc{}.GetLength(I5);
     static constexpr index_t B_K1   = BBlockDesc{}.GetLength(I5);
```
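Both branches of the removed ternaries were already 2, so A_KRow and B_KRow keep their values; the edit only drops the unused dependence on AEnableLds/BEnableLds. A standalone compile-time sketch of that equivalence, with a stand-in index_t:

```cpp
// Standalone sketch: the old and new initializers produce the same constant for
// either value of the flag, so the change is behavior-preserving.
using index_t = int;

template <bool EnableLds>
constexpr index_t krow_old = EnableLds ? 2 : 2; // removed form
constexpr index_t krow_new = 2;                 // new form

static_assert(krow_old<true> == krow_new, "same constant when LDS is enabled");
static_assert(krow_old<false> == krow_new, "same constant when LDS is disabled");

int main() { return 0; }
```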
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp

```diff
@@ -8,6 +8,7 @@
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_description/cluster_descriptor.hpp"
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"

 namespace ck {
@@ -48,6 +49,9 @@ struct ThreadGroupTensorSliceTransfer_v4r1
     static constexpr auto thread_slice_lengths = BlockSliceLengths{} / ThreadClusterLengths{};

+    static constexpr auto src_scalar_per_access = generate_sequence(
+        detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
+
     using Index = MultiIndex<nDim>;

     __device__ constexpr ThreadGroupTensorSliceTransfer_v4r1(
@@ -85,7 +89,8 @@ struct ThreadGroupTensorSliceTransfer_v4r1
             const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex(
                 make_multi_index(ThreadGroup::GetThreadId()));

-            const auto thread_data_idx_begin = thread_cluster_idx * thread_slice_lengths;
+            // This line result in non-packed reading.
+            const auto thread_data_idx_begin = thread_cluster_idx * src_scalar_per_access;

             threadwise_transfer_.SetSrcSliceOrigin(src_desc,
                                                    src_block_slice_origin + thread_data_idx_begin);
@@ -152,7 +157,8 @@ struct ThreadGroupTensorSliceTransfer_v4r1
         make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});

     using ThreadwiseTransfer =
-        ThreadwiseTensorSliceTransfer_v3r1<decltype(thread_slice_lengths),
+        ThreadwiseTensorSliceTransfer_v3r1<ThreadClusterLengths,
+                                           decltype(thread_slice_lengths),
                                            SrcElementwiseOperation,
                                            DstElementwiseOperation,
                                            DstInMemOp,
```
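With this change each thread's starting index is thread_cluster_idx * src_scalar_per_access rather than thread_cluster_idx * thread_slice_lengths, and the threadwise transfer additionally receives ThreadClusterLengths. One plausible reading, matching the new comment, is that threads no longer own contiguous ("packed") slices: neighbouring threads start one vector apart and the whole cluster strides forward together (see the v3r1 changes below). A 1-D host sketch with illustrative numbers:

```cpp
// 1-D illustration only; the real code works on multi-dimensional indices and
// compile-time Sequences. Assumed values: 4 threads per cluster, a per-thread
// slice of 8 elements, SrcScalarPerVector = 2.
#include <cstdio>

int main()
{
    const int threads               = 4;
    const int thread_slice_length   = 8; // BlockSliceLengths / ThreadClusterLengths
    const int src_scalar_per_access = 2; // SrcScalarPerVector

    for(int tid = 0; tid < threads; ++tid)
    {
        const int begin_packed      = tid * thread_slice_length;   // old rule
        const int begin_interleaved = tid * src_scalar_per_access; // new rule
        std::printf("tid %d: packed begin %2d, interleaved begin %2d\n",
                    tid, begin_packed, begin_interleaved);
    }
    return 0;
}
```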
include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp

```diff
@@ -498,16 +498,10 @@ struct GridwiseGemmMultipleD_Wmma
         if constexpr(AEnableLds)
         {
             // AK0_M_AK1 -> AK0_MRepeat_Mwaves_AKRow_MPerWmma_AK1
+            // Debug this part
             constexpr auto A_KRow     = 2;
             constexpr auto A_K0PerRow = ABlockDesc_{}.GetLength(I0) / A_KRow;
             constexpr auto A_K1       = ABlockDesc_{}.GetLength(I2);
-            // return make_naive_tensor_descriptor_packed(make_tuple(Number<A_K0PerRow>{},
-            //                                                       Number<MRepeat>{},
-            //                                                       Number<MWaves>{},
-            //                                                       Number<A_KRow>{},
-            //                                                       Number<MPerWmma>{},
-            //                                                       Number<A_K1>{}));
             return transform_tensor_descriptor(
                 ABlockDesc_{},
                 make_tuple(
@@ -545,7 +539,6 @@ struct GridwiseGemmMultipleD_Wmma
         if constexpr(BEnableLds)
         {
             // BK0_N_BK1 -> BK0_NRepeat_Nwaves_NPerWmma_BK1
-#if 1
             constexpr auto B_KRow     = 2;
             constexpr auto B_K0PerRow = BBlockDesc_{}.GetLength(I0) / B_KRow;
             constexpr auto B_K1       = BBlockDesc_{}.GetLength(I2);
@@ -558,18 +551,6 @@ struct GridwiseGemmMultipleD_Wmma
                            make_pass_through_transform(Number<B_K1>{})),
                 make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
                 make_tuple(Sequence<0, 3>{}, Sequence<1, 2, 4>{}, Sequence<5>{}));
-#endif
-#if 0
-            constexpr auto B_KRow = 2;
-            constexpr auto B_K0PerRow = BBlockDesc_{}.GetLength(I0) / B_KRow;
-            constexpr auto B_K1 = BBlockDesc_{}.GetLength(I2);
-            return make_naive_tensor_descriptor_packed(make_tuple(Number<B_K0PerRow>{},
-                                                                  Number<NRepeat>{},
-                                                                  Number<NWaves>{},
-                                                                  Number<B_KRow>{},
-                                                                  Number<NPerWmma>{},
-                                                                  Number<B_K1>{}));
-#endif
         }
         else
         {
```
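The surviving BEnableLds path keeps only the transform_tensor_descriptor route; the deleted #if 0 alternative built the same 6-D view as a packed descriptor over (B_K0PerRow, NRepeat, NWaves, B_KRow, NPerWmma, B_K1). A small host sketch of that dimension bookkeeping, with hypothetical tile sizes (only the B_KRow / B_K0PerRow arithmetic mirrors the deleted code):

```cpp
// Hypothetical sizes; only the B_KRow / B_K0PerRow arithmetic follows the diff.
#include <cstdio>

int main()
{
    const int BK0 = 8, N = 128, BK1 = 8;              // BK0_N_BK1 block tile (assumed)
    const int NRepeat = 4, NWaves = 2, NPerWmma = 16; // assumed: NRepeat*NWaves*NPerWmma == N
    const int B_KRow     = 2;                         // constexpr auto B_KRow = 2;
    const int B_K0PerRow = BK0 / B_KRow;              // GetLength(I0) / B_KRow
    const int B_K1       = BK1;                       // GetLength(I2)

    std::printf("BK0_N_BK1 elements:          %d\n", BK0 * N * BK1);
    std::printf("6-D view %dx%dx%dx%dx%dx%d: %d\n",
                B_K0PerRow, NRepeat, NWaves, B_KRow, NPerWmma, B_K1,
                B_K0PerRow * NRepeat * NWaves * B_KRow * NPerWmma * B_K1);
    return 0;
}
```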
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp

```diff
@@ -1137,15 +1137,10 @@ struct ThreadwiseTensorSliceTransfer_v4
             move_tensor_coordinate(src_desc, src_data_coord, src_ref_to_data_disp_coord_step);
 #if 0
-            printf("Tid: %03d, Inele_Offset: %d, Coord: (%d, %d, %d, %d, %d, %d)\n",
+            printf("Tid: %03d, LDS read bank: %ld, Inele_Offset: %d\n",
                    get_thread_local_1d_id(),
-                   src_data_coord.GetOffset(),
-                   src_data_coord.GetIndex().At(Number<0>{}),
-                   src_data_coord.GetIndex().At(Number<1>{}),
-                   src_data_coord.GetIndex().At(Number<2>{}),
-                   src_data_coord.GetIndex().At(Number<3>{}),
-                   src_data_coord.GetIndex().At(Number<4>{}),
-                   src_data_coord.GetIndex().At(Number<5>{}));
+                   (src_data_coord.GetOffset()*sizeof(SrcData)/4) %32,
+                   src_data_coord.GetOffset());
 #endif

             vector_type_maker_t<SrcData, SrcScalarPerVector> src_tmp_vector;
@@ -1188,12 +1183,6 @@ struct ThreadwiseTensorSliceTransfer_v4
                     dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
                 });
-#if 0
-                printf("Tid: %03d, Inele_Offset: %d\n",
-                       get_thread_local_1d_id(),
-                       dst_desc.CalculateOffset(dst_origin_idx + data_to_origin_disp_idx));
-#endif
             });
         }
@@ -1410,7 +1399,7 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow
                 constexpr index_t dst_offset = dst_desc.CalculateOffset(
                     dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);

-                SrcData v_this_row, v_theother_row;
+                SrcData v_this_row;
                 // int type temp value due to intrinsic requirement
                 int temp = 0;
@@ -1425,6 +1414,8 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow
                     v_this_row = type_convert_sp<SrcData>(temp);
                 }
+                dst_buf(Number<dst_offset>{}) = type_convert_sp<DstData>(v_this_row);
+#if 0
                 // apply inter-row permute.
                 temp = __builtin_amdgcn_permlanex16(temp,
                                                     type_convert_sp<int>(v_this_row),
@@ -1448,6 +1439,7 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow
                         type_convert_sp<DstData>(v_this_row);
                     dst_buf(Number<dst_offset>{}) = type_convert_sp<DstData>(v_theother_row);
                 }
+#endif
             });
         });
     }
```
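The rewritten (still #if 0-guarded) debug printf reports which of the 32 four-byte LDS banks an element lands in, via (offset * sizeof(SrcData) / 4) % 32. A standalone sketch of that mapping, assuming a 2-byte SrcData such as fp16:

```cpp
// Bank mapping used by the disabled debug printf; uint16_t stands in for fp16.
#include <cstdint>
#include <cstdio>

int main()
{
    using SrcData = std::uint16_t;
    for(int offset = 0; offset < 8; ++offset)
    {
        const int bank = static_cast<int>((offset * sizeof(SrcData)) / 4) % 32;
        std::printf("element offset %d -> LDS bank %d\n", offset, bank);
    }
    return 0;
}
```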
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp

```diff
@@ -49,7 +49,8 @@ struct lambda_scalar_per_access_for_src_and_dst
 // 2. SrcBuffer and DstBuffer are DynamicBuffer
 // 3. src_slice_origin and dst_slice_origin are not known at compile-time,
 // 4. Use thread buffer
-template <typename SliceLengths,
+template <typename ThreadClusterLengths,
+          typename SliceLengths,
           typename SrcElementwiseOperation,
           typename DstElementwiseOperation,
           InMemoryDataOperationEnum DstInMemOp,
@@ -134,13 +135,15 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         constexpr auto ordered_src_access_lengths =
             container_reorder_given_new2old(src_access_lengths, src_dim_access_order);

+        constexpr auto src_move_per_access = ThreadClusterLengths{} * src_scalar_per_access;
+
         // make forward steps
         const auto src_forward_steps = generate_tuple(
             [&](auto i) {
                 Index forward_step_idx;

                 static_for<0, nDim, 1>{}([&](auto j) {
-                    forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0;
+                    // Move on block-wise instead of thread-wise
+                    forward_step_idx(j) = (i.value == j.value) ? src_move_per_access[i] : 0;
                 });

                 return make_tensor_coordinate_step(src_desc, forward_step_idx);
@@ -153,7 +156,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                 Index backward_step_idx;

                 static_for<0, nDim, 1>{}([&](auto j) {
-                    backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0;
+                    backward_step_idx(j) = (i.value == j.value) ? -src_move_per_access[i] : 0;
                 });

                 return make_tensor_coordinate_step(src_desc, backward_step_idx);
@@ -194,7 +197,11 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
                    src_scalar_per_access;
         }();
+#if 0
+        printf("Tid: %03d, global buf offset inbyte: %ld, inele offset: %d, coord: (%d, %d, %d)\n", get_thread_local_1d_id(),
+               src_coord_.GetOffset()*sizeof(DstData), src_coord_.GetOffset(),
+               src_coord_.GetIndex().At(Number<0>{}), src_coord_.GetIndex().At(Number<1>{}), src_coord_.GetIndex().At(Number<2>{}));
+#endif

         constexpr auto src_data_idx_seq = generate_sequence_v2(
             [&](auto i) { return Number<src_data_idx[i]>{}; }, Number<src_data_idx.Size()>{});
```
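The forward and backward steps now advance by src_move_per_access = ThreadClusterLengths{} * src_scalar_per_access instead of src_scalar_per_access, i.e. after each access a thread jumps over the elements handled by the other threads of its cluster ("move on block-wise instead of thread-wise"). A 1-D sketch of the resulting access pattern, with illustrative numbers:

```cpp
// Illustrative 1-D values; the real code uses compile-time Sequences per dimension.
#include <cstdio>

int main()
{
    const int thread_cluster_length = 4;  // ThreadClusterLengths (this dim)
    const int src_scalar_per_access = 2;  // SrcScalarPerVector
    const int block_span            = 32; // cluster-wide extent along this dim
    const int src_move_per_access   = thread_cluster_length * src_scalar_per_access;

    for(int tid = 0; tid < thread_cluster_length; ++tid)
    {
        std::printf("tid %d reads offsets:", tid);
        for(int off = tid * src_scalar_per_access; off < block_span; off += src_move_per_access)
            std::printf(" [%d..%d]", off, off + src_scalar_per_access - 1);
        std::printf("\n");
    }
    return 0;
}
```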
```diff
@@ -369,6 +376,8 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         constexpr auto dst_scalar_per_access = generate_sequence(
             detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});

+        constexpr auto dst_move_per_access = ThreadClusterLengths{} * dst_scalar_per_access;
+
         constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;

         constexpr auto dst_dim_access_order = DstDimAccessOrder{};
@@ -382,7 +391,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                 Index forward_step_idx;

                 static_for<0, nDim, 1>{}([&](auto j) {
-                    forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
+                    forward_step_idx(j) = (i.value == j.value) ? dst_move_per_access[i] : 0;
                 });

                 return make_tensor_coordinate_step(dst_desc, forward_step_idx);
@@ -395,7 +404,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                 Index backward_step_idx;

                 static_for<0, nDim, 1>{}([&](auto j) {
-                    backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
+                    backward_step_idx(j) = (i.value == j.value) ? -dst_move_per_access[i] : 0;
                 });

                 return make_tensor_coordinate_step(dst_desc, backward_step_idx);
@@ -439,7 +448,11 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         constexpr auto dst_data_idx_seq = generate_sequence_v2(
             [&](auto i) { return Number<dst_data_idx[i]>{}; }, Number<dst_data_idx.Size()>{});
+#if 0
+        printf("Tid: %03d, LDS write bank: %ld, inele offset: %04d, coord (%d, %d, %d)\n", get_thread_local_1d_id(),
+               (dst_coord_.GetOffset()*sizeof(DstData)/4 )%32, dst_coord_.GetOffset(),
+               dst_coord_.GetIndex().At(Number<0>{}), dst_coord_.GetIndex().At(Number<1>{}), dst_coord_.GetIndex().At(Number<2>{}));
+#endif

         const bool is_dst_valid =
             coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
@@ -505,7 +518,16 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         {
             const auto dst_reset_step =
                 make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep());
+#if 0
+            const auto dst_reset_idx = GetDstCoordinateResetStep();
+            if (get_thread_local_1d_id()==0)
+            {
+                printf("dst_reset_step: %d, %d, %d\n",
+                       dst_reset_idx.At(Number<0>{}),
+                       dst_reset_idx.At(Number<1>{}),
+                       dst_reset_idx.At(Number<2>{}));
+            }
+#endif
             move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
         }
     }
@@ -517,12 +539,19 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         constexpr auto src_scalar_per_access = generate_sequence(
             detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});

-        constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
+        constexpr auto src_access_unit = SliceLengths{} / src_scalar_per_access;
+
+        constexpr auto src_access_unit_helper = generate_sequence(
+            detail::lambda_scalar_per_access<SrcVectorDim, 1>{}, Number<nDim>{});
+
+        constexpr auto src_access_stride =
+            ThreadClusterLengths{} * (src_access_unit - src_access_unit_helper);

         constexpr auto src_dim_access_order = SrcDimAccessOrder{};

-        constexpr auto ordered_src_access_lengths =
-            container_reorder_given_new2old(src_access_lengths, src_dim_access_order);
+        constexpr auto ordered_src_access_stride =
+            container_reorder_given_new2old(src_access_stride, src_dim_access_order);
+
+        constexpr auto ordered_src_access_unit =
+            container_reorder_given_new2old(src_access_unit, src_dim_access_order);

         // judge move forward or move backward during the last iteration
         constexpr auto forward_sweep = [&]() {
@@ -531,10 +560,10 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             forward_sweep_(I0) = true;

             static_for<1, nDim, 1>{}([&](auto i) {
-                index_t tmp = ordered_src_access_lengths[I0] - 1;
+                index_t tmp = ordered_src_access_unit[I0] - 1;

                 static_for<1, i, 1>{}([&](auto j) {
-                    tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
+                    tmp = tmp * ordered_src_access_unit[j] + ordered_src_access_unit[j] - 1;
                 });

                 forward_sweep_(i) = tmp % 2 == 0;
@@ -549,7 +578,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             Index ordered_idx;

             static_for<0, nDim, 1>{}([&](auto i) {
-                ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0;
+                ordered_idx(i) = forward_sweep[i] ? ordered_src_access_stride[i] : 0;
             });

             return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
@@ -564,7 +593,18 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             return reset_src_data_step_;
         }();
+#if 0
+        if (get_thread_local_1d_id()==0)
+        {
+            printf("ordered_src_access_stride: %d, %d, %d | src_data_idx: %d, %d, %d\n",
+                   ordered_src_access_stride.At(Number<0>{}).value,
+                   ordered_src_access_stride.At(Number<1>{}).value,
+                   ordered_src_access_stride.At(Number<2>{}).value,
+                   src_data_idx.At(Number<0>{}),
+                   src_data_idx.At(Number<1>{}),
+                   src_data_idx.At(Number<2>{}));
+        }
+#endif

         return reset_src_data_step;
     }
@@ -574,13 +614,29 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         // TODO: don't use lambda_scalar_per_access
         constexpr auto dst_scalar_per_access = generate_sequence(
             detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});

-        constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
+        constexpr auto dst_access_unit = SliceLengths{} / dst_scalar_per_access;
+
+        constexpr auto dst_access_unit_helper = generate_sequence(
+            detail::lambda_scalar_per_access<DstVectorDim, 1>{}, Number<nDim>{});
+
+        constexpr auto dst_access_strides =
+            ThreadClusterLengths{} * (dst_access_unit - dst_access_unit_helper);
+#if 0
+        if (get_thread_local_1d_id()==0)
+        {
+            printf("dst_access_strides: %d, %d, %d\n",
+                   dst_access_strides.At(Number<0>{}).value,
+                   dst_access_strides.At(Number<1>{}).value,
+                   dst_access_strides.At(Number<2>{}).value);
+        }
+#endif

         constexpr auto dst_dim_access_order = DstDimAccessOrder{};

-        constexpr auto ordered_dst_access_lengths =
-            container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order);
+        constexpr auto ordered_dst_access_strides =
+            container_reorder_given_new2old(dst_access_strides, dst_dim_access_order);
+
+        constexpr auto ordered_dst_access_unit =
+            container_reorder_given_new2old(dst_access_unit, dst_dim_access_order);

         // judge move forward or move backward during the last iteration
         constexpr auto forward_sweep = [&]() {
@@ -589,10 +645,10 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             forward_sweep_(I0) = true;

             static_for<1, nDim, 1>{}([&](auto i) {
-                index_t tmp = ordered_dst_access_lengths[I0] - 1;
+                index_t tmp = ordered_dst_access_unit[I0] - 1;

                 static_for<1, i, 1>{}([&](auto j) {
-                    tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
+                    tmp = tmp * ordered_dst_access_unit[j] + ordered_dst_access_unit[j] - 1;
                 });

                 forward_sweep_(i) = tmp % 2 == 0;
@@ -600,14 +656,22 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             return forward_sweep_;
         }();
+#if 0
+        if (get_thread_local_1d_id()==0)
+        {
+            printf("forward_sweep: %d, %d, %d\n",
+                   forward_sweep[Number<0>{}],
+                   forward_sweep[Number<1>{}],
+                   forward_sweep[Number<2>{}]);
+        }
+#endif

         // calculate dst data index after last iteration in RunWrite(), if it has not being reset by
         // RunWrite()
         constexpr auto dst_data_idx = [&]() {
             Index ordered_idx;

             static_for<0, nDim, 1>{}([&](auto i) {
-                ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0;
+                ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_strides[i] : 0;
             });

             return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) *
@@ -637,6 +701,18 @@ struct ThreadwiseTensorSliceTransfer_v3r1
         // is it OK to construct a new step every time?
         const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx);
+#if 0
+        if (get_thread_local_1d_id()==0)
+        {
+            printf("InputSrcStep: %d, %d, %d | MoveSrcSliceWindowStep: %d, %d, %d\n",
+                   src_slice_origin_step_idx.At(Number<0>{}),
+                   src_slice_origin_step_idx.At(Number<1>{}),
+                   src_slice_origin_step_idx.At(Number<2>{}),
+                   adjusted_step_idx.At(Number<0>{}),
+                   adjusted_step_idx.At(Number<1>{}),
+                   adjusted_step_idx.At(Number<2>{}));
+        }
+#endif

         move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
     }
```
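The reset-step bookkeeping changes accordingly: the distance from a thread's first to its last access along a dimension is now ThreadClusterLengths * (num_accesses - 1) access units (src_access_stride, built from src_access_unit and the all-ones src_access_unit_helper), rather than num_accesses - 1. A 1-D arithmetic sketch under the same assumed values as the earlier sketch:

```cpp
// Same illustrative values as the earlier sketch; shows the element distance the
// reset step has to undo after the last access.
#include <cstdio>

int main()
{
    const int slice_length          = 8; // per-thread SliceLengths (this dim)
    const int src_scalar_per_access = 2; // SrcScalarPerVector
    const int thread_cluster_length = 4; // ThreadClusterLengths
    const int src_access_unit       = slice_length / src_scalar_per_access;         // accesses per thread
    const int src_access_stride     = thread_cluster_length * (src_access_unit - 1); // in access units

    std::printf("accesses per thread: %d\n", src_access_unit);
    std::printf("last access starts %d elements past the first\n",
                src_access_stride * src_scalar_per_access);
    return 0;
}
```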