gaoqiong / composable_kernel / Commits / 2dea900b

Commit 2dea900b, authored Feb 14, 2021 by Chao Liu
parent 6fe9682a

fixed GetSrcCoordinateResetStep and GetDstCoordinateResetStep in v1r3 and v3
Showing 1 changed file with 183 additions and 88 deletions.
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp (view file @ 2dea900b)

@@ -80,7 +80,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};
 
-        // TODO: don't use this
+        // scalar per access on each dim
+        // TODO: don't use lambda_scalar_per_access
         constexpr auto dst_scalar_per_access = generate_sequence(
             lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});

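Note: generate_sequence with lambda_scalar_per_access builds the per-dimension access width, i.e. DstScalarPerVector on the vector dimension and 1 everywhere else. The host-side stand-in below only illustrates that shape; the dimension count and the parameter values are assumptions, and the real helper produces a compile-time Sequence, not a runtime array.

#include <array>
#include <cstdio>

int main()
{
    constexpr int nDim              = 4;
    constexpr int DstVectorDim      = 3; // assumed for illustration
    constexpr int DstScalarPerVector = 8; // assumed for illustration

    // Per-dimension access width: DstScalarPerVector on the vector dim, 1 elsewhere.
    std::array<int, nDim> dst_scalar_per_access{};
    for(int i = 0; i < nDim; ++i)
        dst_scalar_per_access[i] = (i == DstVectorDim) ? DstScalarPerVector : 1;

    // The access grid is SliceLengths divided element-wise by this array, so only
    // the vector dimension is traversed in chunks of DstScalarPerVector scalars.
    for(int i = 0; i < nDim; ++i)
        printf("%d ", dst_scalar_per_access[i]); // prints: 1 1 1 8
    printf("\n");
}
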
@@ -260,31 +261,64 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
     __device__ static constexpr auto GetDstCoordinateResetStep()
     {
-        constexpr auto dst_scalar_per_access = [&]() {
-            Index dst_scalar_per_access;
-
-            static_for<0, nDim, 1>{}([&](auto i) {
-                dst_scalar_per_access(i) = (i == DstVectorDim) ? DstScalarPerVector : 1;
-            });
-
-            return dst_scalar_per_access;
-        }();
-
-        MultiIndex<nDim> dst_reset_iterator;
-
-        // TODO: this is wrong, need to consider DimAccessOrder
-        dst_reset_iterator(Number<0>{}) = dst_scalar_per_access[Number<0>{}] - SliceLengths{}[0];
-
-        static_for<1, nDim, 1>{}([&](auto i) {
-            constexpr auto i_m1 = i - Number<1>{};
-
-            // TODO: this is wrong
-            dst_reset_iterator(i) =
-                (SliceLengths{}[i_m1] % (2 * dst_scalar_per_access[i_m1]) == 0)
-                    ? 0
-                    : (dst_scalar_per_access[i] - SliceLengths{}[i]);
-        });
-
-        return dst_reset_iterator;
+        constexpr auto I0 = Number<0>{};
+
+        // scalar per access on each dim
+        // TODO: don't use lambda_scalar_per_access
+        constexpr auto dst_scalar_per_access = generate_sequence(
+            lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
+
+        constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access;
+
+        constexpr auto dim_access_order = DimAccessOrder{};
+
+        constexpr auto ordered_access_lengths =
+            container_reorder_given_new2old(access_lengths, dim_access_order);
+
+        // judge move forward or move backward during the last iteration
+        constexpr auto forward_sweep = [&]() {
+            StaticallyIndexedArray<bool, nDim> forward_sweep;
+
+            forward_sweep(I0) = true;
+
+            static_for<1, nDim, 1>{}([&](auto i) {
+                index_t tmp = ordered_access_lengths[I0] - 1;
+
+                static_for<0, i, 1>{}([&](auto j) {
+                    tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1;
+                });
+
+                forward_sweep(i) = tmp % 2 == 0;
+            });
+
+            return forward_sweep;
+        }();
+
+        // calculate dst data index after last iteration in RunWrite(), if it has not being reset by
+        // RunWrite()
+        constexpr auto dst_data_idx = [&]() {
+            Index ordered_idx;
+
+            static_for<0, nDim, 1>{}([&](auto i) {
+                ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0;
+            });
+
+            auto dst_data_idx = container_reorder_given_old2new(ordered_idx, dim_access_order) *
+                                dst_scalar_per_access;
+
+            return dst_data_idx;
+        }();
+
+        //
+        constexpr auto reset_dst_data_step = [&]() {
+            Index reset_dst_data_step;
+
+            static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step(i) = -dst_data_idx[i]; });
+
+            return reset_dst_data_step;
+        }();
+
+        return reset_dst_data_step;
     }
 
     // dst_slice_origin_step_idx need to be known at compile-time, for performance reason

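Note on the new scheme: instead of guessing the reset step from the parity of SliceLengths, the rewritten GetDstCoordinateResetStep works out, per dimension, whether the final pass of the zig-zag traversal runs forward or backward (forward_sweep), derives the data index reached by the last access, and returns its negation. The host-side sketch below mirrors that arithmetic with plain loops so it can be run and inspected; the slice lengths, the scalar-per-access values, and the identity access order are illustrative assumptions, not values from the library.

#include <array>
#include <cstdio>

// Host-side sketch of the reset-step computation introduced by this commit.
// Shapes are assumed for illustration; the real code is compile-time device code.
int main()
{
    constexpr int nDim = 3;
    std::array<int, nDim> slice_lengths{4, 3, 8};     // hypothetical SliceLengths
    std::array<int, nDim> scalar_per_access{1, 1, 4}; // vector dim 2, 4 scalars per access

    // access_lengths = SliceLengths / dst_scalar_per_access
    std::array<int, nDim> access_lengths{};
    for(int i = 0; i < nDim; ++i)
        access_lengths[i] = slice_lengths[i] / scalar_per_access[i];

    // forward_sweep[i]: does the final pass over dim i run forward? The direction
    // flips whenever the slower dims advance, so it is decided by the parity of the
    // linearized position of the last access over the slower dims (same formula as the diff).
    std::array<bool, nDim> forward_sweep{};
    forward_sweep[0] = true;
    for(int i = 1; i < nDim; ++i)
    {
        int tmp = access_lengths[0] - 1;
        for(int j = 0; j < i; ++j)
            tmp = tmp * access_lengths[j] + access_lengths[j] - 1;
        forward_sweep[i] = (tmp % 2 == 0);
    }

    // Data index reached by the last access: end of a forward dim, start of a
    // backward dim, scaled back to scalar units. The reset step walks back from it.
    printf("reset step:");
    for(int i = 0; i < nDim; ++i)
    {
        int last = (forward_sweep[i] ? access_lengths[i] - 1 : 0) * scalar_per_access[i];
        printf(" %d", -last);
    }
    printf("\n"); // with these shapes: "reset step: -3 0 0"
}
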
@@ -385,19 +419,20 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};
 
-        // TODO: don't use this
+        // scalar per access on each dim
+        // TODO: don't use lambda_scalar_per_access
         constexpr auto src_scalar_per_access = generate_sequence(
             lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
 
         constexpr auto src_scalar_step_in_vector =
             generate_sequence(lambda_scalar_step_in_vector<SrcVectorDim>{}, Number<nDim>{});
 
-        constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access;
+        constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
 
         constexpr auto src_dim_access_order = SrcDimAccessOrder{};
 
-        constexpr auto ordered_access_lengths =
-            container_reorder_given_new2old(access_lengths, src_dim_access_order);
+        constexpr auto ordered_src_access_lengths =
+            container_reorder_given_new2old(src_access_lengths, src_dim_access_order);
 
         // make forward iterators
         const auto src_forward_iterators = generate_tuple(

@@ -428,7 +463,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
             Number<nDim>{});
 
         // loop over tensor and copy
-        static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
+        static_ford<decltype(ordered_src_access_lengths)>{}([&](auto ordered_src_access_idx) {
             // judge move forward or move backward
             constexpr auto forward_sweep = [&]() {

@@ -437,10 +472,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                 forward_sweep(I0) = true;
 
                 static_for<1, nDim, 1>{}([&](auto i) {
-                    index_t tmp = ordered_access_idx[I0];
+                    index_t tmp = ordered_src_access_idx[I0];
 
                     static_for<0, i, 1>{}([&](auto j) {
-                        tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
+                        tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j];
                     });
 
                     forward_sweep(i) = tmp % 2 == 0;

@@ -450,19 +485,20 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
             }();
 
             // calculate src data index
-            constexpr auto data_idx = [&]() {
+            constexpr auto src_data_idx = [&]() {
                 Index ordered_idx;
 
                 static_for<0, nDim, 1>{}([&](auto i) {
-                    ordered_idx(i) = forward_sweep[i] ? ordered_access_idx[i]
-                                                      : ordered_access_lengths[i] - 1 -
-                                                            ordered_access_idx[i];
+                    ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i]
+                                                      : ordered_src_access_lengths[i] - 1 -
+                                                            ordered_src_access_idx[i];
                 });
 
-                auto data_idx = container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
-                                src_scalar_per_access;
+                auto src_data_idx = container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
+                                    src_scalar_per_access;
 
-                return data_idx;
+                return src_data_idx;
             }();
 
             // copy data

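The renaming above also makes the two index spaces explicit: the ordered_* quantities live in traversal (access) order, while src_data_idx is the same point mapped back to the tensor's dimension order. The sketch below models one plausible reading of the two reorder conventions with plain arrays; reorder_given_new2old and reorder_given_old2new are illustrative stand-ins written for this note, not the library's container utilities.

#include <array>
#include <cstdio>

constexpr int nDim = 3;
using Idx = std::array<int, nDim>;

// order[k] = which original (old) dimension is visited k-th (new position k).

// Gather per-dimension values into access (new) order: result[k] = x[order[k]]
// (assumed convention for container_reorder_given_new2old).
Idx reorder_given_new2old(const Idx& x, const Idx& order)
{
    Idx r{};
    for(int k = 0; k < nDim; ++k)
        r[k] = x[order[k]];
    return r;
}

// Inverse mapping: scatter values known in access order back to dimension order
// (assumed convention for container_reorder_given_old2new).
Idx reorder_given_old2new(const Idx& x_ordered, const Idx& order)
{
    Idx r{};
    for(int k = 0; k < nDim; ++k)
        r[order[k]] = x_ordered[k];
    return r;
}

int main()
{
    Idx src_access_lengths{4, 3, 8};   // per original dimension
    Idx src_dim_access_order{2, 0, 1}; // hypothetical SrcDimAccessOrder

    Idx ordered_src_access_lengths = reorder_given_new2old(src_access_lengths, src_dim_access_order);
    // ordered_src_access_lengths == {8, 4, 3}: lengths listed in traversal order.

    Idx ordered_idx{5, 2, 1}; // an index in traversal order
    Idx src_data_idx = reorder_given_old2new(ordered_idx, src_dim_access_order);
    // src_data_idx == {2, 1, 5}: the same point expressed in dimension order.

    printf("%d %d %d\n", src_data_idx[0], src_data_idx[1], src_data_idx[2]);
}
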
@@ -486,7 +522,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                 static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                     constexpr index_t buffer_offset =
-                        buffer_desc_.CalculateOffset(data_idx + i * src_scalar_step_in_vector);
+                        buffer_desc_.CalculateOffset(src_data_idx + i * src_scalar_step_in_vector);
 
                     buffer_(Number<buffer_offset>{}) = src_vector[i];
                 });

@@ -502,7 +538,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                 static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                     constexpr index_t buffer_offset =
-                        buffer_desc_.CalculateOffset(data_idx + i * src_scalar_step_in_vector);
+                        buffer_desc_.CalculateOffset(src_data_idx + i * src_scalar_step_in_vector);
 
                     buffer_(Number<buffer_offset>{}) = src_vector[i];
                 });

@@ -513,10 +549,11 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                 StaticallyIndexedArray<bool, nDim> move_on_dim;
 
                 static_for<0, nDim, 1>{}([&](auto i) {
-                    move_on_dim(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1;
+                    move_on_dim(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1;
 
                     static_for<i + 1, nDim, 1>{}([&](auto j) {
-                        move_on_dim(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1;
+                        move_on_dim(i) &=
+                            ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1;
                     });
                 });

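move_on_dim(i) encodes the odometer rule applied after each access: the coordinate advances on dimension i only when that dimension still has accesses left and every faster-varying dimension (those after i in access order) has just reached its last access. A small host-side model of that rule, under assumed shapes, is shown below; it is not the device code from this file.

#include <array>
#include <cstdio>

int main()
{
    constexpr int nDim = 3;
    std::array<int, nDim> ordered_src_access_lengths{2, 2, 3};
    std::array<int, nDim> ordered_src_access_idx{0, 0, 2}; // current position, access order

    // move_on_dim[i]: should the iterator step on dimension i after this access?
    std::array<bool, nDim> move_on_dim{};
    for(int i = 0; i < nDim; ++i)
    {
        // Not yet at the last access on this dimension ...
        move_on_dim[i] = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1;
        // ... and all faster-varying dimensions have reached their last access.
        for(int j = i + 1; j < nDim; ++j)
            move_on_dim[i] = move_on_dim[i] &&
                             (ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1);
    }

    // Here only dimension 1 moves: dim 2 is at its last access, dim 1 still has room,
    // and dim 0 is blocked because dim 1 has not finished its pass yet.
    for(int i = 0; i < nDim; ++i)
        printf("move_on_dim[%d] = %d\n", i, int(move_on_dim[i]));
}
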
@@ -563,6 +600,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};
 
+        // src scalar per access on each dim
         // TODO: don't use this
         constexpr auto dst_scalar_per_access = generate_sequence(
             lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});

@@ -570,12 +608,12 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         constexpr auto dst_scalar_step_in_vector =
             generate_sequence(lambda_scalar_step_in_vector<DstVectorDim>{}, Number<nDim>{});
 
-        constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access;
+        constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
 
         constexpr auto dst_dim_access_order = DstDimAccessOrder{};
 
-        constexpr auto ordered_access_lengths =
-            container_reorder_given_new2old(access_lengths, dst_dim_access_order);
+        constexpr auto ordered_dst_access_lengths =
+            container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order);
 
         // make forward iterators
         const auto dst_forward_iterators = generate_tuple(

@@ -610,7 +648,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
             Number<nDim>{});
 
         // loop over tensor and copy
-        static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
+        static_ford<decltype(ordered_dst_access_lengths)>{}([&](auto ordered_dst_access_idx) {
             // judge move forward or move backward
             constexpr auto forward_sweep = [&]() {

@@ -619,10 +657,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                 forward_sweep(I0) = true;
 
                 static_for<1, nDim, 1>{}([&](auto i) {
-                    index_t tmp = ordered_access_idx[I0];
+                    index_t tmp = ordered_dst_access_idx[I0];
 
                     static_for<0, i, 1>{}([&](auto j) {
-                        tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
+                        tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j];
                     });
 
                     forward_sweep(i) = tmp % 2 == 0;

@@ -636,9 +674,9 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                 Index ordered_idx;
 
                 static_for<0, nDim, 1>{}([&](auto i) {
-                    ordered_idx(i) = forward_sweep[i] ? ordered_access_idx[i]
-                                                      : ordered_access_lengths[i] - 1 -
-                                                            ordered_access_idx[i];
+                    ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_idx[i]
+                                                      : ordered_dst_access_lengths[i] - 1 -
+                                                            ordered_dst_access_idx[i];
                 });
 
                 auto dst_data_idx =

@@ -674,10 +712,11 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                 StaticallyIndexedArray<bool, nDim> move_on_dim;
 
                 static_for<0, nDim, 1>{}([&](auto i) {
-                    move_on_dim(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1;
+                    move_on_dim(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1;
 
                     static_for<i + 1, nDim, 1>{}([&](auto j) {
-                        move_on_dim(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1;
+                        move_on_dim(i) &=
+                            ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1;
                    });
                 });

@@ -745,70 +784,126 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
     __device__ static constexpr auto GetSrcCoordinateResetStep()
     {
-        constexpr auto src_scalar_per_access = [&]() {
-            Index src_scalar_per_access;
-
-            static_for<0, nDim, 1>{}([&](auto i) {
-                if constexpr(i == SrcVectorDim)
-                {
-                    src_scalar_per_access(i) = SrcScalarPerVector;
-                }
-                else
-                {
-                    src_scalar_per_access(i) = 1;
-                }
-            });
-
-            return src_scalar_per_access;
-        }();
-
-        MultiIndex<nDim> src_reset_iterator;
-
-        src_reset_iterator(Number<0>{}) = src_scalar_per_access[Number<0>{}] - SliceLengths{}[0];
-
-        static_for<1, nDim, 1>{}([&](auto i) {
-            constexpr auto i_m1 = i - Number<1>{};
-
-            src_reset_iterator(i) =
-                (SliceLengths{}[i_m1] % (2 * src_scalar_per_access[i_m1]) == 0)
-                    ? 0
-                    : (src_scalar_per_access[i] - SliceLengths{}[i]);
-        });
-
-        return src_reset_iterator;
-    }
-
-    __device__ static constexpr auto GetDstCoordinateResetStep()
-    {
-        constexpr auto dst_scalar_per_access = [&]() {
-            Index dst_scalar_per_access;
-
-            static_for<0, nDim, 1>{}([&](auto i) {
-                if constexpr(i == DstVectorDim)
-                {
-                    dst_scalar_per_access(i) = DstScalarPerVector;
-                }
-                else
-                {
-                    dst_scalar_per_access(i) = 1;
-                }
-            });
-
-            return dst_scalar_per_access;
-        }();
-
-        MultiIndex<nDim> dst_reset_iterator;
-
-        dst_reset_iterator(Number<0>{}) = dst_scalar_per_access[Number<0>{}] - SliceLengths{}[0];
-
-        static_for<1, nDim, 1>{}([&](auto i) {
-            constexpr auto i_m1 = i - Number<1>{};
-
-            dst_reset_iterator(i) =
-                (SliceLengths{}[i_m1] % (2 * dst_scalar_per_access[i_m1]) == 0)
-                    ? 0
-                    : (dst_scalar_per_access[i] - SliceLengths{}[i]);
-        });
-
-        return dst_reset_iterator;
+        constexpr auto I0 = Number<0>{};
+
+        // scalar per access on each dim
+        // TODO: don't use lambda_scalar_per_access
+        constexpr auto src_scalar_per_access = generate_sequence(
+            lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
+
+        constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
+
+        constexpr auto src_dim_access_order = SrcDimAccessOrder{};
+
+        constexpr auto ordered_src_access_lengths =
+            container_reorder_given_new2old(src_access_lengths, src_dim_access_order);
+
+        // judge move forward or move backward during the last iteration
+        constexpr auto forward_sweep = [&]() {
+            StaticallyIndexedArray<bool, nDim> forward_sweep;
+
+            forward_sweep(I0) = true;
+
+            static_for<1, nDim, 1>{}([&](auto i) {
+                index_t tmp = ordered_src_access_lengths[I0] - 1;
+
+                static_for<0, i, 1>{}([&](auto j) {
+                    tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
+                });
+
+                forward_sweep(i) = tmp % 2 == 0;
+            });
+
+            return forward_sweep;
+        }();
+
+        // calculate src data index after last iteration in RunRead(), if it has not being reset by
+        // RunRead()
+        constexpr auto src_data_idx = [&]() {
+            Index ordered_idx;
+
+            static_for<0, nDim, 1>{}([&](auto i) {
+                ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0;
+            });
+
+            auto src_data_idx = container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
+                                src_scalar_per_access;
+
+            return src_data_idx;
+        }();
+
+        //
+        constexpr auto reset_src_data_step = [&]() {
+            Index reset_src_data_step;
+
+            static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step(i) = -src_data_idx[i]; });
+
+            return reset_src_data_step;
+        }();
+
+        return reset_src_data_step;
+    }
+
+    __device__ static constexpr auto GetDstCoordinateResetStep()
+    {
+        constexpr auto I0 = Number<0>{};
+
+        // scalar per access on each dim
+        // TODO: don't use lambda_scalar_per_access
+        constexpr auto dst_scalar_per_access = generate_sequence(
+            lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
+
+        constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
+
+        constexpr auto dst_dim_access_order = DstDimAccessOrder{};
+
+        constexpr auto ordered_dst_access_lengths =
+            container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order);
+
+        // judge move forward or move backward during the last iteration
+        constexpr auto forward_sweep = [&]() {
+            StaticallyIndexedArray<bool, nDim> forward_sweep;
+
+            forward_sweep(I0) = true;
+
+            static_for<1, nDim, 1>{}([&](auto i) {
+                index_t tmp = ordered_dst_access_lengths[I0] - 1;
+
+                static_for<0, i, 1>{}([&](auto j) {
+                    tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
+                });
+
+                forward_sweep(i) = tmp % 2 == 0;
+            });
+
+            return forward_sweep;
+        }();
+
+        // calculate dst data index after last iteration in RunWrite(), if it has not being reset by
+        // RunWrite()
+        constexpr auto dst_data_idx = [&]() {
+            Index ordered_idx;
+
+            static_for<0, nDim, 1>{}([&](auto i) {
+                ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0;
+            });
+
+            auto dst_data_idx = container_reorder_given_old2new(ordered_idx, dst_dim_access_order) *
+                                dst_scalar_per_access;
+
+            return dst_data_idx;
+        }();
+
+        //
+        constexpr auto reset_dst_data_step = [&]() {
+            Index reset_dst_data_step;
+
+            static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step(i) = -dst_data_idx[i]; });
+
+            return reset_dst_data_step;
+        }();
+
+        return reset_dst_data_step;
     }
 
     // src_slice_origin_step_idx need to be known at compile-time, for performance reason

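Taken together, the rewritten reset-step functions return exactly the step that moves the coordinate from wherever the zig-zag traversal of RunRead()/RunWrite() ends back to the slice origin, so a subsequent slice-window move can start from a clean origin. The toy example below walks through one such hand-off for an assumed 2 x 8 slice read 4 scalars at a time; the window step and the idea of sliding the window along dim 1 are illustrative assumptions, not code from this commit.

#include <array>
#include <cstdio>

int main()
{
    constexpr int nDim = 2;
    // Hypothetical slice: 2 x 8 scalars, read 4 scalars per access along dim 1,
    // so the access grid is 2 x 2 and dim 1 is swept twice (forward, then backward).
    std::array<int, nDim> last_data_idx{1, 0}; // where the zig-zag pass ends (scalar units)
    std::array<int, nDim> reset_step{-1, 0};   // what GetSrcCoordinateResetStep would return here
    std::array<int, nDim> window_step{0, 8};   // illustrative move to the next slice window

    // After a pass the coordinate sits at last_data_idx; applying the reset step
    // together with the window step lands it on the next window's origin.
    std::array<int, nDim> coord = last_data_idx;
    for(int i = 0; i < nDim; ++i)
        coord[i] += reset_step[i] + window_step[i];

    printf("next window origin: {%d, %d}\n", coord[0], coord[1]); // {0, 8}
}
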