yangql / composable_kernel-1 · Commits

Commit 41cdde99
authored Aug 06, 2019 by Chao Liu

    add looping Orders into ford and static_ford

parent 0271338e

Showing 2 changed files with 87 additions and 72 deletions (+87 -72)
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp  +19 -19
composable_kernel/include/utility/functional3.hpp                                   +68 -53
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp

@@ -199,7 +199,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
    }

    __device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
-                                         Float* __restrict__ p_Buffer) const
+                                         Float* __restrict__ p_buffer) const
    {
        constexpr auto thread_sub_tensor_lengths = SubLengths{};
@@ -216,24 +216,24 @@ struct BlockwiseGenericTensorSliceCopy_v1
            constexpr auto src_thread_data_multi_id_begin =
                repeat_multi_id * data_per_cluster_per_dims;

-           constexpr auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
+           constexpr auto buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;

            constexpr index_t src_offset =
                SrcDesc::GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);

-           constexpr index_t Buffer_offset =
-               thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin);
+           constexpr index_t buffer_offset =
+               thread_tensor_desc.GetOffsetFromMultiIndex(buffer_data_multi_id_begin);
#else
        ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id) {
            const auto src_thread_data_multi_id_begin =
                repeat_multi_id * data_per_cluster_per_dims;

-           const auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
+           const auto buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;

            const index_t src_offset =
                SrcDesc::GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);

-           const index_t Buffer_offset =
-               thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin);
+           const index_t buffer_offset =
+               thread_tensor_desc.GetOffsetFromMultiIndex(buffer_data_multi_id_begin);
#endif

            // By position the origin of the per-thread window at the point, where multi-index
@@ -247,7 +247,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
                                                    p_src + src_offset + mThreadSrcOffset,
                                                    make_zero_array<index_t, nDim>(),
                                                    thread_tensor_desc,
-                                                   p_Buffer + Buffer_offset,
+                                                   p_buffer + buffer_offset,
                                                    make_zero_array<index_t, nDim>(),
                                                    thread_sub_tensor_lengths,
                                                    SrcAccessOrder{},
@@ -255,7 +255,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
        });
    }

-   __device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_Buffer,
+   __device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_buffer,
                                           Float* __restrict__ p_dst) const
    {
        constexpr auto thread_sub_tensor_lengths = SubLengths{};
@@ -270,23 +270,23 @@ struct BlockwiseGenericTensorSliceCopy_v1
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id) {
-           constexpr auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
+           constexpr auto buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
            constexpr auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;

-           constexpr index_t Buffer_offset =
-               thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin);
+           constexpr index_t buffer_offset =
+               thread_tensor_desc.GetOffsetFromMultiIndex(buffer_data_multi_id_begin);
            constexpr index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_multi_id_begin);
#else
        ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id) {
-           const auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
+           const auto buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
            const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;

-           const index_t Buffer_offset =
-               thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin);
+           const index_t buffer_offset =
+               thread_tensor_desc.GetOffsetFromMultiIndex(buffer_data_multi_id_begin);
            const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_multi_id_begin);
#endif
@@ -299,7 +299,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
            // If in the future, you want to enable SubLengths > 1 at the merged dimension,
            // special care in implementation is needed
            threadwise_generic_tensor_slice_copy_v1(thread_tensor_desc,
-                                                   p_Buffer + Buffer_offset,
+                                                   p_buffer + buffer_offset,
                                                    make_zero_array<index_t, nDim>(),
                                                    DstDesc{},
                                                    p_dst + dst_offset + mThreadDstOffset,
@@ -312,10 +312,10 @@ struct BlockwiseGenericTensorSliceCopy_v1
    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
    {
-       Float p_Buffer[GetRegisterBufferSize()];
+       Float p_buffer[GetRegisterBufferSize()];

-       RunLoadRegisterBuffer(p_src, p_Buffer);
-       RunStoreRegisterBuffer(p_Buffer, p_dst);
+       RunLoadRegisterBuffer(p_src, p_buffer);
+       RunStoreRegisterBuffer(p_buffer, p_dst);
    }

    // When moving the slicing windows along a merged dimension, if the strides of the
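The hunks above are a pure rename of the thread-private staging array and its offsets (p_Buffer to p_buffer, Buffer_offset to buffer_offset); the copy itself still runs in the two stages that Run() shows: load a slice of the source into a register buffer, then store that buffer to the destination. Below is a minimal standalone sketch of that staging pattern, with a hypothetical fixed size standing in for GetRegisterBufferSize() and flat pointer arithmetic standing in for the tensor descriptors; it is plain host C++ for illustration, not the library's __device__ code.

#include <cstddef>
#include <cstdio>

constexpr std::size_t kBufferSize = 8; // hypothetical stand-in for GetRegisterBufferSize()

template <class Float>
void run_copy(const Float* p_src, Float* p_dst)
{
    Float p_buffer[kBufferSize]; // thread-private staging buffer

    // stage 1: RunLoadRegisterBuffer analogue - gather the source window into the buffer
    for(std::size_t i = 0; i < kBufferSize; ++i)
        p_buffer[i] = p_src[i];

    // stage 2: RunStoreRegisterBuffer analogue - write the buffer out to the destination
    for(std::size_t i = 0; i < kBufferSize; ++i)
        p_dst[i] = p_buffer[i];
}

int main()
{
    float src[kBufferSize] = {0, 1, 2, 3, 4, 5, 6, 7};
    float dst[kBufferSize] = {};
    run_copy(src, dst);
    std::printf("dst[7] = %g\n", dst[7]); // prints 7
}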
composable_kernel/include/utility/functional3.hpp

@@ -24,105 +24,120 @@ struct is_static<Sequence<Is...>> : integral_constant<bool, true>
};
// RemainLengths: Sequence<...>
-template <class RemainLengths>
+// Orders: Sequence<...>
+template <class RemainLengths, class Orders>
struct static_ford_impl
{
-   // F signature: F(Sequence<...> multi_id)
-   // CurrentMultiIndex: Sequence<...>
-   template <class F, class CurrentMultiIndex>
-   __host__ __device__ constexpr void operator()(F f, CurrentMultiIndex) const
+   __host__ __device__ constexpr static_ford_impl()
    {
        static_assert(RemainLengths::GetSize() > 0, "wrong! should not get here");
+   }

+   // F signature: F(Sequence<...>)
+   // CurrentOrderedId: Sequence<...>
+   template <class F, class CurrentOrderedId>
+   __host__ __device__ constexpr void operator()(F f, CurrentOrderedId) const
+   {
        static_for<0, RemainLengths::Front(), 1>{}([=](auto I) {
-           static_ford_impl<decltype(RemainLengths::PopFront())>{}(f, CurrentMultiIndex::PushBack(I));
+           static_ford_impl<decltype(RemainLengths::PopFront()), Orders>{}(
+               f, CurrentOrderedId::PushBack(I));
        });
    }
};
-template <>
-struct static_ford_impl<Sequence<>>
+template <class Orders>
+struct static_ford_impl<Sequence<>, Orders>
{
-   // F signature: F(Sequence<...> multi_id)
-   // CurrentMultiIndex: Sequence<...>
-   template <class F, class CurrentMultiIndex>
-   __host__ __device__ constexpr void operator()(F f, CurrentMultiIndex) const
+   // F signature: F(Sequence<...>)
+   // OrderedId: Sequence<...>
+   template <class F, class OrderedId>
+   __host__ __device__ constexpr void operator()(F f, OrderedId) const
    {
-       f(CurrentMultiIndex{});
+       // retrive unordered Id
+       f(OrderedId::ReorderGivenOld2New(Orders{}));
    }
};
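The "retrive unordered Id" terminal case above is where the new Orders parameter does its work: the recursion builds the multi-index in loop order, and ReorderGivenOld2New(Orders{}) scatters it back into dimension order before calling f, while static_ford's operator() further down in this hunk gathers the loop bounds with ReorderGivenNew2Old(Orders{}). Below is a standalone illustration of those two reorder directions as I read them, using plain std::array instead of Sequence; it is a sketch of the mapping, not the library code.

#include <array>
#include <cstdio>

// new2old gather: ordered_lengths[k] = lengths[orders[k]]
// (my reading of what Lengths::ReorderGivenNew2Old(Orders{}) does for the loop bounds)
template <std::size_t N>
std::array<int, N> reorder_new2old(const std::array<int, N>& lengths,
                                   const std::array<int, N>& orders)
{
    std::array<int, N> ordered{};
    for(std::size_t k = 0; k < N; ++k)
        ordered[k] = lengths[orders[k]];
    return ordered;
}

// old2new scatter: unordered_id[orders[k]] = ordered_id[k]
// (my reading of what OrderedId::ReorderGivenOld2New(Orders{}) does for the multi-index)
template <std::size_t N>
std::array<int, N> reorder_old2new(const std::array<int, N>& ordered_id,
                                   const std::array<int, N>& orders)
{
    std::array<int, N> unordered{};
    for(std::size_t k = 0; k < N; ++k)
        unordered[orders[k]] = ordered_id[k];
    return unordered;
}

int main()
{
    const std::array<int, 2> orders{1, 0};
    const auto ordered_lengths = reorder_new2old<2>({2, 3}, orders); // (3, 2)
    const auto unordered_id    = reorder_old2new<2>({7, 5}, orders); // (5, 7)
    std::printf("ordered_lengths = (%d, %d), unordered_id = (%d, %d)\n",
                ordered_lengths[0], ordered_lengths[1], unordered_id[0], unordered_id[1]);
}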
-// Lengths is Sequence<...>
-template <class Lengths>
+// Lengths is Sequence<...>, it is the length of each dimension for N-dimensional loop
+// Orders is Sequence<...>, it is the order of dimension in which static_ford will loop over each
+// dimension
+template <class Lengths,
+          class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type>
struct static_ford
{
+   __host__ __device__ constexpr static_ford()
+   {
+       static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty");
+       static_assert(Lengths::GetSize() == Orders::GetSize(), "wrong! inconsistent size");
+   }

    // F signature: F(Sequence<...> multi_id)
+   // multi_id is the unordered multi-index
    template <class F>
    __host__ __device__ constexpr void operator()(F f) const
    {
-       static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty");
-
-       static_ford_impl<Lengths>{}(f, Sequence<>{});
+       constexpr auto ordered_lengths = Lengths::ReorderGivenNew2Old(Orders{});
+
+       static_ford_impl<decltype(ordered_lengths), Orders>{}(f, Sequence<>{});
    }
};
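With the default Orders (the arithmetic sequence 0, 1, ...) static_ford behaves as before; an explicit Orders changes which dimension the generated loop nest iterates outermost, while the functor still receives the unordered multi-index, as the new comments state. Below is a usage sketch of that reading; it assumes the composable_kernel headers that define Sequence and static_ford, assumes they live in namespace ck, and the include path is hypothetical.

#include "composable_kernel/include/utility/functional3.hpp" // hypothetical include path

using namespace ck; // assumption: these utilities live in namespace ck

int main()
{
    // Lengths = (2, 3); Orders = (1, 0) should make dimension 1 the outermost loop.
    static_ford<Sequence<2, 3>, Sequence<1, 0>>{}([](auto multi_id) {
        // multi_id is still delivered in dimension order (i0, i1);
        // only the nesting order of the compile-time loops changes.
    });
}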
-template <index_t RemainDim>
+// RemainLengths: Sequence<...>
+// Orders: Sequence<...>
+template <class RemainLengths, class Orders>
struct ford_impl
{
-   // F signature: F(Array<...> multi_id)
-   // CurrentMultiIndex: Array<...>
-   // RemainLengths: Sequence<...>
-   template <class F, class CurrentMultiIndex, class RemainLengths>
-   __host__ __device__ constexpr void
-   operator()(F f, CurrentMultiIndex current_multi_id, RemainLengths) const
+   __host__ __device__ constexpr ford_impl()
    {
-       static_assert(RemainLengths::GetSize() == RemainDim, "wrong!");
-       static_assert(RemainDim > 1, "wrong!");
-
-       constexpr auto next_length = RemainLengths{}.Front();
-
-       for(index_t i = 0; i < next_length; ++i)
+       static_assert(RemainLengths::GetSize() > 0, "wrong! should not get here");
+   }
+
+   // F signature: F(Array<...> multi_id)
+   // CurrentOrderdId: Array<...>
+   template <class F, class CurrentOrderedId>
+   __host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const
+   {
+       for(index_t i = 0; i < RemainLengths::Front(); ++i)
        {
-           ford_impl<RemainDim - 1>{}(f, current_multi_id.PushBack(i), RemainLengths{}.PopFront());
+           ford_impl<decltype(RemainLengths::PopFront()), Orders>{}(f,
+               current_ordered_id.PushBack(i));
        }
    }
};
-template <>
-struct ford_impl<1>
+template <class Orders>
+struct ford_impl<Sequence<>, Orders>
{
    // F signature: F(Array<...> multi_id)
-   // CurrentMultiIndex: Array<...>
-   // RemainLengths: Sequence<...>
-   template <class F, class CurrentMultiIndex, class RemainLengths>
-   __host__ __device__ constexpr void
-   operator()(F f, CurrentMultiIndex current_multi_id, RemainLengths) const
+   // CurrentOrderdId: Array<...>
+   template <class F, class CurrentOrderedId>
+   __host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const
    {
-       static_assert(RemainLengths::GetSize() == 1, "wrong!");
-
-       constexpr index_t last_length = RemainLengths{}.Front();
-
-       for(index_t i = 0; i < last_length; ++i)
-       {
-           f(current_multi_id.PushBack(i));
-       }
+       // retrive unordered Id
+       f(reorder_array_given_old2new(current_ordered_id, Orders{}));
    }
};
-// Lengths is Sequence<...>
-template <class Lengths>
+// Lengths is Sequence<...>, it is the length of each dimension for N-dimensional loop
+// Orders is Sequence<...>, it is the order of dimension in which ford will loop over each
+// dimension
+template <class Lengths,
+          class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type>
struct ford
{
+   __host__ __device__ constexpr ford()
+   {
+       static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty");
+       static_assert(Lengths::GetSize() == Orders::GetSize(), "wrong! inconsistent size");
+   }

    // F signature: F(Array<...> multi_id)
+   // multi_id is the unordered multi-index
    template <class F>
    __host__ __device__ constexpr void operator()(F f) const
    {
-       constexpr index_t first_length = Lengths{}.Front();
-
-       for(index_t i = 0; i < first_length; ++i)
+       for(index_t i = 0; i < Lengths::Front(); ++i)
        {
-           ford_impl<Lengths::GetSize() - 1>{}(f, Array<index_t, 1>{i}, Lengths{}.PopFront());
+           ford_impl<decltype(Lengths::PopFront()), Orders>{}(f, Array<index_t, 1>{i});
        }
    }
};
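Per the new comments, Orders is "the order of dimension in which ford will loop over each dimension", while the functor still receives the unordered multi-index. Below is a self-contained, runtime-only analogue of that contract in plain C++ with std::array; it illustrates the intent of the parameter and is not the library implementation.

#include <array>
#include <cstddef>
#include <cstdio>

// Iterate the dimensions in the order given by `orders`, but hand the callback
// the multi-index in the original dimension order.
template <std::size_t N, class F>
void ford_analogue(const std::array<int, N>& lengths, const std::array<int, N>& orders, F f,
                   std::array<int, N> id = {}, std::size_t depth = 0)
{
    if(depth == N)
    {
        f(id);
        return;
    }

    const int dim = orders[depth]; // dimension visited at this nesting level
    for(int i = 0; i < lengths[dim]; ++i)
    {
        id[dim] = i; // write back to the unordered position
        ford_analogue(lengths, orders, f, id, depth + 1);
    }
}

int main()
{
    // lengths = (2, 3), orders = (1, 0): dimension 1 is the outermost loop,
    // so the callback sees (0,0) (1,0) (0,1) (1,1) (0,2) (1,2).
    ford_analogue<2>({2, 3}, {1, 0}, [](const std::array<int, 2>& id) {
        std::printf("(%d, %d)\n", id[0], id[1]);
    });
}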