Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
edb1d2c3
Commit
edb1d2c3
authored
Dec 19, 2021
by
Chao Liu
Browse files
fix forward_sweep bugs in threadwise copy
parent
4c4b7cb0
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
17 additions
and
25 deletions
+17
-25
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp
...ude/tensor_operation/threadwise_tensor_slice_transfer.hpp
+7
-7
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v1r4.hpp
...ensor_operation/threadwise_tensor_slice_transfer_v1r4.hpp
+2
-2
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v1r5.hpp
...ensor_operation/threadwise_tensor_slice_transfer_v1r5.hpp
+2
-2
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r1.hpp
...ensor_operation/threadwise_tensor_slice_transfer_v3r1.hpp
+2
-5
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r3.hpp
...ensor_operation/threadwise_tensor_slice_transfer_v3r3.hpp
+2
-5
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r1.hpp
...ensor_operation/threadwise_tensor_slice_transfer_v6r1.hpp
+1
-2
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r3.hpp
...ensor_operation/threadwise_tensor_slice_transfer_v6r3.hpp
+1
-2
No files found.
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp
View file @
edb1d2c3
...
...
@@ -326,7 +326,7 @@ struct ThreadwiseTensorSliceTransfer_v1r3
static_for
<
1
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
index_t
tmp
=
ordered_access_lengths
[
I0
]
-
1
;
static_for
<
0
,
i
,
1
>
{}([
&
](
auto
j
)
{
static_for
<
1
,
i
,
1
>
{}([
&
](
auto
j
)
{
tmp
=
tmp
*
ordered_access_lengths
[
j
]
+
ordered_access_lengths
[
j
]
-
1
;
});
...
...
@@ -506,7 +506,7 @@ struct ThreadwiseTensorSliceTransfer_v2
static_for
<
1
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
index_t
tmp
=
ordered_access_idx
[
I0
];
static_for
<
0
,
i
,
1
>
{}([
&
](
auto
j
)
{
static_for
<
1
,
i
,
1
>
{}([
&
](
auto
j
)
{
tmp
=
tmp
*
ordered_access_lengths
[
j
]
+
ordered_access_idx
[
j
];
});
...
...
@@ -638,7 +638,7 @@ struct ThreadwiseTensorSliceTransfer_v2
static_for
<
1
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
index_t
tmp
=
ordered_access_lengths
[
I0
]
-
1
;
static_for
<
0
,
i
,
1
>
{}([
&
](
auto
j
)
{
static_for
<
1
,
i
,
1
>
{}([
&
](
auto
j
)
{
tmp
=
tmp
*
ordered_access_lengths
[
j
]
+
ordered_access_lengths
[
j
]
-
1
;
});
...
...
@@ -835,7 +835,7 @@ struct ThreadwiseTensorSliceTransfer_v3
static_for
<
1
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
index_t
tmp
=
ordered_src_access_idx
[
I0
];
static_for
<
0
,
i
,
1
>
{}([
&
](
auto
j
)
{
static_for
<
1
,
i
,
1
>
{}([
&
](
auto
j
)
{
tmp
=
tmp
*
ordered_src_access_lengths
[
j
]
+
ordered_src_access_idx
[
j
];
});
...
...
@@ -992,7 +992,7 @@ struct ThreadwiseTensorSliceTransfer_v3
static_for
<
1
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
index_t
tmp
=
ordered_dst_access_idx
[
I0
];
static_for
<
0
,
i
,
1
>
{}([
&
](
auto
j
)
{
static_for
<
1
,
i
,
1
>
{}([
&
](
auto
j
)
{
tmp
=
tmp
*
ordered_dst_access_lengths
[
j
]
+
ordered_dst_access_idx
[
j
];
});
...
...
@@ -1136,7 +1136,7 @@ struct ThreadwiseTensorSliceTransfer_v3
static_for
<
1
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
index_t
tmp
=
ordered_src_access_lengths
[
I0
]
-
1
;
static_for
<
0
,
i
,
1
>
{}([
&
](
auto
j
)
{
static_for
<
1
,
i
,
1
>
{}([
&
](
auto
j
)
{
tmp
=
tmp
*
ordered_src_access_lengths
[
j
]
+
ordered_src_access_lengths
[
j
]
-
1
;
});
...
...
@@ -1196,7 +1196,7 @@ struct ThreadwiseTensorSliceTransfer_v3
static_for
<
1
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
index_t
tmp
=
ordered_dst_access_lengths
[
I0
]
-
1
;
static_for
<
0
,
i
,
1
>
{}([
&
](
auto
j
)
{
static_for
<
1
,
i
,
1
>
{}([
&
](
auto
j
)
{
tmp
=
tmp
*
ordered_dst_access_lengths
[
j
]
+
ordered_dst_access_lengths
[
j
]
-
1
;
});
...
...
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v1r4.hpp
View file @
edb1d2c3
...
...
@@ -233,7 +233,7 @@ struct ThreadwiseTensorSliceTransfer_v1r4
static_for
<
1
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
index_t
tmp
=
ordered_access_idx
[
I0
];
static_for
<
0
,
i
,
1
>
{}([
&
](
auto
j
)
{
static_for
<
1
,
i
,
1
>
{}([
&
](
auto
j
)
{
tmp
=
tmp
*
ordered_access_lengths
[
j
]
+
ordered_access_idx
[
j
];
});
...
...
@@ -463,7 +463,7 @@ struct ThreadwiseTensorSliceTransfer_v1r4
static_for
<
1
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
index_t
tmp
=
ordered_access_lengths
[
I0
]
-
1
;
static_for
<
0
,
i
,
1
>
{}([
&
](
auto
j
)
{
static_for
<
1
,
i
,
1
>
{}([
&
](
auto
j
)
{
tmp
=
tmp
*
ordered_access_lengths
[
j
]
+
ordered_access_lengths
[
j
]
-
1
;
});
...
...
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v1r5.hpp
View file @
edb1d2c3
...
...
@@ -187,7 +187,7 @@ struct ThreadwiseTensorSliceTransfer_v1r5
static_for
<
1
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
index_t
tmp
=
ordered_access_idx
[
I0
];
static_for
<
0
,
i
,
1
>
{}([
&
](
auto
j
)
{
static_for
<
1
,
i
,
1
>
{}([
&
](
auto
j
)
{
tmp
=
tmp
*
ordered_access_lengths
[
j
]
+
ordered_access_idx
[
j
];
});
...
...
@@ -396,7 +396,7 @@ struct ThreadwiseTensorSliceTransfer_v1r5
static_for
<
1
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
index_t
tmp
=
ordered_access_lengths
[
I0
]
-
1
;
static_for
<
0
,
i
,
1
>
{}([
&
](
auto
j
)
{
static_for
<
1
,
i
,
1
>
{}([
&
](
auto
j
)
{
tmp
=
tmp
*
ordered_access_lengths
[
j
]
+
ordered_access_lengths
[
j
]
-
1
;
});
...
...
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r1.hpp
View file @
edb1d2c3
...
...
@@ -415,8 +415,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
static_for
<
1
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
index_t
tmp
=
ordered_dst_access_idx
[
I0
];
// TODO: BUG: should start at 1
static_for
<
0
,
i
,
1
>
{}([
&
](
auto
j
)
{
static_for
<
1
,
i
,
1
>
{}([
&
](
auto
j
)
{
tmp
=
tmp
*
ordered_dst_access_lengths
[
j
]
+
ordered_dst_access_idx
[
j
];
});
...
...
@@ -561,11 +560,10 @@ struct ThreadwiseTensorSliceTransfer_v3r1
forward_sweep_
(
I0
)
=
true
;
// TODO: BUG: should start at 1
static_for
<
1
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
index_t
tmp
=
ordered_src_access_lengths
[
I0
]
-
1
;
static_for
<
0
,
i
,
1
>
{}([
&
](
auto
j
)
{
static_for
<
1
,
i
,
1
>
{}([
&
](
auto
j
)
{
tmp
=
tmp
*
ordered_src_access_lengths
[
j
]
+
ordered_src_access_lengths
[
j
]
-
1
;
});
...
...
@@ -625,7 +623,6 @@ struct ThreadwiseTensorSliceTransfer_v3r1
static_for
<
1
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
index_t
tmp
=
ordered_dst_access_lengths
[
I0
]
-
1
;
// TODO: BUG: should start at 1
static_for
<
1
,
i
,
1
>
{}([
&
](
auto
j
)
{
tmp
=
tmp
*
ordered_dst_access_lengths
[
j
]
+
ordered_dst_access_lengths
[
j
]
-
1
;
});
...
...
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r3.hpp
View file @
edb1d2c3
...
...
@@ -180,7 +180,6 @@ struct ThreadwiseTensorSliceTransfer_v3r3
static_for
<
1
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
index_t
tmp
=
ordered_src_access_idx
[
I0
];
// TODO: BUG: should start at 1
static_for
<
1
,
i
,
1
>
{}([
&
](
auto
j
)
{
tmp
=
tmp
*
ordered_src_access_lengths
[
j
]
+
ordered_src_access_idx
[
j
];
});
...
...
@@ -494,8 +493,7 @@ struct ThreadwiseTensorSliceTransfer_v3r3
static_for
<
1
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
index_t
tmp
=
ordered_dst_access_idx
[
I0
];
// TODO: BUG: should start at 1
static_for
<
0
,
i
,
1
>
{}([
&
](
auto
j
)
{
static_for
<
1
,
i
,
1
>
{}([
&
](
auto
j
)
{
tmp
=
tmp
*
ordered_dst_access_lengths
[
j
]
+
ordered_dst_access_idx
[
j
];
});
...
...
@@ -615,7 +613,7 @@ struct ThreadwiseTensorSliceTransfer_v3r3
static_for
<
1
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
index_t
tmp
=
ordered_src_access_lengths
[
I0
]
-
1
;
static_for
<
0
,
i
,
1
>
{}([
&
](
auto
j
)
{
static_for
<
1
,
i
,
1
>
{}([
&
](
auto
j
)
{
tmp
=
tmp
*
ordered_src_access_lengths
[
j
]
+
ordered_src_access_lengths
[
j
]
-
1
;
});
...
...
@@ -675,7 +673,6 @@ struct ThreadwiseTensorSliceTransfer_v3r3
static_for
<
1
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
index_t
tmp
=
ordered_dst_access_lengths
[
I0
]
-
1
;
// TODO: BUG: should start at 1
static_for
<
1
,
i
,
1
>
{}([
&
](
auto
j
)
{
tmp
=
tmp
*
ordered_dst_access_lengths
[
j
]
+
ordered_dst_access_lengths
[
j
]
-
1
;
});
...
...
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r1.hpp
View file @
edb1d2c3
...
...
@@ -259,11 +259,10 @@ struct ThreadwiseTensorSliceTransfer_v6r1
forward_sweep_
(
I0
)
=
true
;
// TODO: BUG: should start at 1
static_for
<
1
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
index_t
tmp
=
ordered_access_lengths
[
I0
]
-
1
;
static_for
<
0
,
i
,
1
>
{}([
&
](
auto
j
)
{
static_for
<
1
,
i
,
1
>
{}([
&
](
auto
j
)
{
tmp
=
tmp
*
ordered_access_lengths
[
j
]
+
ordered_access_lengths
[
j
]
-
1
;
});
...
...
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r3.hpp
View file @
edb1d2c3
...
...
@@ -344,11 +344,10 @@ struct ThreadwiseTensorSliceTransfer_v6r3
forward_sweep_
(
I0
)
=
true
;
// TODO: BUG: should start at 1
static_for
<
1
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
index_t
tmp
=
ordered_access_lengths
[
I0
]
-
1
;
static_for
<
0
,
i
,
1
>
{}([
&
](
auto
j
)
{
static_for
<
1
,
i
,
1
>
{}([
&
](
auto
j
)
{
tmp
=
tmp
*
ordered_access_lengths
[
j
]
+
ordered_access_lengths
[
j
]
-
1
;
});
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment