Commit edb1d2c3 authored by Chao Liu's avatar Chao Liu
Browse files

fix forward_sweep bugs in threadwise copy

parent 4c4b7cb0
...@@ -326,7 +326,7 @@ struct ThreadwiseTensorSliceTransfer_v1r3 ...@@ -326,7 +326,7 @@ struct ThreadwiseTensorSliceTransfer_v1r3
static_for<1, nDim, 1>{}([&](auto i) { static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_access_lengths[I0] - 1; index_t tmp = ordered_access_lengths[I0] - 1;
static_for<0, i, 1>{}([&](auto j) { static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1; tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1;
}); });
...@@ -506,7 +506,7 @@ struct ThreadwiseTensorSliceTransfer_v2 ...@@ -506,7 +506,7 @@ struct ThreadwiseTensorSliceTransfer_v2
static_for<1, nDim, 1>{}([&](auto i) { static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_access_idx[I0]; index_t tmp = ordered_access_idx[I0];
static_for<0, i, 1>{}([&](auto j) { static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j]; tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
}); });
...@@ -638,7 +638,7 @@ struct ThreadwiseTensorSliceTransfer_v2 ...@@ -638,7 +638,7 @@ struct ThreadwiseTensorSliceTransfer_v2
static_for<1, nDim, 1>{}([&](auto i) { static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_access_lengths[I0] - 1; index_t tmp = ordered_access_lengths[I0] - 1;
static_for<0, i, 1>{}([&](auto j) { static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1; tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1;
}); });
...@@ -835,7 +835,7 @@ struct ThreadwiseTensorSliceTransfer_v3 ...@@ -835,7 +835,7 @@ struct ThreadwiseTensorSliceTransfer_v3
static_for<1, nDim, 1>{}([&](auto i) { static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_src_access_idx[I0]; index_t tmp = ordered_src_access_idx[I0];
static_for<0, i, 1>{}([&](auto j) { static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j]; tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j];
}); });
...@@ -992,7 +992,7 @@ struct ThreadwiseTensorSliceTransfer_v3 ...@@ -992,7 +992,7 @@ struct ThreadwiseTensorSliceTransfer_v3
static_for<1, nDim, 1>{}([&](auto i) { static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_dst_access_idx[I0]; index_t tmp = ordered_dst_access_idx[I0];
static_for<0, i, 1>{}([&](auto j) { static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j]; tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j];
}); });
...@@ -1136,7 +1136,7 @@ struct ThreadwiseTensorSliceTransfer_v3 ...@@ -1136,7 +1136,7 @@ struct ThreadwiseTensorSliceTransfer_v3
static_for<1, nDim, 1>{}([&](auto i) { static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_src_access_lengths[I0] - 1; index_t tmp = ordered_src_access_lengths[I0] - 1;
static_for<0, i, 1>{}([&](auto j) { static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1; tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
}); });
...@@ -1196,7 +1196,7 @@ struct ThreadwiseTensorSliceTransfer_v3 ...@@ -1196,7 +1196,7 @@ struct ThreadwiseTensorSliceTransfer_v3
static_for<1, nDim, 1>{}([&](auto i) { static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_dst_access_lengths[I0] - 1; index_t tmp = ordered_dst_access_lengths[I0] - 1;
static_for<0, i, 1>{}([&](auto j) { static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1; tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
}); });
......
...@@ -233,7 +233,7 @@ struct ThreadwiseTensorSliceTransfer_v1r4 ...@@ -233,7 +233,7 @@ struct ThreadwiseTensorSliceTransfer_v1r4
static_for<1, nDim, 1>{}([&](auto i) { static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_access_idx[I0]; index_t tmp = ordered_access_idx[I0];
static_for<0, i, 1>{}([&](auto j) { static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j]; tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
}); });
...@@ -463,7 +463,7 @@ struct ThreadwiseTensorSliceTransfer_v1r4 ...@@ -463,7 +463,7 @@ struct ThreadwiseTensorSliceTransfer_v1r4
static_for<1, nDim, 1>{}([&](auto i) { static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_access_lengths[I0] - 1; index_t tmp = ordered_access_lengths[I0] - 1;
static_for<0, i, 1>{}([&](auto j) { static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1; tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1;
}); });
......
...@@ -187,7 +187,7 @@ struct ThreadwiseTensorSliceTransfer_v1r5 ...@@ -187,7 +187,7 @@ struct ThreadwiseTensorSliceTransfer_v1r5
static_for<1, nDim, 1>{}([&](auto i) { static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_access_idx[I0]; index_t tmp = ordered_access_idx[I0];
static_for<0, i, 1>{}([&](auto j) { static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j]; tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
}); });
...@@ -396,7 +396,7 @@ struct ThreadwiseTensorSliceTransfer_v1r5 ...@@ -396,7 +396,7 @@ struct ThreadwiseTensorSliceTransfer_v1r5
static_for<1, nDim, 1>{}([&](auto i) { static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_access_lengths[I0] - 1; index_t tmp = ordered_access_lengths[I0] - 1;
static_for<0, i, 1>{}([&](auto j) { static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1; tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1;
}); });
......
...@@ -415,8 +415,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1 ...@@ -415,8 +415,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
static_for<1, nDim, 1>{}([&](auto i) { static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_dst_access_idx[I0]; index_t tmp = ordered_dst_access_idx[I0];
// TODO: BUG: should start at 1 static_for<1, i, 1>{}([&](auto j) {
static_for<0, i, 1>{}([&](auto j) {
tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j]; tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j];
}); });
...@@ -561,11 +560,10 @@ struct ThreadwiseTensorSliceTransfer_v3r1 ...@@ -561,11 +560,10 @@ struct ThreadwiseTensorSliceTransfer_v3r1
forward_sweep_(I0) = true; forward_sweep_(I0) = true;
// TODO: BUG: should start at 1
static_for<1, nDim, 1>{}([&](auto i) { static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_src_access_lengths[I0] - 1; index_t tmp = ordered_src_access_lengths[I0] - 1;
static_for<0, i, 1>{}([&](auto j) { static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1; tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
}); });
...@@ -625,7 +623,6 @@ struct ThreadwiseTensorSliceTransfer_v3r1 ...@@ -625,7 +623,6 @@ struct ThreadwiseTensorSliceTransfer_v3r1
static_for<1, nDim, 1>{}([&](auto i) { static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_dst_access_lengths[I0] - 1; index_t tmp = ordered_dst_access_lengths[I0] - 1;
// TODO: BUG: should start at 1
static_for<1, i, 1>{}([&](auto j) { static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1; tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
}); });
......
...@@ -180,7 +180,6 @@ struct ThreadwiseTensorSliceTransfer_v3r3 ...@@ -180,7 +180,6 @@ struct ThreadwiseTensorSliceTransfer_v3r3
static_for<1, nDim, 1>{}([&](auto i) { static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_src_access_idx[I0]; index_t tmp = ordered_src_access_idx[I0];
// TODO: BUG: should start at 1
static_for<1, i, 1>{}([&](auto j) { static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j]; tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j];
}); });
...@@ -494,8 +493,7 @@ struct ThreadwiseTensorSliceTransfer_v3r3 ...@@ -494,8 +493,7 @@ struct ThreadwiseTensorSliceTransfer_v3r3
static_for<1, nDim, 1>{}([&](auto i) { static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_dst_access_idx[I0]; index_t tmp = ordered_dst_access_idx[I0];
// TODO: BUG: should start at 1 static_for<1, i, 1>{}([&](auto j) {
static_for<0, i, 1>{}([&](auto j) {
tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j]; tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j];
}); });
...@@ -615,7 +613,7 @@ struct ThreadwiseTensorSliceTransfer_v3r3 ...@@ -615,7 +613,7 @@ struct ThreadwiseTensorSliceTransfer_v3r3
static_for<1, nDim, 1>{}([&](auto i) { static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_src_access_lengths[I0] - 1; index_t tmp = ordered_src_access_lengths[I0] - 1;
static_for<0, i, 1>{}([&](auto j) { static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1; tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
}); });
...@@ -675,7 +673,6 @@ struct ThreadwiseTensorSliceTransfer_v3r3 ...@@ -675,7 +673,6 @@ struct ThreadwiseTensorSliceTransfer_v3r3
static_for<1, nDim, 1>{}([&](auto i) { static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_dst_access_lengths[I0] - 1; index_t tmp = ordered_dst_access_lengths[I0] - 1;
// TODO: BUG: should start at 1
static_for<1, i, 1>{}([&](auto j) { static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1; tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
}); });
......
...@@ -259,11 +259,10 @@ struct ThreadwiseTensorSliceTransfer_v6r1 ...@@ -259,11 +259,10 @@ struct ThreadwiseTensorSliceTransfer_v6r1
forward_sweep_(I0) = true; forward_sweep_(I0) = true;
// TODO: BUG: should start at 1
static_for<1, nDim, 1>{}([&](auto i) { static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_access_lengths[I0] - 1; index_t tmp = ordered_access_lengths[I0] - 1;
static_for<0, i, 1>{}([&](auto j) { static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1; tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1;
}); });
......
...@@ -344,11 +344,10 @@ struct ThreadwiseTensorSliceTransfer_v6r3 ...@@ -344,11 +344,10 @@ struct ThreadwiseTensorSliceTransfer_v6r3
forward_sweep_(I0) = true; forward_sweep_(I0) = true;
// TODO: BUG: should start at 1
static_for<1, nDim, 1>{}([&](auto i) { static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_access_lengths[I0] - 1; index_t tmp = ordered_access_lengths[I0] - 1;
static_for<0, i, 1>{}([&](auto j) { static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1; tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1;
}); });
......
Markdown is supported
Attach a file by drag & drop or click to upload.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment