Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
08848bb6
"tests/pipelines/vscode:/vscode.git/clone" did not exist on "f45c675d2cf1e24d8bee024658f112d4c86aa784"
Commit 08848bb6 authored Oct 04, 2022 by Astha Rai
Browse files
fixed 2d thread indexing
parent
5f01c06f
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing 1 changed file with 29 additions and 13 deletions
+29
-13
include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
.../ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
+29
-13
No files found.
include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
View file @
08848bb6
...
@@ -106,15 +106,8 @@ struct GridwiseElementwise_2D
...
@@ -106,15 +106,8 @@ struct GridwiseElementwise_2D
const
index_t
blockPerGrid_n
=
gridDim
.
y
;
const
index_t
blockPerGrid_n
=
gridDim
.
y
;
const
index_t
block_1d
=
get_block_1d_id
();
const
index_t
block_1d
=
get_block_1d_id
();
const
auto
M
=
in_grid_2d_desc_tuple
[
I0
].
GetLength
(
I0
);
const
auto
M
=
in_grid_2d_desc_tuple
[
I0
].
GetLength
(
I0
);
const
auto
N
=
in_grid_2d_desc_tuple
[
I1
].
GetLength
(
I1
);
const
auto
N
=
in_grid_2d_desc_tuple
[
I0
].
GetLength
(
I1
);
const
auto
M0
=
math
::
integer_divide_ceil
(
M
,
MPerBlock
);
//define MPerBlock and NPerBlock
const
auto
N0
=
math
::
integer_divide_ceil
(
N
,
NPerBlock
);
block_1d
=
block_1d
%
(
M0
*
N0
);
// swallow batch index
index_t
idx_N0
=
block_1d
%
N0
;
index_t
idx_M0
=
block_1d
/
N0
;
const
index_t
loop_step_m
=
blockPerGrid_m
*
blockSize
*
MPerThread
;
const
index_t
loop_step_m
=
blockPerGrid_m
*
blockSize
*
MPerThread
;
const
index_t
loop_step_n
=
blockPerGrid_n
*
blockSize
*
NPerThread
;
const
index_t
loop_step_n
=
blockPerGrid_n
*
blockSize
*
NPerThread
;
...
@@ -122,10 +115,33 @@ struct GridwiseElementwise_2D
...
@@ -122,10 +115,33 @@ struct GridwiseElementwise_2D
// const auto thread_global_id_2d =
// const auto thread_global_id_2d =
// thread_buffer_desc_mn.CalculateBottomIndex(make_multi_index(block_1d));
// thread_buffer_desc_mn.CalculateBottomIndex(make_multi_index(block_1d));
const
auto
blockId_m
=
thread_global_id_2d
[
I0
];
const
auto
blockId_n
=
thread_global_id_2d
[
I1
];
auto
thread_1d_id
=
get_thread_local_1d_id
();
const
auto
thread_global_offset
=
index_t
M01_
=
8
;
make_multi_index
(
thread_global_id_2d
*
MPerThread
,
thread_global_id_2d
*
NPerThread
);
const
auto
M0
=
math
::
integer_divide_ceil
(
M
,
MPerThread
);
const
auto
N0
=
math
::
integer_divide_ceil
(
N
,
NPerThread
);
thread_1d_id
=
thread_1d_id
%
(
M0
*
N0
);
// swallow batch index
index_t
idx_N0
=
thread_1d_id
%
N0
;
index_t
idx_M0
=
thread_1d_id
/
N0
;
// const auto M01_adapt = (idx_M0 < M0 - M0 % M01_) ? M01_ : M0 % M01_;
// index_t idx_M00 = idx_M0 / M01_;
// index_t idx_M01 = idx_M0 % M01_;
// index_t idx_N0_M01_local = idx_N0 + idx_M01 * N0;
// const auto thread_global_id_2d =make_tuple(idx_N0_M01_local % M01_adapt + idx_M00 * M01_,
// idx_N0_M01_local /
//M01_adapt);
index_t
tid_m
=
get_thread_global_1d_id
();
index_t
tid_n
=
blockDim
.
y
*
blockIdx
.
y
+
threadIdx
.
y
;
const
auto
thread_global_offset
=
make_multi_index
(
tid_m
*
MPerThread
,
tid_n
*
NPerThread
);
// make_multi_index(thread_global_id_2d[I0] * MPerThread, thread_global_id_2d[I1] *
// NPerThread);
auto
in_global_load_tuple
=
generate_tuple
(
auto
in_global_load_tuple
=
generate_tuple
(
[
&
](
auto
I
)
{
[
&
](
auto
I
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again or attach a new file.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment