gaoqiong / composable_kernel

Commit 10947a54, authored Nov 01, 2022 by Astha Rai
parent 0bff049a

added variables to distribute threads through both dimensions
Showing 2 changed files with 15 additions and 26 deletions:

include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp  +6  -21
include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp   +9  -5
include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp
@@ -115,7 +115,7 @@ struct DeviceElementwise : public DeviceElementwiseBase<InDataTypeTuple,
             desc,
             make_tuple(make_merge_transform(mLengths), make_merge_transform(nLengths)),
             make_tuple(mDimIds, nDimIds),
-            make_tuple(Sequence<1>{}, Sequence<0>{}));
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
         return PadDescriptor_MN_2d(desc_mn, gridSize, blockSize);
     }
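The swapped Sequence arguments appear to change which merged group becomes which dimension of the 2D descriptor: with make_tuple(Sequence<0>{}, Sequence<1>{}), the merge of mLengths maps to dimension 0 (M) and the merge of nLengths to dimension 1 (N), rather than the reverse. A minimal standalone sketch of the merged lengths this implies (hypothetical component lengths, not CK's actual descriptor machinery):

#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

int main()
{
    // Hypothetical component lengths for the M and N dimension groups.
    std::vector<int> mLengths{4, 8};
    std::vector<int> nLengths{16, 2};

    // A merge transform collapses a group into a single length equal to the
    // product of its component lengths.
    const int M = std::accumulate(mLengths.begin(), mLengths.end(), 1, std::multiplies<int>());
    const int N = std::accumulate(nLengths.begin(), nLengths.end(), 1, std::multiplies<int>());

    // After this change the merged M group is dimension 0 and the merged N
    // group is dimension 1, i.e. the padded 2D descriptor is (M, N).
    std::printf("dim0 = M = %d, dim1 = N = %d\n", M, N);
    return 0;
}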
@@ -150,6 +150,8 @@ struct DeviceElementwise : public DeviceElementwiseBase<InDataTypeTuple,
         ElementwiseOperation,
         MPerThread,
         NPerThread,
+        //num_threads_m,
+        //num_threads_n,
         InScalarPerVectorSeq,
         OutScalarPerVectorSeq>;
@@ -199,6 +201,9 @@ struct DeviceElementwise : public DeviceElementwiseBase<InDataTypeTuple,
                     lengths, outStridesArray[I.value], gridSize_, blockSize_);
                 },
                 Number<NumOutput>{});
+            //num_threads_m = 1;
+            //num_threads_n = gridSize_ * blockSize_;
         }

         InDataTypePointerTuple in_dev_buffers_;
@@ -264,14 +269,6 @@ struct DeviceElementwise : public DeviceElementwiseBase<InDataTypeTuple,
                const std::array<index_t, NumDim>& strides,
                index_t scalarPerVector,
                index_t vectorDim) {
-                // std::cout << "scalarPerVector: " << scalarPerVector << std::endl;
-                // std::cout << "stride back: " << strides.back() << std::endl;
-                // std::cout << "len back: " << lengths.back() << std::endl;
-                // std::cout << "NumDim-1: " << NumDim - 1 << std::endl;
-                // std::cout << "stride[nd-1]: " << strides[NumDim - 1] << std::endl;
-                // std::cout << "NumDim_m-1: " << NumDim_m - 1 << std::endl;
-                // std::cout << std::endl;
-                // std::cout << "ISPVV Check 1 starting" << std::endl;
                if(strides[vectorDim] == 1 &&
                   (lengths[vectorDim] % scalarPerVector == 0 ||
                    lengths[vectorDim] % scalarPerVector == lengths[vectorDim]))
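The surviving condition is the per-dimension vectorized-access check: the vector dimension must be contiguous (stride == 1) and its length must either be divisible by scalarPerVector or be smaller than it (in which case length % scalarPerVector == length). A standalone restatement of that predicate, for illustration only, with index_t replaced by int and made-up shapes:

#include <array>
#include <cstdio>

// Restatement of the check above; names and shapes are hypothetical.
template <std::size_t NumDim>
bool is_scalar_per_vector_valid(const std::array<int, NumDim>& lengths,
                                const std::array<int, NumDim>& strides,
                                int scalarPerVector,
                                int vectorDim)
{
    return strides[vectorDim] == 1 &&
           (lengths[vectorDim] % scalarPerVector == 0 ||
            lengths[vectorDim] % scalarPerVector == lengths[vectorDim]);
}

int main()
{
    std::array<int, 2> lengths{1024, 8};
    std::array<int, 2> strides{8, 1};
    std::printf("%d\n", is_scalar_per_vector_valid(lengths, strides, 4, 1));  // 1: 8 % 4 == 0
    std::printf("%d\n", is_scalar_per_vector_valid(lengths, strides, 16, 1)); // 1: 8 < 16, remainder equals the length
    std::printf("%d\n", is_scalar_per_vector_valid(lengths, strides, 4, 0));  // 0: dimension 0 is not contiguous
    return 0;
}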
@@ -293,18 +290,6 @@ struct DeviceElementwise : public DeviceElementwiseBase<InDataTypeTuple,
                return false;
            };

-        /**auto IsOutScalarPerVectorValid =
-            [&](const std::array<index_t, NumDim>& lengths,
-                const std::array<index_t, NumDim>& strides,
-                index_t scalarPerVector) {
-                std::cout << "ISPVV Check 1 starting" << std::endl;
-                if(strides.back() != 1 && lengths.back() % scalarPerVector == strides[NumDim - 1])
-                {
-                    std::cout << "Check 1 passed " << std::endl;
-                    return true;
-                }
-                std::cout << "Check 1 failed" << std::endl;
-            };**/
        bool valid = true;
        static_for<0, NumInput, 1>{}([&](auto I) {
include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
@@ -104,18 +104,22 @@ struct GridwiseElementwise_2D
     const index_t blockSize      = get_block_size();
     const index_t blockPerGrid   = get_grid_size();
     const index_t totalNumThread = blockSize * blockPerGrid;
+    const index_t num_threads_m  = 4;
+    const index_t num_threads_n  = totalNumThread / 4;
+    //static_assert(num_threads_m * num_threads_n == totalNumThread, "error: threads per dimension not equal to total threads");

     const auto M = in_grid_2d_desc_tuple[I0].GetLength(I0);
     const auto N = in_grid_2d_desc_tuple[I0].GetLength(I1);

-    const index_t loop_step_m = MPerThread;
-    const index_t loop_step_n = totalNumThread * NPerThread;
+    const index_t loop_step_m = num_threads_m * MPerThread;
+    const index_t loop_step_n = num_threads_n * NPerThread;
     const index_t thread_1d_id = get_thread_global_1d_id();
-    const auto thread_global_offset = make_multi_index(0, thread_1d_id * NPerThread);
+    // index_t tid_m = thread_1d_id / (N / NPerThread);
+    index_t tid_m = thread_1d_id / num_threads_n;
+    // index_t tid_n = thread_1d_id % (N / NPerThread);
+    index_t tid_n = thread_1d_id % num_threads_n;
+    const auto thread_global_offset = make_multi_index(tid_m * MPerThread, tid_n * NPerThread);

     auto in_global_load_tuple = generate_tuple(
         [&](auto I) {
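This hunk is the core of the commit: instead of treating the whole thread pool as one row of N-tile workers, the flat thread id is split into a (tid_m, tid_n) coordinate so threads cover both dimensions, and the loop steps shrink to the per-dimension thread counts. A minimal host-side sketch of that index math, using assumed launch numbers rather than the real get_block_size()/get_grid_size() values:

#include <cstdio>

int main()
{
    // Assumed launch configuration (hypothetical values for illustration).
    const int blockSize      = 64;
    const int blockPerGrid   = 2;
    const int totalNumThread = blockSize * blockPerGrid;

    // Split the thread pool across both dimensions, as in the commit:
    // 4 threads along M, the remainder along N.
    const int num_threads_m = 4;
    const int num_threads_n = totalNumThread / 4;

    // Assumed per-thread tile sizes.
    const int MPerThread = 2;
    const int NPerThread = 8;

    // Each outer loop iteration advances by the combined footprint of all
    // threads in that dimension, mirroring loop_step_m / loop_step_n above.
    const int loop_step_m = num_threads_m * MPerThread;
    const int loop_step_n = num_threads_n * NPerThread;

    // Map a flat thread id onto a 2D coordinate and print the starting
    // offset each thread would pass to make_multi_index(...).
    for(int thread_1d_id = 0; thread_1d_id < 8; ++thread_1d_id)
    {
        const int tid_m = thread_1d_id / num_threads_n;
        const int tid_n = thread_1d_id % num_threads_n;
        std::printf("thread %d -> start (m, n) = (%d, %d), loop steps = (%d, %d)\n",
                    thread_1d_id, tid_m * MPerThread, tid_n * NPerThread,
                    loop_step_m, loop_step_n);
    }
    return 0;
}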