Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
033ea806
Commit
033ea806
authored
Apr 12, 2024
by
root
Browse files
add reduce_multi_d
parent
22ee67a9
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
77 additions
and
109 deletions
+77
-109
example/12_reduce/CMakeLists.txt
example/12_reduce/CMakeLists.txt
+1
-1
example/12_reduce/reduce_threadwise_multi_d.cpp
example/12_reduce/reduce_threadwise_multi_d.cpp
+50
-93
example/12_reduce/reduce_threadwise_multi_d_impl.hpp
example/12_reduce/reduce_threadwise_multi_d_impl.hpp
+26
-15
No files found.
example/12_reduce/CMakeLists.txt
View file @
033ea806
add_example_executable
(
example_reduce_blockwise reduce_blockwise.cpp
)
add_example_executable
(
example_reduce_blockwise reduce_blockwise.cpp
)
add_example_executable
(
example_reduce_threadwise reduce_threadwise.cpp
)
add_example_executable
(
example_reduce_threadwise
_multi_d
reduce_threadwise
_multi_d
.cpp
)
add_example_executable
(
example_reduce_multiblock_atomic_add reduce_multiblock_atomic_add.cpp
)
add_example_executable
(
example_reduce_multiblock_atomic_add reduce_multiblock_atomic_add.cpp
)
add_example_executable
(
example_reduce_blockwise_two_call reduce_blockwise_two_call.cpp
)
add_example_executable
(
example_reduce_blockwise_two_call reduce_blockwise_two_call.cpp
)
example/12_reduce/reduce_threadwise.cpp
→
example/12_reduce/reduce_threadwise
_multi_d
.cpp
View file @
033ea806
...
@@ -7,7 +7,7 @@
...
@@ -7,7 +7,7 @@
#include <getopt.h>
#include <getopt.h>
#include "ck/utility/reduction_enums.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "reduce_threadwise_impl.hpp"
#include "reduce_threadwise_
multi_d_
impl.hpp"
#include "reduce_example_common.hpp"
#include "reduce_example_common.hpp"
using
namespace
ck
;
using
namespace
ck
;
...
@@ -25,7 +25,7 @@ class SimpleAppArgs
...
@@ -25,7 +25,7 @@ class SimpleAppArgs
public:
public:
std
::
vector
<
size_t
>
inLengths
=
{
16
,
64
,
32
,
16
};
std
::
vector
<
size_t
>
inLengths
=
{
16
,
64
,
32
,
16
};
std
::
vector
<
int
>
reduceDims
=
{
0
,
1
,
2
};
std
::
vector
<
int
>
reduceDims
=
{
0
};
std
::
vector
<
float
>
scales
=
{
1.0
f
,
0.0
f
};
std
::
vector
<
float
>
scales
=
{
1.0
f
,
0.0
f
};
bool
do_verification
=
true
;
bool
do_verification
=
true
;
...
@@ -118,7 +118,7 @@ template <typename InOutDataType,
...
@@ -118,7 +118,7 @@ template <typename InOutDataType,
ReduceTensorOp
ReduceOpId
,
ReduceTensorOp
ReduceOpId
,
index_t
PropagateNan
,
index_t
PropagateNan
,
index_t
OutputIndex
>
index_t
OutputIndex
>
bool
reduce_threadwise_test
(
bool
do_verification
,
bool
reduce_threadwise_
multi_d_
test
(
bool
do_verification
,
int
init_method
,
int
init_method
,
bool
time_kernel
,
bool
time_kernel
,
const
std
::
vector
<
size_t
>&
inLengths
,
const
std
::
vector
<
size_t
>&
inLengths
,
...
@@ -144,7 +144,7 @@ bool reduce_threadwise_test(bool do_verification,
...
@@ -144,7 +144,7 @@ bool reduce_threadwise_test(bool do_verification,
ck
::
ranges
::
copy
(
reduceDims
,
arrReduceDims
.
begin
());
ck
::
ranges
::
copy
(
reduceDims
,
arrReduceDims
.
begin
());
result
=
reduce_threadwise_impl
<
InOutDataType
,
result
=
reduce_threadwise_
multi_d_
impl
<
InOutDataType
,
AccDataType
,
AccDataType
,
ReduceOpId
,
ReduceOpId
,
ShapeType
::
Rank_
,
ShapeType
::
Rank_
,
...
@@ -176,8 +176,11 @@ int main(int argc, char* argv[])
...
@@ -176,8 +176,11 @@ int main(int argc, char* argv[])
if
(
arg
.
data_type
==
0
)
if
(
arg
.
data_type
==
0
)
{
{
pass
=
reduce_threadwise_test
<
ck
::
half_t
,
float
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
pass
=
reduce_threadwise_multi_d_test
<
ck
::
half_t
,
arg
.
do_verification
,
float
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
arg
.
do_verification
,
arg
.
init_method
,
arg
.
init_method
,
arg
.
time_kernel
,
arg
.
time_kernel
,
arg
.
inLengths
,
arg
.
inLengths
,
...
@@ -187,41 +190,8 @@ int main(int argc, char* argv[])
...
@@ -187,41 +190,8 @@ int main(int argc, char* argv[])
}
}
else
if
(
arg
.
data_type
==
1
)
else
if
(
arg
.
data_type
==
1
)
{
{
pass
=
reduce_threadwise_test
<
float
,
float
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
pass
=
arg
.
do_verification
,
reduce_threadwise_multi_d_test
<
float
,
float
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
arg
.
init_method
,
arg
.
time_kernel
,
arg
.
inLengths
,
arg
.
reduceDims
,
arg
.
scales
[
0
],
arg
.
scales
[
1
]);
}
#if 0
else if(arg.data_type == 3)
{
pass = reduce_threadwise_test<int8_t, float, ReduceOpId, PropagateNan, OutputIndex>(
arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inLengths,
arg.reduceDims,
arg.scales[0],
arg.scales[1]);
}
else if(arg.data_type == 5)
{
pass = reduce_threadwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>(
arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inLengths,
arg.reduceDims,
arg.scales[0],
arg.scales[1]);
}
else if(arg.data_type == 6)
{
pass = reduce_threadwise_test<double, double, ReduceOpId, PropagateNan, OutputIndex>(
arg
.
do_verification
,
arg
.
do_verification
,
arg
.
init_method
,
arg
.
init_method
,
arg
.
time_kernel
,
arg
.
time_kernel
,
...
@@ -230,42 +200,29 @@ int main(int argc, char* argv[])
...
@@ -230,42 +200,29 @@ int main(int argc, char* argv[])
arg
.
scales
[
0
],
arg
.
scales
[
0
],
arg
.
scales
[
1
]);
arg
.
scales
[
1
]);
}
}
#endif
}
}
else
else
{
{
// for testing half_t
// for testing half_t
pass
=
pass
&&
pass
=
pass
&&
reduce_threadwise_multi_d_test
<
ck
::
half_t
,
reduce_threadwise_test
<
ck
::
half_t
,
float
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
float
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
true
,
2
,
true
,
{
16
,
64
,
32
,
960
},
{
0
},
1.0
f
,
0.0
f
);
true
,
2
,
true
,
{
16
,
64
,
32
,
960
},
{
0
},
1.0
f
,
0.0
f
);
// for testing float
// for testing float
pass
=
pass
&&
reduce_threadwise_test
<
float
,
float
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
true
,
2
,
true
,
{
16
,
64
,
32
,
960
},
{
0
},
1.0
f
,
0.0
f
);
// for testing double
pass
=
pass
&&
reduce_threadwise_test
<
float
,
float
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
true
,
2
,
true
,
{
16
,
64
,
32
,
960
},
{
0
},
1.0
f
,
0.0
f
);
// for testing bhalf_t
pass
=
pass
&&
pass
=
pass
&&
reduce_threadwise_
test
<
ck
::
bhalf_
t
,
float
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
reduce_threadwise_
multi_d_test
<
floa
t
,
float
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
true
,
2
,
true
,
{
16
,
64
,
32
,
960
},
{
0
},
1.0
f
,
0.0
f
);
true
,
2
,
true
,
{
16
,
64
,
32
,
960
},
{
0
},
1.0
f
,
0.0
f
);
#if 0
// for testing bhalf_t
// for testing int8_t
pass
=
pass
&&
reduce_threadwise_multi_d_test
<
ck
::
bhalf_t
,
pass =
float
,
pass && reduce_threadwise_test<int8_t, int32_t, ReduceOpId, PropagateNan, OutputIndex>(
ReduceOpId
,
PropagateNan
,
OutputIndex
>
(
true
,
2
,
true
,
{
16
,
64
,
32
,
960
},
{
0
},
1.0
f
,
0.0
f
);
true
,
2
,
true
,
{
16
,
64
,
32
,
960
},
{
0
},
1.0
f
,
0.0
f
);
// for testing 3D input
pass = pass && reduce_threadwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
true, 2, true, {16, 64, 960}, {0}, 1.0f, 0.0f);
// for testing 5D input
pass = pass && reduce_threadwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
true, 2, true, {16, 64, 32, 2, 960}, {0}, 1.0f, 0.0f);
#endif
}
}
return
(
pass
?
0
:
1
);
return
(
pass
?
0
:
1
);
...
...
example/12_reduce/reduce_threadwise_impl.hpp
→
example/12_reduce/reduce_threadwise_
multi_d_
impl.hpp
View file @
033ea806
...
@@ -8,7 +8,6 @@
...
@@ -8,7 +8,6 @@
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
//#include "ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_threadwise_multi_d.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_threadwise_multi_d.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp"
...
@@ -28,7 +27,7 @@ template <typename InOutDataType,
...
@@ -28,7 +27,7 @@ template <typename InOutDataType,
ck
::
index_t
NumReduceDim
,
ck
::
index_t
NumReduceDim
,
bool
PropagateNan
,
bool
PropagateNan
,
bool
OutputIndex
>
bool
OutputIndex
>
int
reduce_threadwise_impl
(
bool
do_verification
,
int
reduce_threadwise_
multi_d_
impl
(
bool
do_verification
,
int
init_method
,
int
init_method
,
bool
time_kernel
,
bool
time_kernel
,
const
std
::
vector
<
size_t
>&
inLengths
,
const
std
::
vector
<
size_t
>&
inLengths
,
...
@@ -90,17 +89,17 @@ int reduce_threadwise_impl(bool do_verification,
...
@@ -90,17 +89,17 @@ int reduce_threadwise_impl(bool do_verification,
};
};
using
PassThrough
=
tensor_operation
::
element_wise
::
PassThrough
;
using
PassThrough
=
tensor_operation
::
element_wise
::
PassThrough
;
//
using Add = tensor_operation::element_wise::Add;
using
Add
=
tensor_operation
::
element_wise
::
Add
;
using
ReduceOperation
=
typename
reduce_binary_operator
<
ReduceOpId
>::
opType
;
using
ReduceOperation
=
typename
reduce_binary_operator
<
ReduceOpId
>::
opType
;
using
InElementwiseOperation
=
PassThrough
;
using
InElementwiseOperation
=
PassThrough
;
using
OutElementwiseOperation
=
PassThrough
;
using
OutElementwiseOperation
=
Add
;
using
InOutDataTypeInDevice
=
InOutDataType
;
using
InOutDataTypeInDevice
=
InOutDataType
;
using
DeviceReduceInstance
=
using
DeviceReduceInstance
=
ck
::
tensor_operation
::
device
::
DeviceReduceThreadWiseMultiD
<
InOutDataTypeInDevice
,
ck
::
tensor_operation
::
device
::
DeviceReduceThreadWiseMultiD
<
InOutDataTypeInDevice
,
ck
::
Tuple
<>
,
ck
::
Tuple
<
InOutDataTypeInDevice
>
,
AccDataType
,
AccDataType
,
InOutDataTypeInDevice
,
InOutDataTypeInDevice
,
Rank
,
Rank
,
...
@@ -129,6 +128,9 @@ int reduce_threadwise_impl(bool do_verification,
...
@@ -129,6 +128,9 @@ int reduce_threadwise_impl(bool do_verification,
Tensor
<
InOutDataType
>
out_ref
(
outLengths
);
Tensor
<
InOutDataType
>
out_ref
(
outLengths
);
Tensor
<
InOutDataType
>
out
(
outLengths
);
Tensor
<
InOutDataType
>
out
(
outLengths
);
Tensor
<
InOutDataType
>
d0
(
outLengths
);
Tensor
<
int
>
out_indices_ref
(
outLengths
);
Tensor
<
int
>
out_indices_ref
(
outLengths
);
Tensor
<
int
>
out_indices
(
outLengths
);
Tensor
<
int
>
out_indices
(
outLengths
);
...
@@ -147,16 +149,19 @@ int reduce_threadwise_impl(bool do_verification,
...
@@ -147,16 +149,19 @@ int reduce_threadwise_impl(bool do_verification,
case
0
:
break
;
case
0
:
break
;
case
1
:
case
1
:
in
.
GenerateTensorValue
(
GeneratorTensor_1
<
InOutDataType
>
{
1
},
num_thread
);
in
.
GenerateTensorValue
(
GeneratorTensor_1
<
InOutDataType
>
{
1
},
num_thread
);
d0
.
GenerateTensorValue
(
GeneratorTensor_1
<
InOutDataType
>
{
1
},
num_thread
);
if
(
beta
!=
0.0
f
)
if
(
beta
!=
0.0
f
)
out_ref
.
GenerateTensorValue
(
GeneratorTensor_1
<
InOutDataType
>
{
1
},
num_thread
);
out_ref
.
GenerateTensorValue
(
GeneratorTensor_1
<
InOutDataType
>
{
1
},
num_thread
);
break
;
break
;
case
2
:
case
2
:
in
.
GenerateTensorValue
(
GeneratorTensor_2
<
InOutDataType
>
{
-
5
,
5
},
num_thread
);
in
.
GenerateTensorValue
(
GeneratorTensor_2
<
InOutDataType
>
{
-
5
,
5
},
num_thread
);
d0
.
GenerateTensorValue
(
GeneratorTensor_2
<
InOutDataType
>
{
-
5
,
5
},
num_thread
);
if
(
beta
!=
0.0
f
)
if
(
beta
!=
0.0
f
)
out_ref
.
GenerateTensorValue
(
GeneratorTensor_2
<
InOutDataType
>
{
-
5
,
5
},
num_thread
);
out_ref
.
GenerateTensorValue
(
GeneratorTensor_2
<
InOutDataType
>
{
-
5
,
5
},
num_thread
);
break
;
break
;
default:
default:
in
.
GenerateTensorValue
(
GeneratorTensor_3
<
InOutDataType
>
{
-
5.0
,
5.0
},
num_thread
);
in
.
GenerateTensorValue
(
GeneratorTensor_3
<
InOutDataType
>
{
-
5.0
,
5.0
},
num_thread
);
d0
.
GenerateTensorValue
(
GeneratorTensor_3
<
InOutDataType
>
{
-
5.0
,
5.0
},
num_thread
);
if
(
beta
!=
0.0
f
)
if
(
beta
!=
0.0
f
)
out_ref
.
GenerateTensorValue
(
GeneratorTensor_3
<
InOutDataType
>
{
-
5.0
,
5.0
},
out_ref
.
GenerateTensorValue
(
GeneratorTensor_3
<
InOutDataType
>
{
-
5.0
,
5.0
},
num_thread
);
num_thread
);
...
@@ -169,13 +174,14 @@ int reduce_threadwise_impl(bool do_verification,
...
@@ -169,13 +174,14 @@ int reduce_threadwise_impl(bool do_verification,
// these buffers are usually provided by the user application
// these buffers are usually provided by the user application
DeviceMem
in_dev
(
sizeof
(
InOutDataTypeInDevice
)
*
in
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
in_dev
(
sizeof
(
InOutDataTypeInDevice
)
*
in
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
d0_dev
(
sizeof
(
InOutDataTypeInDevice
)
*
d0
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
out_dev
(
sizeof
(
InOutDataTypeInDevice
)
*
out
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
out_dev
(
sizeof
(
InOutDataTypeInDevice
)
*
out
.
mDesc
.
GetElementSpaceSize
());
in_dev
.
ToDevice
(
in
.
mData
.
data
());
in_dev
.
ToDevice
(
in
.
mData
.
data
());
d0_dev
.
ToDevice
(
d0
.
mData
.
data
());
if
(
beta
!=
0.0
f
)
if
(
beta
!=
0.0
f
)
{
{
out_dev
.
ToDevice
(
out
.
mData
.
data
());
out_dev
.
ToDevice
(
out
.
mData
.
data
());
};
};
...
@@ -188,11 +194,13 @@ int reduce_threadwise_impl(bool do_verification,
...
@@ -188,11 +194,13 @@ int reduce_threadwise_impl(bool do_verification,
std
::
array
<
index_t
,
Rank
>
arrInLengths
;
std
::
array
<
index_t
,
Rank
>
arrInLengths
;
std
::
array
<
index_t
,
Rank
>
arrInStrides
;
std
::
array
<
index_t
,
Rank
>
arrInStrides
;
std
::
array
<
index_t
,
NumOutDim
>
arrOutLengths
;
std
::
array
<
index_t
,
NumOutDim
>
arrOutLengths
;
std
::
array
<
index_t
,
NumOutDim
>
arrOutStrides
;
std
::
array
<
index_t
,
NumOutDim
>
arrOutStrides
;
ck
::
ranges
::
copy
(
inLengths
,
arrInLengths
.
begin
());
ck
::
ranges
::
copy
(
inLengths
,
arrInLengths
.
begin
());
ck
::
ranges
::
copy
(
inStrides
,
arrInStrides
.
begin
());
ck
::
ranges
::
copy
(
inStrides
,
arrInStrides
.
begin
());
ck
::
ranges
::
copy
(
outLengths
,
arrOutLengths
.
begin
());
ck
::
ranges
::
copy
(
outLengths
,
arrOutLengths
.
begin
());
ck
::
ranges
::
copy
(
outStrides
,
arrOutStrides
.
begin
());
ck
::
ranges
::
copy
(
outStrides
,
arrOutStrides
.
begin
());
...
@@ -236,19 +244,22 @@ int reduce_threadwise_impl(bool do_verification,
...
@@ -236,19 +244,22 @@ int reduce_threadwise_impl(bool do_verification,
auto
invoker_ptr_ref
=
reduce_ref
.
MakeInvokerPointer
();
auto
invoker_ptr_ref
=
reduce_ref
.
MakeInvokerPointer
();
invoker_ptr_ref
->
Run
(
argument_ptr_ref
.
get
());
invoker_ptr_ref
->
Run
(
argument_ptr_ref
.
get
());
for
(
std
::
size_t
i
=
0
;
i
<
out_ref
.
GetElementSize
();
i
++
)
out_elementwise_op
(
out_ref
.
mData
[
i
],
out_ref
.
mData
[
i
],
d0
.
mData
[
i
]);
};
};
auto
reduce
=
DeviceReduceInstance
{};
auto
reduce
=
DeviceReduceInstance
{};
auto
argument_ptr
=
reduce
.
MakeArgumentPointer
(
arrInLengths
,
auto
argument_ptr
=
reduce
.
MakeArgumentPointer
(
arrInLengths
,
arrInStrides
,
arrInStrides
,
{},
{
arrOutLengths
},
{},
{
arrOutStrides
},
arrOutLengths
,
arrOutLengths
,
arrOutStrides
,
arrOutStrides
,
reduceDims
,
reduceDims
,
in_dev
.
GetDeviceBuffer
(),
in_dev
.
GetDeviceBuffer
(),
{},
{
d0_dev
.
GetDeviceBuffer
()
},
out_dev
.
GetDeviceBuffer
(),
out_dev
.
GetDeviceBuffer
(),
in_elementwise_op
,
in_elementwise_op
,
out_elementwise_op
);
out_elementwise_op
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment