Commit 033ea806 authored by root's avatar root
Browse files

add reduce_multi_d

parent 22ee67a9
add_example_executable(example_reduce_blockwise reduce_blockwise.cpp) add_example_executable(example_reduce_blockwise reduce_blockwise.cpp)
add_example_executable(example_reduce_threadwise reduce_threadwise.cpp) add_example_executable(example_reduce_threadwise_multi_d reduce_threadwise_multi_d.cpp)
add_example_executable(example_reduce_multiblock_atomic_add reduce_multiblock_atomic_add.cpp) add_example_executable(example_reduce_multiblock_atomic_add reduce_multiblock_atomic_add.cpp)
add_example_executable(example_reduce_blockwise_two_call reduce_blockwise_two_call.cpp) add_example_executable(example_reduce_blockwise_two_call reduce_blockwise_two_call.cpp)
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
#include <getopt.h> #include <getopt.h>
#include "ck/utility/reduction_enums.hpp" #include "ck/utility/reduction_enums.hpp"
#include "reduce_threadwise_impl.hpp" #include "reduce_threadwise_multi_d_impl.hpp"
#include "reduce_example_common.hpp" #include "reduce_example_common.hpp"
using namespace ck; using namespace ck;
...@@ -25,7 +25,7 @@ class SimpleAppArgs ...@@ -25,7 +25,7 @@ class SimpleAppArgs
public: public:
std::vector<size_t> inLengths = {16, 64, 32, 16}; std::vector<size_t> inLengths = {16, 64, 32, 16};
std::vector<int> reduceDims = {0, 1, 2}; std::vector<int> reduceDims = {0};
std::vector<float> scales = {1.0f, 0.0f}; std::vector<float> scales = {1.0f, 0.0f};
bool do_verification = true; bool do_verification = true;
...@@ -118,13 +118,13 @@ template <typename InOutDataType, ...@@ -118,13 +118,13 @@ template <typename InOutDataType,
ReduceTensorOp ReduceOpId, ReduceTensorOp ReduceOpId,
index_t PropagateNan, index_t PropagateNan,
index_t OutputIndex> index_t OutputIndex>
bool reduce_threadwise_test(bool do_verification, bool reduce_threadwise_multi_d_test(bool do_verification,
int init_method, int init_method,
bool time_kernel, bool time_kernel,
const std::vector<size_t>& inLengths, const std::vector<size_t>& inLengths,
const std::vector<int>& reduceDims, const std::vector<int>& reduceDims,
float alpha, float alpha,
float beta) float beta)
{ {
bool matched = false; bool matched = false;
int result = 0; int result = 0;
...@@ -144,13 +144,13 @@ bool reduce_threadwise_test(bool do_verification, ...@@ -144,13 +144,13 @@ bool reduce_threadwise_test(bool do_verification,
ck::ranges::copy(reduceDims, arrReduceDims.begin()); ck::ranges::copy(reduceDims, arrReduceDims.begin());
result = reduce_threadwise_impl<InOutDataType, result = reduce_threadwise_multi_d_impl<InOutDataType,
AccDataType, AccDataType,
ReduceOpId, ReduceOpId,
ShapeType::Rank_, ShapeType::Rank_,
ShapeType::NumReduceDim_, ShapeType::NumReduceDim_,
PropagateNan, PropagateNan,
OutputIndex>( OutputIndex>(
do_verification, init_method, time_kernel, inLengths, arrReduceDims, alpha, beta); do_verification, init_method, time_kernel, inLengths, arrReduceDims, alpha, beta);
matched = true; matched = true;
...@@ -176,96 +176,53 @@ int main(int argc, char* argv[]) ...@@ -176,96 +176,53 @@ int main(int argc, char* argv[])
if(arg.data_type == 0) if(arg.data_type == 0)
{ {
pass = reduce_threadwise_test<ck::half_t, float, ReduceOpId, PropagateNan, OutputIndex>( pass = reduce_threadwise_multi_d_test<ck::half_t,
arg.do_verification, float,
arg.init_method, ReduceOpId,
arg.time_kernel, PropagateNan,
arg.inLengths, OutputIndex>(arg.do_verification,
arg.reduceDims, arg.init_method,
arg.scales[0], arg.time_kernel,
arg.scales[1]); arg.inLengths,
arg.reduceDims,
arg.scales[0],
arg.scales[1]);
} }
else if(arg.data_type == 1) else if(arg.data_type == 1)
{ {
pass = reduce_threadwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>( pass =
arg.do_verification, reduce_threadwise_multi_d_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
arg.init_method, arg.do_verification,
arg.time_kernel, arg.init_method,
arg.inLengths, arg.time_kernel,
arg.reduceDims, arg.inLengths,
arg.scales[0], arg.reduceDims,
arg.scales[1]); arg.scales[0],
arg.scales[1]);
} }
#if 0
else if(arg.data_type == 3)
{
pass = reduce_threadwise_test<int8_t, float, ReduceOpId, PropagateNan, OutputIndex>(
arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inLengths,
arg.reduceDims,
arg.scales[0],
arg.scales[1]);
}
else if(arg.data_type == 5)
{
pass = reduce_threadwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>(
arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inLengths,
arg.reduceDims,
arg.scales[0],
arg.scales[1]);
}
else if(arg.data_type == 6)
{
pass = reduce_threadwise_test<double, double, ReduceOpId, PropagateNan, OutputIndex>(
arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inLengths,
arg.reduceDims,
arg.scales[0],
arg.scales[1]);
}
#endif
} }
else else
{ {
// for testing half_t // for testing half_t
pass = pass && pass = pass && reduce_threadwise_multi_d_test<ck::half_t,
reduce_threadwise_test<ck::half_t, float, ReduceOpId, PropagateNan, OutputIndex>( float,
true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f); ReduceOpId,
PropagateNan,
// for testing float OutputIndex>(
pass = pass && reduce_threadwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f);
// for testing double
pass = pass && reduce_threadwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f); true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f);
// for testing bhalf_t // for testing float
pass = pass && pass = pass &&
reduce_threadwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>( reduce_threadwise_multi_d_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f); true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f);
#if 0 // for testing bhalf_t
// for testing int8_t pass = pass && reduce_threadwise_multi_d_test<ck::bhalf_t,
pass = float,
pass && reduce_threadwise_test<int8_t, int32_t, ReduceOpId, PropagateNan, OutputIndex>( ReduceOpId,
true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f); PropagateNan,
OutputIndex>(
// for testing 3D input true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f);
pass = pass && reduce_threadwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
true, 2, true, {16, 64, 960}, {0}, 1.0f, 0.0f);
// for testing 5D input
pass = pass && reduce_threadwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
true, 2, true, {16, 64, 32, 2, 960}, {0}, 1.0f, 0.0f);
#endif
} }
return (pass ? 0 : 1); return (pass ? 0 : 1);
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp" #include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
//#include "ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_threadwise_multi_d.hpp" #include "ck/tensor_operation/gpu/device/impl/device_reduce_threadwise_multi_d.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp"
...@@ -28,13 +27,13 @@ template <typename InOutDataType, ...@@ -28,13 +27,13 @@ template <typename InOutDataType,
ck::index_t NumReduceDim, ck::index_t NumReduceDim,
bool PropagateNan, bool PropagateNan,
bool OutputIndex> bool OutputIndex>
int reduce_threadwise_impl(bool do_verification, int reduce_threadwise_multi_d_impl(bool do_verification,
int init_method, int init_method,
bool time_kernel, bool time_kernel,
const std::vector<size_t>& inLengths, const std::vector<size_t>& inLengths,
const std::array<int, NumReduceDim>& reduceDims, const std::array<int, NumReduceDim>& reduceDims,
float alpha, float alpha,
float beta) float beta)
{ {
using namespace ck; using namespace ck;
...@@ -90,17 +89,17 @@ int reduce_threadwise_impl(bool do_verification, ...@@ -90,17 +89,17 @@ int reduce_threadwise_impl(bool do_verification,
}; };
using PassThrough = tensor_operation::element_wise::PassThrough; using PassThrough = tensor_operation::element_wise::PassThrough;
// using Add = tensor_operation::element_wise::Add; using Add = tensor_operation::element_wise::Add;
using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType; using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
using InElementwiseOperation = PassThrough; using InElementwiseOperation = PassThrough;
using OutElementwiseOperation = PassThrough; using OutElementwiseOperation = Add;
using InOutDataTypeInDevice = InOutDataType; using InOutDataTypeInDevice = InOutDataType;
using DeviceReduceInstance = using DeviceReduceInstance =
ck::tensor_operation::device::DeviceReduceThreadWiseMultiD<InOutDataTypeInDevice, ck::tensor_operation::device::DeviceReduceThreadWiseMultiD<InOutDataTypeInDevice,
ck::Tuple<>, ck::Tuple<InOutDataTypeInDevice>,
AccDataType, AccDataType,
InOutDataTypeInDevice, InOutDataTypeInDevice,
Rank, Rank,
...@@ -129,6 +128,9 @@ int reduce_threadwise_impl(bool do_verification, ...@@ -129,6 +128,9 @@ int reduce_threadwise_impl(bool do_verification,
Tensor<InOutDataType> out_ref(outLengths); Tensor<InOutDataType> out_ref(outLengths);
Tensor<InOutDataType> out(outLengths); Tensor<InOutDataType> out(outLengths);
Tensor<InOutDataType> d0(outLengths);
Tensor<int> out_indices_ref(outLengths); Tensor<int> out_indices_ref(outLengths);
Tensor<int> out_indices(outLengths); Tensor<int> out_indices(outLengths);
...@@ -147,16 +149,19 @@ int reduce_threadwise_impl(bool do_verification, ...@@ -147,16 +149,19 @@ int reduce_threadwise_impl(bool do_verification,
case 0: break; case 0: break;
case 1: case 1:
in.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread); in.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
d0.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
if(beta != 0.0f) if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread); out_ref.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
break; break;
case 2: case 2:
in.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread); in.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
d0.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
if(beta != 0.0f) if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread); out_ref.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
break; break;
default: default:
in.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0}, num_thread); in.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0}, num_thread);
d0.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0}, num_thread);
if(beta != 0.0f) if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0}, out_ref.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0},
num_thread); num_thread);
...@@ -169,13 +174,14 @@ int reduce_threadwise_impl(bool do_verification, ...@@ -169,13 +174,14 @@ int reduce_threadwise_impl(bool do_verification,
// these buffers are usually provided by the user application // these buffers are usually provided by the user application
DeviceMem in_dev(sizeof(InOutDataTypeInDevice) * in.mDesc.GetElementSpaceSize()); DeviceMem in_dev(sizeof(InOutDataTypeInDevice) * in.mDesc.GetElementSpaceSize());
DeviceMem d0_dev(sizeof(InOutDataTypeInDevice) * d0.mDesc.GetElementSpaceSize());
DeviceMem out_dev(sizeof(InOutDataTypeInDevice) * out.mDesc.GetElementSpaceSize()); DeviceMem out_dev(sizeof(InOutDataTypeInDevice) * out.mDesc.GetElementSpaceSize());
in_dev.ToDevice(in.mData.data()); in_dev.ToDevice(in.mData.data());
d0_dev.ToDevice(d0.mData.data());
if(beta != 0.0f) if(beta != 0.0f)
{ {
out_dev.ToDevice(out.mData.data()); out_dev.ToDevice(out.mData.data());
}; };
...@@ -188,11 +194,13 @@ int reduce_threadwise_impl(bool do_verification, ...@@ -188,11 +194,13 @@ int reduce_threadwise_impl(bool do_verification,
std::array<index_t, Rank> arrInLengths; std::array<index_t, Rank> arrInLengths;
std::array<index_t, Rank> arrInStrides; std::array<index_t, Rank> arrInStrides;
std::array<index_t, NumOutDim> arrOutLengths; std::array<index_t, NumOutDim> arrOutLengths;
std::array<index_t, NumOutDim> arrOutStrides; std::array<index_t, NumOutDim> arrOutStrides;
ck::ranges::copy(inLengths, arrInLengths.begin()); ck::ranges::copy(inLengths, arrInLengths.begin());
ck::ranges::copy(inStrides, arrInStrides.begin()); ck::ranges::copy(inStrides, arrInStrides.begin());
ck::ranges::copy(outLengths, arrOutLengths.begin()); ck::ranges::copy(outLengths, arrOutLengths.begin());
ck::ranges::copy(outStrides, arrOutStrides.begin()); ck::ranges::copy(outStrides, arrOutStrides.begin());
...@@ -236,19 +244,22 @@ int reduce_threadwise_impl(bool do_verification, ...@@ -236,19 +244,22 @@ int reduce_threadwise_impl(bool do_verification,
auto invoker_ptr_ref = reduce_ref.MakeInvokerPointer(); auto invoker_ptr_ref = reduce_ref.MakeInvokerPointer();
invoker_ptr_ref->Run(argument_ptr_ref.get()); invoker_ptr_ref->Run(argument_ptr_ref.get());
for(std::size_t i = 0; i < out_ref.GetElementSize(); i++)
out_elementwise_op(out_ref.mData[i], out_ref.mData[i], d0.mData[i]);
}; };
auto reduce = DeviceReduceInstance{}; auto reduce = DeviceReduceInstance{};
auto argument_ptr = reduce.MakeArgumentPointer(arrInLengths, auto argument_ptr = reduce.MakeArgumentPointer(arrInLengths,
arrInStrides, arrInStrides,
{}, {arrOutLengths},
{}, {arrOutStrides},
arrOutLengths, arrOutLengths,
arrOutStrides, arrOutStrides,
reduceDims, reduceDims,
in_dev.GetDeviceBuffer(), in_dev.GetDeviceBuffer(),
{}, {d0_dev.GetDeviceBuffer()},
out_dev.GetDeviceBuffer(), out_dev.GetDeviceBuffer(),
in_elementwise_op, in_elementwise_op,
out_elementwise_op); out_elementwise_op);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment