Commit d0b49a14 authored by Qianfeng Zhang
Browse files

Merge branch 'develop' into bnorm_bwd_pr

parents 29026b0e 87fd1152
...@@ -3,8 +3,8 @@ ...@@ -3,8 +3,8 @@
#include "common.hpp" #include "common.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
using ADataType = ck::half_t; using ADataType = ck::half_t;
using BDataType = ck::half_t; using BDataType = ck::half_t;
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include "common.hpp" #include "common.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp"
using ADataType = double; using ADataType = double;
using BDataType = double; using BDataType = double;
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
#include "common.hpp" #include "common.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
using ADataType = ck::int4_t; using ADataType = ck::int4_t;
using BDataType = ck::int4_t; using BDataType = ck::int4_t;
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include "common.hpp" #include "common.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
using ADataType = int8_t; using ADataType = int8_t;
using BDataType = int8_t; using BDataType = int8_t;
......
...@@ -3,8 +3,8 @@ ...@@ -3,8 +3,8 @@
#include "common.hpp" #include "common.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_skip_b_lds.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp"
using F16 = ck::half_t; using F16 = ck::half_t;
using F32 = float; using F32 = float;
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/utility/data_type.hpp" #include "ck/utility/data_type.hpp"
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include "convnd_fwd_common.hpp" #include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include "convnd_fwd_common.hpp" #include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include "convnd_fwd_common.hpp" #include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include "convnd_fwd_common.hpp" #include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include "convnd_fwd_common.hpp" #include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
#include <vector> #include <vector>
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
......
...@@ -140,6 +140,10 @@ bool reduce_blockwise_test(bool do_verification, ...@@ -140,6 +140,10 @@ bool reduce_blockwise_test(bool do_verification,
if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size()) if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size())
return; return;
std::array<int, ShapeType::NumReduceDim_> arrReduceDims;
std::copy(reduceDims.begin(), reduceDims.end(), arrReduceDims.begin());
result = reduce_blockwise_impl<InOutDataType, result = reduce_blockwise_impl<InOutDataType,
AccDataType, AccDataType,
ReduceOpId, ReduceOpId,
...@@ -147,7 +151,7 @@ bool reduce_blockwise_test(bool do_verification, ...@@ -147,7 +151,7 @@ bool reduce_blockwise_test(bool do_verification,
ShapeType::NumReduceDim_, ShapeType::NumReduceDim_,
PropagateNan, PropagateNan,
OutputIndex>( OutputIndex>(
do_verification, init_method, time_kernel, inLengths, reduceDims, alpha, beta); do_verification, init_method, time_kernel, inLengths, arrReduceDims, alpha, beta);
matched = true; matched = true;
}); });
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp" #include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp" #include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
...@@ -30,7 +30,7 @@ int reduce_blockwise_impl(bool do_verification, ...@@ -30,7 +30,7 @@ int reduce_blockwise_impl(bool do_verification,
int init_method, int init_method,
bool time_kernel, bool time_kernel,
const std::vector<size_t>& inLengths, const std::vector<size_t>& inLengths,
const std::vector<int>& reduceDims, const std::array<int, NumReduceDim>& reduceDims,
float alpha, float alpha,
float beta) float beta)
...@@ -38,6 +38,8 @@ int reduce_blockwise_impl(bool do_verification, ...@@ -38,6 +38,8 @@ int reduce_blockwise_impl(bool do_verification,
using namespace ck; using namespace ck;
using namespace ck::tensor_operation::device; using namespace ck::tensor_operation::device;
constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim;
constexpr bool op_support_indices = constexpr bool op_support_indices =
(ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
ReduceOpId == ReduceTensorOp::AMAX); ReduceOpId == ReduceTensorOp::AMAX);
...@@ -143,7 +145,7 @@ int reduce_blockwise_impl(bool do_verification, ...@@ -143,7 +145,7 @@ int reduce_blockwise_impl(bool do_verification,
std::vector<size_t> outLengths; std::vector<size_t> outLengths;
std::vector<int> invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims); auto invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);
if(invariantDims.empty()) if(invariantDims.empty())
outLengths.push_back(1); outLengths.push_back(1);
...@@ -256,22 +258,22 @@ int reduce_blockwise_impl(bool do_verification, ...@@ -256,22 +258,22 @@ int reduce_blockwise_impl(bool do_verification,
acc_elementwise_op); acc_elementwise_op);
}; };
std::vector<ck::index_t> i_inLengths; std::array<index_t, Rank> arrInLengths;
std::vector<ck::index_t> i_inStrides; std::array<index_t, Rank> arrInStrides;
std::vector<ck::index_t> i_outLengths; std::array<index_t, NumOutDim> arrOutLengths;
std::vector<ck::index_t> i_outStrides; std::array<index_t, NumOutDim> arrOutStrides;
i_inLengths.assign(inLengths.begin(), inLengths.end()); std::copy(inLengths.begin(), inLengths.end(), arrInLengths.begin());
i_inStrides.assign(inStrides.begin(), inStrides.end()); std::copy(inStrides.begin(), inStrides.end(), arrInStrides.begin());
i_outLengths.assign(outLengths.begin(), outLengths.end()); std::copy(outLengths.begin(), outLengths.end(), arrOutLengths.begin());
i_outStrides.assign(outStrides.begin(), outStrides.end()); std::copy(outStrides.begin(), outStrides.end(), arrOutStrides.begin());
auto reduce = DeviceReduceInstance{}; auto reduce = DeviceReduceInstance{};
auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths, auto argument_ptr = reduce.MakeArgumentPointer(arrInLengths,
i_inStrides, arrInStrides,
i_outLengths, arrOutLengths,
i_outStrides, arrOutStrides,
reduceDims, reduceDims,
alpha, alpha,
beta, beta,
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp" #include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp" #include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
...@@ -90,15 +90,15 @@ static bool time_kernel; ...@@ -90,15 +90,15 @@ static bool time_kernel;
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
// used by the device reduction // used by the device reduction
const std::vector<int> reduceDims_1 = {4}; const std::array<int, 1> reduceDims_1 = {4};
const std::vector<int> invariantDims_1 = {0, 1, 2, 3}; // const std::array<int, 4> invariantDims_1 = {0, 1, 2, 3};
const std::vector<int> reduceDims_2 = {3}; const std::array<int, 1> reduceDims_2 = {3};
const std::vector<int> invariantDims_2 = {0, 1, 2}; // const std::array<int, 3> invariantDims_2 = {0, 1, 2};
// used by the host reduction // used by the host reduction
const std::vector<int> reduceDims = {3, 4}; const std::array<int, 2> reduceDims = {3, 4};
const std::vector<int> invariantDims = {0, 1, 2}; const std::array<int, 3> invariantDims = {0, 1, 2};
const std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128}; const std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};
...@@ -214,26 +214,26 @@ int main(int argc, char* argv[]) ...@@ -214,26 +214,26 @@ int main(int argc, char* argv[])
acc_elementwise_op); acc_elementwise_op);
}; };
std::vector<ck::index_t> i_inLengths_1; std::array<index_t, 5> arrInLengths_1;
std::vector<ck::index_t> i_inStrides_1; std::array<index_t, 5> arrInStrides_1;
std::vector<ck::index_t> i_inLengths_2; std::array<index_t, 4> arrInLengths_2;
std::vector<ck::index_t> i_inStrides_2; std::array<index_t, 4> arrInStrides_2;
std::vector<ck::index_t> i_outLengths; std::array<index_t, 3> arrOutLengths;
std::vector<ck::index_t> i_outStrides; std::array<index_t, 3> arrOutStrides;
i_inLengths_1.assign(inLengths_1.begin(), inLengths_1.end()); std::copy(inLengths_1.begin(), inLengths_1.end(), arrInLengths_1.begin());
i_inStrides_1.assign(inStrides_1.begin(), inStrides_1.end()); std::copy(inStrides_1.begin(), inStrides_1.end(), arrInStrides_1.begin());
i_inLengths_2.assign(inLengths_2.begin(), inLengths_2.end()); std::copy(inLengths_2.begin(), inLengths_2.end(), arrInLengths_2.begin());
i_inStrides_2.assign(inStrides_2.begin(), inStrides_2.end()); std::copy(inStrides_2.begin(), inStrides_2.end(), arrInStrides_2.begin());
i_outLengths.assign(outLengths.begin(), outLengths.end()); std::copy(outLengths.begin(), outLengths.end(), arrOutLengths.begin());
i_outStrides.assign(outStrides.begin(), outStrides.end()); std::copy(outStrides.begin(), outStrides.end(), arrOutStrides.begin());
auto reduce_1 = DeviceReduceInstance_1{}; auto reduce_1 = DeviceReduceInstance_1{};
auto argument_ptr_1 = reduce_1.MakeArgumentPointer(i_inLengths_1, auto argument_ptr_1 = reduce_1.MakeArgumentPointer(arrInLengths_1,
i_inStrides_1, arrInStrides_1,
i_inLengths_2, arrInLengths_2,
i_inStrides_2, arrInStrides_2,
reduceDims_1, reduceDims_1,
1.0f, 1.0f,
0.0f, 0.0f,
...@@ -255,10 +255,10 @@ int main(int argc, char* argv[]) ...@@ -255,10 +255,10 @@ int main(int argc, char* argv[])
auto reduce_2 = DeviceReduceInstance_2{}; auto reduce_2 = DeviceReduceInstance_2{};
auto argument_ptr_2 = reduce_2.MakeArgumentPointer(i_inLengths_2, auto argument_ptr_2 = reduce_2.MakeArgumentPointer(arrInLengths_2,
i_inStrides_2, arrInStrides_2,
i_outLengths, arrOutLengths,
i_outStrides, arrOutStrides,
reduceDims_2, reduceDims_2,
alpha, alpha,
beta, beta,
......
...@@ -5,11 +5,10 @@ ...@@ -5,11 +5,10 @@
#include "ck/ck.hpp" #include "ck/ck.hpp"
template <ck::index_t Rank, ck::index_t NumReduceDim> template <int Rank, int NumReduceDim>
std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims) static inline std::array<int, Rank - NumReduceDim>
get_invariant_dims(const std::array<int, NumReduceDim>& reduceDims)
{ {
assert(NumReduceDim == reduceDims.size());
int reduceFlag = 0; int reduceFlag = 0;
// flag the bits for the reduceDims // flag the bits for the reduceDims
...@@ -18,13 +17,15 @@ std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims) ...@@ -18,13 +17,15 @@ std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
reduceFlag |= 1 << reduceDims[i]; reduceFlag |= 1 << reduceDims[i];
}; };
std::vector<int> invariantDims; std::array<int, Rank - NumReduceDim> invariantDims;
// collect invariant dimensions // collect invariant dimensions
int dim = 0;
for(int i = 0; i < Rank; i++) for(int i = 0; i < Rank; i++)
if((reduceFlag & (1 << i)) == 0) if((reduceFlag & (1 << i)) == 0)
{ {
invariantDims.push_back(i); invariantDims[dim] = i;
dim++;
}; };
return invariantDims; return invariantDims;
......
...@@ -138,13 +138,17 @@ bool reduce_multiblock_atomic_add_test(bool do_verification, ...@@ -138,13 +138,17 @@ bool reduce_multiblock_atomic_add_test(bool do_verification,
if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size()) if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size())
return; return;
std::array<int, ShapeType::NumReduceDim_> a_reduceDims;
std::copy(reduceDims.begin(), reduceDims.end(), a_reduceDims.begin());
result = reduce_multiblock_atomic_add_impl<InOutDataType, result = reduce_multiblock_atomic_add_impl<InOutDataType,
AccDataType, AccDataType,
ReduceOpId, ReduceOpId,
ShapeType::Rank_, ShapeType::Rank_,
ShapeType::NumReduceDim_, ShapeType::NumReduceDim_,
PropagateNan>( PropagateNan>(
do_verification, init_method, time_kernel, inLengths, reduceDims, alpha, beta); do_verification, init_method, time_kernel, inLengths, a_reduceDims, alpha, beta);
matched = true; matched = true;
}); });
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp" #include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp" #include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
...@@ -29,7 +29,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification, ...@@ -29,7 +29,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
int init_method, int init_method,
bool time_kernel, bool time_kernel,
const std::vector<size_t>& inLengths, const std::vector<size_t>& inLengths,
const std::vector<int>& reduceDims, const std::array<int, NumReduceDim>& reduceDims,
float alpha, float alpha,
float beta) float beta)
...@@ -37,6 +37,8 @@ int reduce_multiblock_atomic_add_impl(bool do_verification, ...@@ -37,6 +37,8 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
using namespace ck; using namespace ck;
using namespace ck::tensor_operation::device; using namespace ck::tensor_operation::device;
constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim;
constexpr bool op_support_atomic_add = constexpr bool op_support_atomic_add =
(ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::AVG); (ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::AVG);
...@@ -84,7 +86,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification, ...@@ -84,7 +86,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
std::vector<size_t> outLengths; std::vector<size_t> outLengths;
std::vector<int> invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims); auto invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);
if(invariantDims.empty()) if(invariantDims.empty())
outLengths.push_back(1); outLengths.push_back(1);
...@@ -169,22 +171,22 @@ int reduce_multiblock_atomic_add_impl(bool do_verification, ...@@ -169,22 +171,22 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
acc_elementwise_op); acc_elementwise_op);
}; };
std::vector<ck::index_t> i_inLengths; std::array<index_t, Rank> arrInLengths;
std::vector<ck::index_t> i_inStrides; std::array<index_t, Rank> arrInStrides;
std::vector<ck::index_t> i_outLengths; std::array<index_t, NumOutDim> arrOutLengths;
std::vector<ck::index_t> i_outStrides; std::array<index_t, NumOutDim> arrOutStrides;
i_inLengths.assign(inLengths.begin(), inLengths.end()); std::copy(inLengths.begin(), inLengths.end(), arrInLengths.begin());
i_inStrides.assign(inStrides.begin(), inStrides.end()); std::copy(inStrides.begin(), inStrides.end(), arrInStrides.begin());
i_outLengths.assign(outLengths.begin(), outLengths.end()); std::copy(outLengths.begin(), outLengths.end(), arrOutLengths.begin());
i_outStrides.assign(outStrides.begin(), outStrides.end()); std::copy(outStrides.begin(), outStrides.end(), arrOutStrides.begin());
auto reduce = DeviceReduceInstance{}; auto reduce = DeviceReduceInstance{};
auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths, auto argument_ptr = reduce.MakeArgumentPointer(arrInLengths,
i_inStrides, arrInStrides,
i_outLengths, arrOutLengths,
i_outStrides, arrOutStrides,
reduceDims, reduceDims,
alpha, alpha,
beta, beta,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment