reduce.cuh 757 Bytes
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
#ifndef __INFINIOP_REDUCE_CUDA_H__
#define __INFINIOP_REDUCE_CUDA_H__

#include <cub/block/block_reduce.cuh>

namespace op::common_cuda::reduce_op {

template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ __forceinline__ Tcompute sumSquared(const Tdata *data_ptr, size_t count) {
    Tcompute ss = 0;

    // Each thread computes its partial sum
    for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) {
        ss += Tcompute(data_ptr[i] * data_ptr[i]);
    }

    // Use CUB block-level reduction
    using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    return BlockReduce(temp_storage).Sum(ss);
}

} // namespace op::common_cuda::reduce_op

#endif