Commit 1795b38a authored by wooway777

Revert "Merge pull request #1064 from InfiniTensor/issue/1031_T1-1-4"

This reverts commit a8ea2306, reversing
changes made to 7f295448.
parent 6ab911c3
#ifndef __ALL_MOORE_H__
#define __ALL_MOORE_H__
#include "../all_desc.h"
DESCRIPTOR(moore);
#endif
#include "../../../devices/moore/moore_common.h"
#include "../../../devices/moore/moore_kernel_common.h"
#include "../cuda/kernel.cuh"
#include "all_moore.h"
namespace op::all::moore {
struct Descriptor::Opaque {
std::shared_ptr<device::moore::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
size_t *dim,
size_t dim_size,
bool keepdim) {
auto result = AllInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
CHECK_RESULT(result);
auto info = result.take();
size_t workspace_size = 0;
workspace_size += (input_desc->ndim() + output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t));
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
info, workspace_size, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
namespace {
template <size_t BLOCK_SIZE, typename Tdata>
infiniStatus_t launchKernel(
const AllInfo &info,
bool *output, const Tdata *input,
musaStream_t stream, void *workspace, size_t workspace_size) {
size_t input_ndim = info.permuted_input_shape.size();
size_t output_ndim = info.output_shape.size();
size_t input_size = info.input_size;
size_t output_size = info.output_size;
size_t reduce_num = info.reduce_num;
unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
size_t workspace_offset = 0;
size_t *permuted_input_shape_musa = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
size_t *output_shape_musa = permuted_input_shape_musa + input_ndim;
workspace_offset += (input_ndim + output_ndim) * sizeof(size_t);
ptrdiff_t *permuted_input_strides_musa = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
ptrdiff_t *output_strides_musa = permuted_input_strides_musa + input_ndim;
workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t);
CHECK_MOORE(musaMemcpyAsync(permuted_input_shape_musa, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream));
CHECK_MOORE(musaMemcpyAsync(output_shape_musa, info.output_shape.data(), output_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream));
CHECK_MOORE(musaMemcpyAsync(permuted_input_strides_musa, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream));
CHECK_MOORE(musaMemcpyAsync(output_strides_musa, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream));
if (info.reduce_num == input_size) {
size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
bool *temp_output;
CHECK_MOORE(musaMalloc(&temp_output, grid_size * sizeof(bool)));
allReduceTempKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, BLOCK_SIZE * sizeof(bool), stream>>>(
temp_output, input, input_size, input_ndim, permuted_input_shape_musa, permuted_input_strides_musa);
finalAllReduceKernel<BLOCK_SIZE><<<1, BLOCK_SIZE, 0, stream>>>(output, temp_output, grid_size);
CHECK_MOORE(musaFree(temp_output));
} else {
size_t grid_size = (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
allKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, 0, stream>>>(
output, input, input_ndim, output_ndim, output_size, reduce_num,
permuted_input_shape_musa, output_shape_musa, permuted_input_strides_musa, output_strides_musa);
}
return INFINI_STATUS_SUCCESS;
}
} // namespace
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
const void *input,
size_t *dim,
size_t dim_size,
bool keepdim,
void *stream_) const {
musaStream_t stream = (musaStream_t)stream_;
#define CALCULATE_ALL(BLOCK_SIZE, Tdata) \
launchKernel<BLOCK_SIZE, Tdata>( \
_info, \
(bool *)output, (const Tdata *)input, \
stream, workspace, workspace_size)
#define CALCULATE_ALL_WITH_BLOCK_SIZE(BLOCK_SIZE) \
{ \
if (_info.dtype == INFINI_DTYPE_BOOL) \
return CALCULATE_ALL(BLOCK_SIZE, bool); \
else if (_info.dtype == INFINI_DTYPE_U8) \
return CALCULATE_ALL(BLOCK_SIZE, uint8_t); \
else \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
if (_opaque->internal->maxThreadsPerBlock() >= 256) {
CALCULATE_ALL_WITH_BLOCK_SIZE(256)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::all::moore
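The launchers above carve one workspace buffer into four device arrays: all shape entries (size_t) for the permuted input and the output first, then the matching stride entries (ptrdiff_t), which is exactly the (input_ndim + output_ndim) * (sizeof(size_t) + sizeof(ptrdiff_t)) term reserved in Descriptor::create. A minimal host-side sketch of that arithmetic (standalone C++, illustrative values only; in the real code the buffer is device memory filled via async copies):

#include <cstddef>
#include <cstdio>

int main() {
    // Same bookkeeping as the Moore/NVIDIA launchers: shapes first, strides second.
    size_t input_ndim = 3, output_ndim = 2;
    size_t workspace_size = (input_ndim + output_ndim) * (sizeof(size_t) + sizeof(ptrdiff_t));

    unsigned char *workspace = new unsigned char[workspace_size]; // device memory in the real code
    size_t *permuted_input_shape = reinterpret_cast<size_t *>(workspace);
    size_t *output_shape = permuted_input_shape + input_ndim;
    size_t shape_bytes = (input_ndim + output_ndim) * sizeof(size_t);
    ptrdiff_t *permuted_input_strides = reinterpret_cast<ptrdiff_t *>(workspace + shape_bytes);
    ptrdiff_t *output_strides = permuted_input_strides + input_ndim;

    std::printf("workspace = %zu bytes, shapes at offset 0, strides at offset %zu\n",
                workspace_size, shape_bytes);
    (void)output_shape;
    (void)output_strides;
    delete[] workspace;
    return 0;
}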
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
#include "all_nvidia.cuh"
namespace op::all::nvidia {
struct Descriptor::Opaque {
std::shared_ptr<device::nvidia::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
size_t *dim,
size_t dim_size,
bool keepdim) {
auto result = AllInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
CHECK_RESULT(result);
auto info = result.take();
size_t workspace_size = 0;
workspace_size += (input_desc->ndim() + output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t));
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
info, workspace_size, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
namespace {
template <size_t BLOCK_SIZE, typename Tdata>
infiniStatus_t launchKernel(
const AllInfo &info,
bool *output, const Tdata *input,
cudaStream_t stream, void *workspace, size_t workspace_size) {
size_t input_ndim = info.permuted_input_shape.size();
size_t output_ndim = info.output_shape.size();
size_t input_size = info.input_size;
size_t output_size = info.output_size;
size_t reduce_num = info.reduce_num;
unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
size_t workspace_offset = 0;
size_t *permuted_input_shape_cuda = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
size_t *output_shape_cuda = permuted_input_shape_cuda + input_ndim;
workspace_offset += (input_ndim + output_ndim) * sizeof(size_t);
ptrdiff_t *permuted_input_strides_cuda = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
ptrdiff_t *output_strides_cuda = permuted_input_strides_cuda + input_ndim;
workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t);
CHECK_CUDA(cudaMemcpyAsync(permuted_input_shape_cuda, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream));
CHECK_CUDA(cudaMemcpyAsync(output_shape_cuda, info.output_shape.data(), output_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream));
CHECK_CUDA(cudaMemcpyAsync(permuted_input_strides_cuda, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream));
CHECK_CUDA(cudaMemcpyAsync(output_strides_cuda, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream));
if (info.reduce_num == input_size) {
size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
bool *temp_output;
CHECK_CUDA(cudaMalloc(&temp_output, grid_size * sizeof(bool)));
allReduceTempKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, BLOCK_SIZE * sizeof(bool), stream>>>(
temp_output, input, input_size, input_ndim, permuted_input_shape_cuda, permuted_input_strides_cuda);
finalAllReduceKernel<BLOCK_SIZE><<<1, BLOCK_SIZE, 0, stream>>>(output, temp_output, grid_size);
CHECK_CUDA(cudaFree(temp_output));
} else {
size_t grid_size = (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
allKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, 0, stream>>>(
output, input, input_ndim, output_ndim, output_size, reduce_num,
permuted_input_shape_cuda, output_shape_cuda, permuted_input_strides_cuda, output_strides_cuda);
}
return INFINI_STATUS_SUCCESS;
}
} // namespace
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
const void *input,
size_t *dim,
size_t dim_size,
bool keepdim,
void *stream_) const {
cudaStream_t stream = (cudaStream_t)stream_;
#define CALCULATE_ALL(BLOCK_SIZE, Tdata) \
launchKernel<BLOCK_SIZE, Tdata>( \
_info, \
(bool *)output, (const Tdata *)input, \
stream, workspace, workspace_size)
#define CALCULATE_ALL_WITH_BLOCK_SIZE(BLOCK_SIZE) \
{ \
if (_info.dtype == INFINI_DTYPE_BOOL) \
return CALCULATE_ALL(BLOCK_SIZE, bool); \
else if (_info.dtype == INFINI_DTYPE_U8) \
return CALCULATE_ALL(BLOCK_SIZE, uint8_t); \
else \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
if (_opaque->internal->maxThreadsPerBlock() >= 256) {
CALCULATE_ALL_WITH_BLOCK_SIZE(256)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::all::nvidia
#ifndef __ALL_NVIDIA_H__
#define __ALL_NVIDIA_H__
#include "../all_desc.h"
DESCRIPTOR(nvidia);
#endif // __ALL_NVIDIA_H__
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/all.h"
#include <vector>
#ifdef ENABLE_CPU_API
#include "cpu/all_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#include "nvidia/all_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/all_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/all_kunlun.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/all_moore.h"
#endif
__INFINI_C infiniStatus_t infiniopCreateAllDescriptor(
infiniopHandle_t handle,
infiniopAllDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
size_t *dim,
size_t dim_size,
bool keepdim) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::all::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::all::NAMESPACE::Descriptor **>(desc_ptr), \
output_desc, \
input_desc, \
dim, \
dim_size, \
keepdim)
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
CREATE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CREATE
}
__INFINI_C infiniStatus_t infiniopGetAllWorkspaceSize(infiniopAllDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::all::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
GET(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__INFINI_C infiniStatus_t infiniopAll(
infiniopAllDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *output,
const void *input,
size_t *dim,
size_t dim_size,
bool keepdim,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::all::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, output, input, dim, dim_size, keepdim, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CALCULATE
}
__INFINI_C infiniStatus_t
infiniopDestroyAllDescriptor(infiniopAllDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::all::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
DELETE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DELETE
}
#include "sum_cpu.h"
#include "../../../../utils.h"
#include "../../../devices/cpu/common_cpu.h"
namespace op::sum::cpu {
Descriptor::~Descriptor() {}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
size_t *dim,
size_t dim_size,
bool keepdim) {
auto result = SumInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
CHECK_RESULT(result);
*desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
namespace {
template <typename T>
infiniStatus_t calculateSum(
const SumInfo *info,
T *output,
const T *input) {
if (info->reduce_dim_size == info->permuted_input_shape.size()) { // all dims reduced: collapse to a scalar
float tempSum = 0.;
for (size_t index = 0; index < info->input_size; index++) {
size_t input_offset = op::common_cpu::indexToOffset(index, info->permuted_input_shape.size(), info->permuted_input_shape.data(), info->permuted_input_strides.data());
tempSum += utils::cast<float>(input[input_offset]);
}
output[0] = utils::cast<T>(tempSum);
return INFINI_STATUS_SUCCESS;
} else {
for (size_t i = 0; i < info->output_size; i++) {
size_t output_offset = op::common_cpu::indexToOffset(i, info->output_shape.size(), info->output_shape.data(), info->output_strides.data());
float tempSum = 0.;
for (size_t j = 0; j < info->reduce_num; j++) {
size_t input_offset = op::common_cpu::indexToOffset(j + i * info->reduce_num, info->permuted_input_shape.size(), info->permuted_input_shape.data(), info->permuted_input_strides.data());
tempSum += utils::cast<float>(input[input_offset]);
}
output[output_offset] = utils::cast<T>(tempSum);
}
return INFINI_STATUS_SUCCESS;
}
}
} // namespace
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
const void *input,
void *stream) const {
switch (_info.dtype) {
case INFINI_DTYPE_F16:
return calculateSum<fp16_t>(&_info, (fp16_t *)output, reinterpret_cast<const fp16_t *>(input));
case INFINI_DTYPE_F32:
return calculateSum<float>(&_info, (float *)output, reinterpret_cast<const float *>(input));
case INFINI_DTYPE_BF16:
return calculateSum<bf16_t>(&_info, (bf16_t *)output, reinterpret_cast<const bf16_t *>(input));
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::sum::cpu
#ifndef __INFINIOP_SUM_CPU_H__
#define __INFINIOP_SUM_CPU_H__
#include "../sum_desc.h"
DESCRIPTOR(cpu);
#endif // __INFINIOP_SUM_CPU_H__
#ifndef __SUM_CUDA_H__
#define __SUM_CUDA_H__
__forceinline__ __device__ __host__ size_t
indexToOffset(
size_t flat_index,
size_t ndim,
const size_t *shape,
const ptrdiff_t *strides) {
size_t res = 0;
for (size_t i = ndim; i-- > 0;) {
res += (flat_index % shape[i]) * strides[i];
flat_index /= shape[i];
}
return res;
}
template <size_t BLOCK_SIZE, typename Tdata, typename Tcompute>
__global__ void sumAllKernel(
Tcompute *output,
const Tdata *input,
size_t input_size,
size_t permuted_input_shape_size,
size_t *permuted_input_shape,
ptrdiff_t *permuted_input_strides) {
__shared__ Tcompute s_data[BLOCK_SIZE];
size_t tid = threadIdx.x;
size_t idx = tid + blockIdx.x * blockDim.x;
if (idx < input_size) {
size_t input_offset = indexToOffset(idx, permuted_input_shape_size, permuted_input_shape, permuted_input_strides);
s_data[tid] = static_cast<Tcompute>(input[input_offset]);
} else {
s_data[tid] = static_cast<Tcompute>(0.f);
}
__syncthreads();
for (size_t s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
s_data[tid] += s_data[tid + s];
}
__syncthreads();
}
if (tid == 0) {
atomicAdd(output, s_data[0]);
}
}
template <size_t BLOCK_SIZE, typename T>
__global__ void sumKernel(
T *output,
const T *input,
size_t permuted_input_shape_size,
size_t output_shape_size,
size_t output_size,
size_t reduce_num,
size_t *permuted_input_shape,
size_t *output_shape,
ptrdiff_t *permuted_input_strides,
ptrdiff_t *output_strides) {
size_t tid = threadIdx.x;
size_t idx = tid + blockIdx.x * blockDim.x;
if (idx >= output_size) {
return;
}
size_t output_index = indexToOffset(idx, output_shape_size, output_shape, output_strides);
float tempSum = static_cast<float>(0.f);
for (size_t i = 0; i < reduce_num; i++) {
size_t input_offset = indexToOffset(i + idx * reduce_num, permuted_input_shape_size, permuted_input_shape, permuted_input_strides);
tempSum += static_cast<float>(input[input_offset]);
}
output[output_index] = static_cast<T>(tempSum);
}
#endif // __SUM_CUDA_H__
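The indexToOffset helper above walks the shape from the innermost dimension outward, turning a flat logical index into a strided memory offset. A small standalone illustration of the same arithmetic (plain C++; the shape and strides below are example values only):

#include <cstddef>
#include <cstdio>

// Same loop as the device helper: take each coordinate with %, advance with /.
static size_t index_to_offset(size_t flat_index, size_t ndim,
                              const size_t *shape, const ptrdiff_t *strides) {
    size_t res = 0;
    for (size_t i = ndim; i-- > 0;) {
        res += (flat_index % shape[i]) * strides[i];
        flat_index /= shape[i];
    }
    return res;
}

int main() {
    // A 2x3 view stored in a row-major 2x4 buffer, so strides are {4, 1}.
    size_t shape[] = {2, 3};
    ptrdiff_t strides[] = {4, 1};
    // Flat index 4 is logical element (1, 1): offset = 1*4 + 1*1 = 5.
    std::printf("%zu\n", index_to_offset(4, 2, shape, strides)); // prints 5
    return 0;
}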
#ifndef __SUM_INFO_H__
#define __SUM_INFO_H__
#include "../../../utils.h"
#include "../../tensor.h"
#include <algorithm>
#include <cstddef>
#include <vector>
namespace op::sum {
class SumInfo {
SumInfo() = default;
public:
infiniDtype_t dtype;
std::vector<size_t> permuted_input_shape; // need to permute
std::vector<size_t> output_shape;
std::vector<ptrdiff_t> permuted_input_strides; // need to permute
std::vector<ptrdiff_t> output_strides;
size_t reduce_dim_size; // reduce dim size
size_t reduce_num; // number of elements to reduce for each output element
size_t input_size; // total number of input elements
size_t output_size; // total number of output elements
static utils::Result<SumInfo> create(
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
size_t *dim,
size_t dim_size,
bool keepdim) {
auto input_shape = input_desc->shape();
auto input_strides = input_desc->strides();
size_t input_ndim = input_desc->ndim();
size_t reduce_num = 1;
for (size_t i = 0; i < dim_size; i++) {
reduce_num *= input_shape[dim[i]];
}
std::vector<size_t> permute_order;
for (size_t i = 0; i < input_ndim; i++) {
if (std::find(dim, dim + dim_size, i) == dim + dim_size) {
permute_order.push_back(i);
}
}
for (size_t i = 0; i < dim_size; i++) {
permute_order.push_back(dim[i]);
}
std::vector<size_t> permuted_input_shape;
std::vector<ptrdiff_t> permuted_input_strides;
for (size_t i = 0; i < permute_order.size(); i++) {
permuted_input_shape.push_back(input_shape[permute_order[i]]);
permuted_input_strides.push_back(input_strides[permute_order[i]]);
}
return utils::Result<SumInfo>(SumInfo{input_desc->dtype(),
permuted_input_shape,
output_desc->shape(),
permuted_input_strides,
output_desc->strides(),
dim_size,
reduce_num,
input_desc->numel(),
output_desc->numel()});
}
};
} // namespace op::sum
#endif
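SumInfo::create moves the reduced axes to the back of the permuted shape and strides, so each output element corresponds to a contiguous run of reduce_num logical indices in the permuted layout. A standalone sketch of that bookkeeping for one concrete case (plain C++, example shapes only, mirroring the loops above):

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    // Input tensor: shape {2, 3, 4}, contiguous strides {12, 4, 1}; reduce over dim 1.
    std::vector<size_t> shape = {2, 3, 4};
    std::vector<ptrdiff_t> strides = {12, 4, 1};
    std::vector<size_t> dims = {1};

    size_t reduce_num = 1;
    for (size_t d : dims) reduce_num *= shape[d];   // 3 elements folded into each output

    std::vector<size_t> order;                      // kept dims first ...
    for (size_t i = 0; i < shape.size(); ++i)
        if (std::find(dims.begin(), dims.end(), i) == dims.end())
            order.push_back(i);
    for (size_t d : dims) order.push_back(d);       // ... reduced dims last

    for (size_t i : order)                          // permuted order here is 0, 2, 1
        std::printf("dim %zu: extent %zu stride %td\n", i, shape[i], strides[i]);
    std::printf("reduce_num = %zu\n", reduce_num);
    return 0;
}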
#ifndef __SUM_METAX_H__
#define __SUM_METAX_H__
#include "../sum_desc.h"
DESCRIPTOR(metax);
#endif
#include "../../../devices/metax/metax_common.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include "../cuda/kernel.cuh"
#include "sum_metax.h"
namespace op::sum::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::metax::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
size_t *dim,
size_t dim_size,
bool keepdim) {
auto result = SumInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
CHECK_RESULT(result);
auto info = result.take();
size_t workspace_size = 0;
workspace_size += (input_desc->ndim() + output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t));
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
info, workspace_size, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
namespace {
template <size_t BLOCK_SIZE, typename T>
infiniStatus_t launchKernel(
const SumInfo &info,
T *output, const T *input,
hcStream_t stream, void *workspace, size_t workspace_size) {
size_t input_ndim = info.permuted_input_shape.size();
size_t output_ndim = info.output_shape.size();
size_t input_size = info.input_size;
size_t output_size = info.output_size;
size_t reduce_num = info.reduce_num;
unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
size_t workspace_offset = 0;
size_t *permuted_input_shape_hc = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
size_t *output_shape_hc = permuted_input_shape_hc + input_ndim;
workspace_offset += (input_ndim + output_ndim) * sizeof(size_t);
ptrdiff_t *permuted_input_strides_hc = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
ptrdiff_t *output_strides_hc = permuted_input_strides_hc + input_ndim;
workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t);
CHECK_METAX(hcMemcpyAsync(permuted_input_shape_hc, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream));
CHECK_METAX(hcMemcpyAsync(output_shape_hc, info.output_shape.data(), output_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream));
CHECK_METAX(hcMemcpyAsync(output_strides_hc, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream));
CHECK_METAX(hcMemcpyAsync(permuted_input_strides_hc, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream));
if (info.reduce_num == input_size) {
T zero = static_cast<T>(0.0f);
CHECK_METAX(hcMemcpyAsync(output, &zero, sizeof(T), hcMemcpyHostToDevice, stream));
size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
sumAllKernel<BLOCK_SIZE, T, T><<<grid_size, BLOCK_SIZE, BLOCK_SIZE * sizeof(T), stream>>>(
output, input, input_size, input_ndim, permuted_input_shape_hc, permuted_input_strides_hc);
} else {
size_t grid_size = (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
sumKernel<BLOCK_SIZE, T><<<grid_size, BLOCK_SIZE, 0, stream>>>(
output, input, input_ndim, output_ndim, output_size, reduce_num,
permuted_input_shape_hc, output_shape_hc, permuted_input_strides_hc, output_strides_hc);
}
return INFINI_STATUS_SUCCESS;
}
} // namespace
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
const void *input,
void *stream_) const {
hcStream_t stream = (hcStream_t)stream_;
#define CALCULATE_SUM(BLOCK_SIZE, T) \
launchKernel<BLOCK_SIZE, T>( \
_info, \
(T *)output, (const T *)input, \
stream, workspace, workspace_size)
#define CALCULATE_SUM_WITH_BLOCK_SIZE(BLOCK_SIZE) \
{ \
if (_info.dtype == INFINI_DTYPE_BF16) \
return CALCULATE_SUM(BLOCK_SIZE, __hpcc_bfloat16); \
else if (_info.dtype == INFINI_DTYPE_F16) \
return CALCULATE_SUM(BLOCK_SIZE, half); \
else if (_info.dtype == INFINI_DTYPE_F32) \
return CALCULATE_SUM(BLOCK_SIZE, float); \
else \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) {
CALCULATE_SUM_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_1024)
} else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) {
CALCULATE_SUM_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_512)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::sum::metax
#ifndef __SUM_MOORE_H__
#define __SUM_MOORE_H__
#include "../sum_desc.h"
DESCRIPTOR(moore);
#endif
#include "../../../devices/moore/moore_common.h"
#include "../../../devices/moore/moore_kernel_common.h"
#include "../cuda/kernel.cuh"
#include "sum_moore.h"
namespace op::sum::moore {
struct Descriptor::Opaque {
std::shared_ptr<device::moore::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
size_t *dim,
size_t dim_size,
bool keepdim) {
auto result = SumInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
CHECK_RESULT(result);
auto info = result.take();
size_t workspace_size = 0;
workspace_size += (input_desc->ndim() + output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t));
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
info, workspace_size, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
namespace {
template <size_t BLOCK_SIZE, typename T>
infiniStatus_t launchKernel(
const SumInfo &info,
T *output, const T *input,
musaStream_t stream, void *workspace, size_t workspace_size) {
size_t input_ndim = info.permuted_input_shape.size();
size_t output_ndim = info.output_shape.size();
size_t input_size = info.input_size;
size_t output_size = info.output_size;
size_t reduce_num = info.reduce_num;
unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
size_t workspace_offset = 0;
size_t *permuted_input_shape_musa = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
size_t *output_shape_musa = permuted_input_shape_musa + input_ndim;
workspace_offset += (input_ndim + output_ndim) * sizeof(size_t);
ptrdiff_t *permuted_input_strides_musa = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
ptrdiff_t *output_strides_musa = permuted_input_strides_musa + input_ndim;
workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t);
CHECK_MOORE(musaMemcpyAsync(permuted_input_shape_musa, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream));
CHECK_MOORE(musaMemcpyAsync(output_shape_musa, info.output_shape.data(), output_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream));
CHECK_MOORE(musaMemcpyAsync(output_strides_musa, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream));
CHECK_MOORE(musaMemcpyAsync(permuted_input_strides_musa, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream));
if (info.reduce_num == input_size) {
if constexpr (std::is_same_v<T, __mt_bfloat16>) {
// Work around the lack of bf16 atomicAdd support on Moore: accumulate in float instead.
float zero = 0.0f;
float *tmp_output;
CHECK_MOORE(musaMalloc(&tmp_output, sizeof(float)));
CHECK_MOORE(musaMemcpyAsync(tmp_output, &zero, sizeof(float), musaMemcpyHostToDevice, stream));
size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
sumAllKernel<BLOCK_SIZE, T, float><<<grid_size, BLOCK_SIZE, BLOCK_SIZE * sizeof(float), stream>>>(
tmp_output, input, input_size, input_ndim, permuted_input_shape_musa, permuted_input_strides_musa);
// A custom kernel could convert float -> T on device; here the value is round-tripped through the host via memcpy instead.
float host_val;
CHECK_MOORE(musaMemcpy(&host_val, tmp_output, sizeof(float), musaMemcpyDeviceToHost));
T out_val = static_cast<T>(host_val);
CHECK_MOORE(musaMemcpyAsync(output, &out_val, sizeof(T), musaMemcpyHostToDevice, stream));
CHECK_MOORE(musaFree(tmp_output));
} else {
T zero = static_cast<T>(0.0f);
CHECK_MOORE(musaMemcpyAsync(output, &zero, sizeof(T), musaMemcpyHostToDevice, stream));
size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
sumAllKernel<BLOCK_SIZE, T, T><<<grid_size, BLOCK_SIZE, BLOCK_SIZE * sizeof(T), stream>>>(
output, input, input_size, input_ndim, permuted_input_shape_musa, permuted_input_strides_musa);
}
} else {
size_t grid_size = (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
sumKernel<BLOCK_SIZE, T><<<grid_size, BLOCK_SIZE, 0, stream>>>(
output, input, input_ndim, output_ndim, output_size, reduce_num,
permuted_input_shape_musa, output_shape_musa, permuted_input_strides_musa, output_strides_musa);
}
return INFINI_STATUS_SUCCESS;
}
} // namespace
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
const void *input,
void *stream_) const {
musaStream_t stream = (musaStream_t)stream_;
#define CALCULATE_SUM(BLOCK_SIZE, T) \
launchKernel<BLOCK_SIZE, T>( \
_info, \
(T *)output, (const T *)input, \
stream, workspace, workspace_size)
#define CALCULATE_SUM_WITH_BLOCK_SIZE(BLOCK_SIZE) \
{ \
if (_info.dtype == INFINI_DTYPE_BF16) \
return CALCULATE_SUM(BLOCK_SIZE, __mt_bfloat16); \
else if (_info.dtype == INFINI_DTYPE_F16) \
return CALCULATE_SUM(BLOCK_SIZE, half); \
else if (_info.dtype == INFINI_DTYPE_F32) \
return CALCULATE_SUM(BLOCK_SIZE, float); \
else \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) {
CALCULATE_SUM_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_1024)
} else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) {
CALCULATE_SUM_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_512)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::sum::moore
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
#include "sum_nvidia.cuh"
namespace op::sum::nvidia {
struct Descriptor::Opaque {
std::shared_ptr<device::nvidia::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
size_t *dim,
size_t dim_size,
bool keepdim) {
auto result = SumInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
CHECK_RESULT(result);
auto info = result.take();
size_t workspace_size = 0;
workspace_size += (input_desc->ndim() + output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t));
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
info, workspace_size, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
namespace {
template <size_t BLOCK_SIZE, typename T>
infiniStatus_t launchKernel(
const SumInfo &info,
T *output, const T *input,
cudaStream_t stream, void *workspace, size_t workspace_size) {
size_t input_ndim = info.permuted_input_shape.size();
size_t output_ndim = info.output_shape.size();
size_t input_size = info.input_size;
size_t output_size = info.output_size;
size_t reduce_num = info.reduce_num;
unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
size_t workspace_offset = 0;
size_t *permuted_input_shape_cuda = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
size_t *output_shape_cuda = permuted_input_shape_cuda + input_ndim;
workspace_offset += (input_ndim + output_ndim) * sizeof(size_t);
ptrdiff_t *permuted_input_strides_cuda = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
ptrdiff_t *output_strides_cuda = permuted_input_strides_cuda + input_ndim;
workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t);
CHECK_CUDA(cudaMemcpyAsync(permuted_input_shape_cuda, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream));
CHECK_CUDA(cudaMemcpyAsync(output_shape_cuda, info.output_shape.data(), output_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream));
CHECK_CUDA(cudaMemcpyAsync(permuted_input_strides_cuda, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream));
CHECK_CUDA(cudaMemcpyAsync(output_strides_cuda, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream));
if (info.reduce_num == input_size) {
T zero = static_cast<T>(0.0f);
CHECK_CUDA(cudaMemcpyAsync(output, &zero, sizeof(T), cudaMemcpyHostToDevice, stream));
size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
sumAllKernel<BLOCK_SIZE, T, T><<<grid_size, BLOCK_SIZE, BLOCK_SIZE * sizeof(T), stream>>>(
output, input, input_size, input_ndim, permuted_input_shape_cuda, permuted_input_strides_cuda);
} else {
size_t grid_size = (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
sumKernel<BLOCK_SIZE, T><<<grid_size, BLOCK_SIZE, 0, stream>>>(
output, input, input_ndim, output_ndim, output_size, reduce_num,
permuted_input_shape_cuda, output_shape_cuda, permuted_input_strides_cuda, output_strides_cuda);
}
return INFINI_STATUS_SUCCESS;
}
} // namespace
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
const void *input,
void *stream_) const {
cudaStream_t stream = (cudaStream_t)stream_;
#define CALCULATE_SUM(BLOCK_SIZE, T) \
launchKernel<BLOCK_SIZE, T>( \
_info, \
(T *)output, (const T *)input, \
stream, workspace, workspace_size)
#define CALCULATE_SUM_WITH_BLOCK_SIZE(BLOCK_SIZE) \
{ \
if (_info.dtype == INFINI_DTYPE_BF16) \
return CALCULATE_SUM(BLOCK_SIZE, __nv_bfloat16); \
else if (_info.dtype == INFINI_DTYPE_F16) \
return CALCULATE_SUM(BLOCK_SIZE, half); \
else if (_info.dtype == INFINI_DTYPE_F32) \
return CALCULATE_SUM(BLOCK_SIZE, float); \
else \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
CALCULATE_SUM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024)
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
CALCULATE_SUM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512)
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
CALCULATE_SUM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::sum::nvidia
#ifndef __SUM_NVIDIA_H__
#define __SUM_NVIDIA_H__
#include "../sum_desc.h"
DESCRIPTOR(nvidia);
#endif // __SUM_NVIDIA_H__
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/sum.h"
#include <vector>
#ifdef ENABLE_CPU_API
#include "cpu/sum_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#include "nvidia/sum_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/sum_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/sum_kunlun.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/sum_moore.h"
#endif
__INFINI_C infiniStatus_t infiniopCreateSumDescriptor(
infiniopHandle_t handle,
infiniopSumDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
size_t *dim,
size_t dim_size,
bool keepdim) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::sum::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::sum::NAMESPACE::Descriptor **>(desc_ptr), \
output_desc, \
input_desc, \
dim, \
dim_size, \
keepdim)
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
CREATE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CREATE
}
__INFINI_C infiniStatus_t infiniopGetSumWorkspaceSize(infiniopSumDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::sum::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
GET(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__INFINI_C infiniStatus_t infiniopSum(
infiniopSumDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *output,
const void *input,
size_t *dim,
size_t dim_size,
bool keepdim,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::sum::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, output, input, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CALCULATE
}
__INFINI_C infiniStatus_t
infiniopDestroySumDescriptor(infiniopSumDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::sum::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
DELETE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DELETE
}
#ifndef INFINIOP_SUM_DESCRIPTOR_H_
#define INFINIOP_SUM_DESCRIPTOR_H_
#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
#include "info.h"
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::sum::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
SumInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
Opaque *opaque, \
SumInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t output_desc, \
infiniopTensorDescriptor_t input_desc, \
size_t *dim, \
size_t dim_size, \
bool keepdim); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *output, \
const void *input, \
void *stream) const; \
}; \
}
#endif
#include "topk_cpu.h"
#include "../../../../utils.h"
#include "../../../devices/cpu/common_cpu.h"
#include <algorithm>
#include <vector>
namespace op::topk::cpu {
Descriptor::~Descriptor() {}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t values_output_desc,
infiniopTensorDescriptor_t indices_output_desc,
infiniopTensorDescriptor_t input_desc,
size_t k,
size_t dim,
bool largest,
bool sorted) {
auto result = TopKInfo::create(values_output_desc, indices_output_desc, input_desc, k, dim, largest, sorted);
CHECK_RESULT(result);
*desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
namespace {
template <typename Tdata>
infiniStatus_t calculateTopK(
const TopKInfo &info,
Tdata *values_output,
int32_t *indices_output,
const Tdata *input,
size_t k,
size_t dim,
bool largest,
bool sorted) {
if (k == 0) {
return INFINI_STATUS_SUCCESS;
}
for (size_t i = 0; i < info.n_iteration; i++) {
size_t index = i;
size_t input_start = 0;
size_t output_start = 0;
for (size_t j = info.ndim; j-- > 0;) { // iterate dims from innermost outward; a `j >= 0` test on size_t would never terminate
if (j == dim) {
continue;
}
input_start += (index % info.input_shape[j]) * info.input_strides[j];
output_start += (index % info.output_shape[j]) * info.output_strides[j];
index /= info.input_shape[j];
}
using elem_t = std::pair<Tdata, size_t>;
std::vector<elem_t> vi_queue(info.dim_elements);
for (size_t j = 0; j < info.dim_elements; j++) {
vi_queue[j].first = input[input_start + j * info.input_strides[dim]];
vi_queue[j].second = j;
}
bool use_partial_sort = static_cast<size_t>(k) * 64 <= info.dim_elements; // partial_sort only when k is small relative to the row length
if (use_partial_sort) {
if (largest) {
std::partial_sort(vi_queue.begin(), vi_queue.begin() + k, vi_queue.end(),
[](const elem_t &a, const elem_t &b) -> bool {
return utils::cast<float>(a.first) > utils::cast<float>(b.first);
});
} else {
std::partial_sort(vi_queue.begin(), vi_queue.begin() + k, vi_queue.end(),
[](const elem_t &a, const elem_t &b) -> bool {
return utils::cast<float>(a.first) < utils::cast<float>(b.first);
});
}
} else {
if (largest) {
std::nth_element(vi_queue.begin(), vi_queue.begin() + k - 1, vi_queue.end(),
[](const elem_t &a, const elem_t &b) -> bool {
return utils::cast<float>(a.first) > utils::cast<float>(b.first);
});
if (sorted) {
std::sort(vi_queue.begin(), vi_queue.begin() + k, // note: PyTorch sorts the first k elements here, not k - 1
[](const elem_t &a, const elem_t &b) -> bool {
return utils::cast<float>(a.first) > utils::cast<float>(b.first);
});
}
} else {
std::nth_element(vi_queue.begin(), vi_queue.begin() + k - 1, vi_queue.end(),
[](const elem_t &a, const elem_t &b) -> bool {
return utils::cast<float>(a.first) < utils::cast<float>(b.first);
});
if (sorted) {
std::sort(vi_queue.begin(), vi_queue.begin() + k, // note: PyTorch sorts the first k elements here, not k - 1
[](const elem_t &a, const elem_t &b) -> bool {
return utils::cast<float>(a.first) < utils::cast<float>(b.first);
});
}
}
}
for (size_t j = 0; j < k; j++) {
values_output[output_start + j * info.output_strides[dim]] = vi_queue[j].first;
indices_output[output_start + j * info.output_strides[dim]] = (int32_t)vi_queue[j].second;
}
}
return INFINI_STATUS_SUCCESS;
}
} // namespace
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *values_output,
void *indices_output,
const void *input,
size_t k,
size_t dim,
bool largest,
bool sorted,
void *stream) const {
switch (_info.dtype) {
case INFINI_DTYPE_F16:
return calculateTopK<fp16_t>(_info, (fp16_t *)values_output, (int32_t *)indices_output, reinterpret_cast<const fp16_t *>(input), k, dim, largest, sorted);
case INFINI_DTYPE_F32:
return calculateTopK<float>(_info, (float *)values_output, (int32_t *)indices_output, reinterpret_cast<const float *>(input), k, dim, largest, sorted);
case INFINI_DTYPE_BF16:
return calculateTopK<bf16_t>(_info, (bf16_t *)values_output, (int32_t *)indices_output, reinterpret_cast<const bf16_t *>(input), k, dim, largest, sorted);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::topk::cpu
#ifndef __INFINIOP_TOPK_CPU_H__
#define __INFINIOP_TOPK_CPU_H__
#include "../topk_desc.h"
DESCRIPTOR(cpu);
#endif // __INFINIOP_TOPK_CPU_H__
#ifndef __TOPK_CUDA_KERNEL_CUH__
#define __TOPK_CUDA_KERNEL_CUH__
#include <cmath> // NAN
#include <cub/block/block_radix_sort.cuh>
#include <stdint.h>
namespace op::topk::cuda {
__forceinline__ __device__ __host__ size_t baseOffsetExcludingDim(
size_t flat_row,
size_t ndim,
const size_t *shape,
const ptrdiff_t *strides,
size_t dim) {
size_t res = 0;
for (size_t i = ndim; i-- > 0;) {
if (i == dim) {
continue;
}
res += (flat_row % shape[i]) * strides[i];
flat_row /= shape[i];
}
return res;
}
__forceinline__ __device__ __host__ size_t indexToOffset(
size_t flat_index,
size_t ndim,
const size_t *shape,
const ptrdiff_t *strides) {
size_t res = 0;
for (size_t i = ndim; i-- > 0;) {
res += (flat_index % shape[i]) * strides[i];
flat_index /= shape[i];
}
return res;
}
template <typename Tdata>
__device__ __forceinline__ float to_float(Tdata v);
template <>
__device__ __forceinline__ float to_float<float>(float v) { return v; }
template <>
__device__ __forceinline__ float to_float<half>(half v) { return __half2float(v); }
#if defined(ENABLE_MOORE_API)
using bf16_t = __mt_bfloat16;
#elif defined(ENABLE_METAX_API)
using bf16_t = __hpcc_bfloat16;
#else
// CUDA / NVIDIA / ILUVATAR
using bf16_t = __nv_bfloat16;
#endif
template <>
__device__ __forceinline__ float to_float<bf16_t>(bf16_t v) {
return __bfloat162float(v);
}
// float -> ordered uint32: invert all bits of a negative and set the sign bit of a
// non-negative so that unsigned integer comparison matches the float ordering.
__device__ __forceinline__ uint32_t float_to_uint_ordered(float value) {
uint32_t bits = *reinterpret_cast<uint32_t *>(&value);
uint32_t mask = (uint32_t)((int32_t)bits >> 31) | 0x80000000u;
return bits ^ mask;
}
template <typename Tdata>
__global__ void gather_rowwise(const Tdata *input, uint32_t *cur_vals, int32_t *cur_idx,
size_t rows, size_t n, size_t ndim, size_t dim, const size_t *shape, const ptrdiff_t *strides) {
size_t row = blockIdx.y;
size_t i = threadIdx.x + blockIdx.x * blockDim.x;
if (row >= rows || i >= n) {
return;
}
size_t base = baseOffsetExcludingDim(row, ndim, shape, strides, dim);
size_t off = base + i * strides[dim];
cur_vals[row * n + i] = float_to_uint_ordered(to_float<Tdata>(input[off]));
cur_idx[row * n + i] = i;
}
__global__ void init_row_state(int32_t *cur_n, int32_t *rem_k, int32_t *out_pos, size_t rows, size_t n, size_t k) {
int32_t r = blockIdx.x * blockDim.x + threadIdx.x;
if (r < rows) {
cur_n[r] = n;
rem_k[r] = k;
out_pos[r] = 0;
}
}
__global__ void zero_row_counters(int32_t *ones_count, int32_t *zeros_count, size_t rows) {
int r = blockIdx.x * blockDim.x + threadIdx.x;
if (r < rows) {
ones_count[r] = 0;
zeros_count[r] = 0;
}
}
template <size_t BLOCK_SIZE>
__global__ void partition_rowwise(const uint32_t *cur_vals, int32_t *cur_idx, uint32_t *ones_vals, int32_t *ones_idx,
uint32_t *zeros_vals, int32_t *zeros_idx, const int32_t *cur_n, size_t rows, size_t n,
int32_t bit_pos, bool largest, int32_t *ones_count, int32_t *zeros_count) {
int32_t row = blockIdx.y;
if (row >= rows) {
return;
}
__shared__ uint32_t sh1_vals[BLOCK_SIZE];
__shared__ int32_t sh1_idx[BLOCK_SIZE];
__shared__ uint32_t sh0_vals[BLOCK_SIZE];
__shared__ int32_t sh0_idx[BLOCK_SIZE];
__shared__ int sh1_n, sh0_n;
__shared__ int32_t base1, base0;
int32_t tid = threadIdx.x;
if (tid == 0) {
sh1_n = 0;
sh0_n = 0;
}
__syncthreads();
int32_t i = blockIdx.x * blockDim.x + tid;
int32_t cn = cur_n[row];
if (i < cn) {
int32_t off = row * n + i;
int32_t idx = cur_idx[off];
uint32_t key = cur_vals[off];
uint32_t cmp_key = largest ? key : ~key;
int32_t b = (cmp_key >> bit_pos) & 1;
if (b) {
int32_t p = atomicAdd(&sh1_n, 1);
sh1_vals[p] = key;
sh1_idx[p] = idx;
} else {
int32_t p = atomicAdd(&sh0_n, 1);
sh0_vals[p] = key;
sh0_idx[p] = idx;
}
}
__syncthreads();
if (tid == 0) {
base1 = atomicAdd(&ones_count[row], sh1_n);
base0 = atomicAdd(&zeros_count[row], sh0_n);
}
__syncthreads();
for (int32_t j = tid; j < sh1_n; j += blockDim.x) {
int32_t o = row * n + base1 + j;
ones_vals[o] = sh1_vals[j];
ones_idx[o] = sh1_idx[j];
}
for (int32_t j = tid; j < sh0_n; j += blockDim.x) {
int32_t o = row * n + base0 + j;
zeros_vals[o] = sh0_vals[j];
zeros_idx[o] = sh0_idx[j];
}
}
template <size_t BLOCK_SIZE>
__global__ void decide_and_compact(uint32_t *cur_vals, int32_t *cur_idx, const uint32_t *ones_vals, const int32_t *ones_idx, const uint32_t *zeros_vals, const int32_t *zeros_idx,
const int32_t *ones_count, const int32_t *zeros_count, int32_t *cur_n, int32_t *rem_k, int32_t *out_pos,
uint32_t *sel_vals, int32_t *sel_idx, size_t rows, size_t n, size_t k) {
int32_t row = blockIdx.x;
if (row >= rows) {
return;
}
int32_t tid = threadIdx.x;
int32_t rem = rem_k[row];
if (rem <= 0) {
return;
}
int32_t oc = ones_count[row];
int32_t zc = zeros_count[row];
int32_t pos = out_pos[row];
bool keep_ones = (oc >= rem);
if (!keep_ones) {
for (int32_t j = tid; j < oc; j += blockDim.x) {
if (pos + j < k) {
int32_t o = row * n + j;
sel_vals[row * k + pos + j] = ones_vals[o];
sel_idx[row * k + pos + j] = ones_idx[o];
}
}
}
__syncthreads();
if (tid == 0) {
if (keep_ones) {
cur_n[row] = oc;
} else {
out_pos[row] = pos + oc;
rem_k[row] = rem - oc;
cur_n[row] = zc;
}
}
__syncthreads();
int32_t new_n = cur_n[row];
for (int32_t j = tid; j < new_n; j += blockDim.x) {
int32_t o = row * n + j;
cur_vals[o] = keep_ones ? ones_vals[o] : zeros_vals[o];
cur_idx[o] = keep_ones ? ones_idx[o] : zeros_idx[o];
}
}
template <size_t BLOCK_SIZE>
__global__ void take_remaining(const uint32_t *cur_vals, const int32_t *cur_idx, const int32_t *cur_n, const int32_t *rem_k, const int32_t *out_pos,
uint32_t *sel_vals, int32_t *sel_idx, size_t rows, size_t n, size_t k) {
int32_t row = blockIdx.x;
int32_t tid = threadIdx.x;
if (row >= rows) {
return;
}
int32_t rem = rem_k[row];
int32_t pos = out_pos[row];
int32_t cn = cur_n[row];
int32_t take = rem;
if (take > cn) {
take = cn;
}
for (int32_t j = tid; j < take; j += blockDim.x) {
if (pos + j < k) {
int32_t o = row * k + pos + j;
sel_vals[o] = cur_vals[row * n + j];
sel_idx[o] = cur_idx[row * n + j];
}
}
}
template <typename Tdata>
__global__ void scatter_to_output(const Tdata *input, const int32_t *sel_idx, Tdata *values_out, int32_t *indices_out,
size_t rows, size_t k, size_t ndim, size_t dim, const size_t *input_shape, const ptrdiff_t *input_strides,
const size_t *output_shape, const ptrdiff_t *output_strides) {
int32_t row = blockIdx.y;
int32_t j = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= rows || j >= k) {
return;
}
int32_t output_base = baseOffsetExcludingDim(row, ndim, output_shape, output_strides, dim);
int32_t output_off = output_base + j * output_strides[dim];
int32_t input_base = baseOffsetExcludingDim(row, ndim, input_shape, input_strides, dim);
int32_t input_off = input_base + sel_idx[row * k + j] * input_strides[dim];
values_out[output_off] = input[input_off];
indices_out[output_off] = sel_idx[row * k + j];
}
} // namespace op::topk::cuda
#endif // __TOPK_CUDA_KERNEL_CUH__
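The kernels above implement a per-row, most-significant-bit-first radix selection: keys are mapped to order-preserving unsigned integers, each pass splits the surviving candidates by one bit, the ones bucket is kept when it still holds at least the remaining k elements, and otherwise it is emitted wholesale and the search continues in the zeros bucket. A hedged host-side sketch of the same selection loop (standalone C++, not the device code; the function name and sample data are chosen here for illustration):

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Order-preserving float -> uint32 mapping, same idea as float_to_uint_ordered above.
static uint32_t ordered_bits(float v) {
    uint32_t b;
    std::memcpy(&b, &v, sizeof(b));
    uint32_t mask = (b & 0x80000000u) ? 0xFFFFFFFFu : 0x80000000u;
    return b ^ mask;
}

// Host-side sketch of the bitwise top-k selection that partition_rowwise /
// decide_and_compact / take_remaining perform per row on the device.
static std::vector<size_t> topk_indices(const std::vector<float> &x, size_t k, bool largest) {
    std::vector<uint32_t> keys(x.size());
    std::vector<size_t> cur(x.size());
    for (size_t i = 0; i < x.size(); ++i) {
        keys[i] = ordered_bits(x[i]);
        if (!largest) keys[i] = ~keys[i];   // invert so the "largest" logic selects the smallest
        cur[i] = i;
    }
    std::vector<size_t> selected;
    for (int bit = 31; bit >= 0 && selected.size() < k; --bit) {
        std::vector<size_t> ones, zeros;
        for (size_t i : cur)
            ((keys[i] >> bit) & 1 ? ones : zeros).push_back(i);
        size_t rem = k - selected.size();
        if (ones.size() >= rem) {
            cur = ones;                     // the answer lies entirely in the ones bucket
        } else {
            selected.insert(selected.end(), ones.begin(), ones.end());
            cur = zeros;                    // take all ones, keep refining the zeros
        }
    }
    for (size_t i : cur) {                  // remaining candidates tie on every inspected bit
        if (selected.size() >= k) break;
        selected.push_back(i);
    }
    return selected;
}

int main() {
    std::vector<float> x = {0.1f, -2.5f, 3.0f, 7.5f, -0.5f, 3.0f};
    for (size_t i : topk_indices(x, 3, /*largest=*/true)) std::printf("%zu ", i); // 3 2 5 (tie order may vary)
    std::printf("\n");
    return 0;
}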