Commit c2e87202 authored by Catheriany

Merge remote-tracking branch 'origin/main' into issue/142

parents 41818f84 c203635b
#ifndef __MUL_CPU_H__
#define __MUL_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR(mul, cpu)
namespace op::mul::cpu {
typedef struct MulOp {
public:
static constexpr size_t num_inputs = 2;
template <typename T>
T operator()(const T &a, const T &b) const {
return a * b;
}
} MulOp;
} // namespace op::mul::cpu
#endif // __MUL_CPU_H__
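// Illustrative sketch (an addition, not part of the diff): the elementwise
// framework only requires the functor to expose `num_inputs` and a templated
// call operator, which it applies per element, e.g. MulOp{}(3.0f, 4.0f) == 12.0f.
// A minimal host-side loop with the same contract (`applyBinary` is a
// hypothetical name):
template <typename Op, typename T>
void applyBinary(const Op &op, const T *a, const T *b, T *c, size_t n) {
static_assert(Op::num_inputs == 2, "binary elementwise op");
for (size_t i = 0; i < n; ++i) {
c[i] = op(a[i], b[i]);
}
}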
#include "mul_cuda.cuh"
#include "mul_cuda_internal.cuh"
namespace op::mul::cuda {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &a_desc = input_desc_vec.at(0);
const auto &b_desc = input_desc_vec.at(1);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
// create CUDA elementwise descriptor
CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, MulOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, MulOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, MulOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::mul::cuda
#ifndef __MUL_CUDA_API_H__
#define __MUL_CUDA_API_H__
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
ELEMENTWISE_DESCRIPTOR(mul, cuda)
#endif // __MUL_CUDA_API_H__
#ifndef __MUL_CUDA_H__
#define __MUL_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_fp16.h>
namespace op::mul::cuda {
typedef struct MulOp {
static constexpr size_t num_inputs = 2;
template <typename T>
__device__ __forceinline__ T operator()(const T &a, const T &b) const {
if constexpr (std::is_same_v<T, half2>) {
return __hmul2(a, b);
} else if constexpr (std::is_same_v<T, half>) {
return __hmul(a, b);
} else if constexpr (std::is_same_v<T, float>) {
return __fmul_rn(a, b);
} else {
return a * b;
}
}
} MulOp;
} // namespace op::mul::cuda
#endif // __MUL_CUDA_H__
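// Illustrative standalone kernel (an addition, not part of the diff) showing
// why the half2 branch above matters: when two adjacent fp16 values are packed
// into one half2, a single __hmul2 multiplies both lanes at once. The kernel
// name is hypothetical.
__global__ void mulHalf2Demo(const half2 *a, const half2 *b, half2 *c, size_t n) {
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) {
c[i] = __hmul2(a[i], b[i]); // e.g. (2,3) * (4,5) -> (8,15)
}
}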
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/mul.h"
#ifdef ENABLE_CPU_API
#include "cpu/mul_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/mul_cuda.cuh"
#endif
__C infiniStatus_t infiniopCreateMulDescriptor(
infiniopHandle_t handle,
infiniopMulDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::mul::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::mul::NAMESPACE::Descriptor **>(desc_ptr), \
c_desc, \
{a_desc, \
b_desc})
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CREATE
}
__C infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::mul::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_CUDA_API
GET(INFINI_DEVICE_NVIDIA, cuda)
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniStatus_t infiniopMul(
infiniopMulDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *c,
const void *a,
const void *b,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::mul::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, c, {a, b}, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CALCULATE
}
__C infiniStatus_t
infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::mul::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DELETE
}
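// Illustrative end-to-end use of the Mul C API above (an addition, not part of
// the diff). Assumes the caller already owns a handle, tensor descriptors,
// device buffers and a stream; `runMul` is a hypothetical helper.
static infiniStatus_t runMul(
infiniopHandle_t handle,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
void *c, const void *a, const void *b,
void *workspace, size_t workspace_capacity,
void *stream) {
infiniopMulDescriptor_t desc;
auto status = infiniopCreateMulDescriptor(handle, &desc, c_desc, a_desc, b_desc);
if (status != INFINI_STATUS_SUCCESS) {
return status;
}
size_t workspace_size = 0;
status = infiniopGetMulWorkspaceSize(desc, &workspace_size);
if (status == INFINI_STATUS_SUCCESS && workspace_size <= workspace_capacity) {
status = infiniopMul(desc, workspace, workspace_size, c, a, b, stream);
}
infiniopDestroyMulDescriptor(desc);
return status;
}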
#include "random_sample_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../../devices/cpu/cpu_handle.h"
#include "../../../tensor.h"
#include "../info.h"
#include "infinicore.h"
#include <algorithm>
namespace op::random_sample::cpu {
@@ -15,29 +15,14 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t probs_desc) {
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto dt_i = result_desc->dtype();
auto dt_p = probs_desc->dtype();
CHECK_DTYPE(dt_i,
INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64,
INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
CHECK_DTYPE(dt_p, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_API_OR(result_desc->ndim(), 0,
return INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_API_OR(probs_desc->ndim(), 1,
return INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_API_OR(probs_desc->stride(0), 1,
return INFINI_STATUS_BAD_TENSOR_STRIDES);
auto result = RandomSampleInfo::create(result_desc, probs_desc);
CHECK_RESULT(result);
*desc_ptr = new Descriptor(
dt_i,
dt_p,
probs_desc->dim(0),
result.take(),
0,
nullptr,
handle->device,
handle->device_id);
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
@@ -55,36 +40,42 @@ struct ComputeType<fp16_t> {
using type = float;
};
template <class Tidx, class Tval>
struct Scheme {
using Tcompute = typename ComputeType<Tval>::type;
struct Algo {
static Tcompute get(void const *ptr, size_t i) {
return utils::cast<Tcompute, Tval>(reinterpret_cast<Tval const *>(ptr)[i]);
template <class Tidx, class Tval>
static auto get(void const *ptr, size_t i) {
return utils::cast<typename ComputeType<Tval>::type, Tval>(reinterpret_cast<Tval const *>(ptr)[i]);
}
static void argmax(
void *result, void const *probs, size_t n) {
template <class Tidx, class Tval>
infiniStatus_t argmax(
void *workspace, size_t workspace_size,
void *result, void const *probs, size_t n,
void *stream) {
auto idx = reinterpret_cast<Tidx *>(result);
*idx = 0;
auto max_val = get(probs, 0);
auto max_val = get<Tidx, Tval>(probs, 0);
for (size_t i = 0; i < n; i++) {
if (auto val = get(probs, i); val > max_val) {
if (auto val = get<Tidx, Tval>(probs, i); val > max_val) {
max_val = val;
*idx = static_cast<Tidx>(i);
}
}
return INFINI_STATUS_SUCCESS;
}
static void random(
template <class Tidx, class Tval>
infiniStatus_t random(
void *workspace, size_t workspace_size,
void *result, void const *probs, size_t n,
float random_val, float topp, int topk, float temperature) {
float random_val, float topp, int topk, float temperature,
void *stream) {
struct KVPair {
Tidx idx;
Tcompute val;
typename ComputeType<Tval>::type val;
bool operator<(const KVPair &other) const {
return val > other.val;
@@ -95,7 +86,7 @@ struct Scheme {
// build & sort
std::vector<KVPair> pairs(n);
for (size_t i = 0; i < n; i++) {
pairs[i] = {static_cast<Tidx>(i), get(probs, i)};
pairs[i] = {static_cast<Tidx>(i), get<Tidx, Tval>(probs, i)};
}
std::sort(pairs.begin(), pairs.end());
// softmax & sum
@@ -115,68 +106,10 @@ struct Scheme {
break;
}
}
}
};
template <class Tidx, class Tval>
void switch_f(
size_t n,
void *result, const void *probs,
float random_val, float topp, int topk, float temperature) {
if (random_val == 0 || topp == 0 || topk == 1 || temperature == 0) {
Scheme<Tidx, Tval>::argmax(result, probs, n);
} else {
Scheme<Tidx, Tval>::random(result, probs, n, random_val, topp, topk, temperature);
}
}
template <class Tidx>
void switch_val(
infiniDtype_t dt_p, size_t n,
void *result, void const *probs,
float random_val, float topp, int topk, float temperature) {
switch (dt_p) {
case INFINI_DTYPE_F16:
switch_f<Tidx, fp16_t>(n, result, probs, random_val, topp, topk, temperature);
break;
case INFINI_DTYPE_F32:
switch_f<Tidx, float>(n, result, probs, random_val, topp, topk, temperature);
break;
case INFINI_DTYPE_F64:
switch_f<Tidx, double>(n, result, probs, random_val, topp, topk, temperature);
break;
default:
// unreachable
std::abort();
}
}
void switch_idx(
infiniDtype_t dt_i, infiniDtype_t dt_p, size_t n,
void *result, void const *probs,
float random_val, float topp, int topk, float temperature) {
#define CASE(DT_VAL, DT_TYP) \
case DT_VAL: \
switch_val<DT_TYP>(dt_p, n, result, probs, random_val, topp, topk, temperature); \
break
switch (dt_i) {
CASE(INFINI_DTYPE_I8, int8_t);
CASE(INFINI_DTYPE_I16, int16_t);
CASE(INFINI_DTYPE_I32, int32_t);
CASE(INFINI_DTYPE_I64, int64_t);
CASE(INFINI_DTYPE_U8, uint8_t);
CASE(INFINI_DTYPE_U16, uint16_t);
CASE(INFINI_DTYPE_U32, uint32_t);
CASE(INFINI_DTYPE_U64, uint64_t);
default:
// unreachable
std::abort();
return INFINI_STATUS_SUCCESS;
}
#undef CASE
}
};
infiniStatus_t Descriptor::calculate(
void *workspace,
@@ -189,7 +122,11 @@ infiniStatus_t Descriptor::calculate(
float temperature,
void *stream) const {
switch_idx(_dt_i, _dt_p, _n, result, probs, random_val, topp, topk, temperature);
Calculate::calculate<Algo>(
Algo{}, _info, workspace, workspace_size,
result, probs,
random_val, topp, topk, temperature,
stream);
return INFINI_STATUS_SUCCESS;
}
......
#include "../../../devices/cuda/cuda_handle.cuh"
#include "../info.h"
#include "random_sample_cuda.cuh"
#include "random_sample_kernel.cuh"
namespace op::random_sample::cuda {
struct Descriptor::Opaque {
std::shared_ptr<device::cuda::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t result_desc,
infiniopTensorDescriptor_t probs_desc) {
auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);
auto result = RandomSampleInfo::create(result_desc, probs_desc);
CHECK_RESULT(result);
auto info = result.take();
size_t workspace_size;
#define CASE_P(CASE, Tidx, Tval) \
case CASE: { \
auto workspace_result = calculateWorkspace<Tidx, Tval>(info.n); \
CHECK_RESULT(workspace_result); \
workspace_size = workspace_result.take(); \
} break
#define CASE_I(CASE, Tidx) \
case CASE: \
switch (info.dt_p) { \
CASE_P(INFINI_DTYPE_F16, Tidx, half); \
CASE_P(INFINI_DTYPE_F32, Tidx, float); \
CASE_P(INFINI_DTYPE_F64, Tidx, double); \
default: \
abort(); \
} \
break
switch (info.dt_i) {
CASE_I(INFINI_DTYPE_I8, int8_t);
CASE_I(INFINI_DTYPE_I16, int16_t);
CASE_I(INFINI_DTYPE_I32, int32_t);
CASE_I(INFINI_DTYPE_I64, int64_t);
CASE_I(INFINI_DTYPE_U8, uint8_t);
CASE_I(INFINI_DTYPE_U16, uint16_t);
CASE_I(INFINI_DTYPE_U32, uint32_t);
CASE_I(INFINI_DTYPE_U64, uint64_t);
default:
abort();
}
#undef CASE_I
#undef CASE_P
*desc_ptr = new Descriptor(
info,
workspace_size,
new Opaque{handle->internal()},
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
size_t Descriptor::minWorkspaceSize() const {
return _min_workspace_size;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *result,
const void *probs,
float random_val,
float topp,
int topk,
float temperature,
void *stream) const {
if (workspace_size < _min_workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
auto block_size = _opaque->internal->blockSizeX();
Calculate::calculate<Algo>(
Algo{block_size}, _info, workspace, workspace_size,
result, probs,
random_val, topp, topk, temperature,
stream);
return INFINI_STATUS_SUCCESS;
}
} // namespace op::random_sample::cuda
#ifndef __RANDOM_SAMPLE_CUDA_CUH__
#define __RANDOM_SAMPLE_CUDA_CUH__
#include "../random_sample.h"
DESCRIPTOR(cuda)
#endif // __RANDOM_SAMPLE_CUDA_CUH__
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include "infinicore.h"
#include <cub/device/device_radix_sort.cuh>
#include <cub/device/device_reduce.cuh>
#include <cub/device/device_scan.cuh>
namespace op::random_sample::cuda {
// ↓↓↓ Re-wrap the cub API to reduce template parameters and simplify call sites
template <class T>
static cudaError argMax_(
cub::KeyValuePair<int, T> *kv_pair,
const T *logits,
int n,
void *workspace_ptr,
size_t &workspace_len,
cudaStream_t stream) {
return cub::DeviceReduce::ArgMax(
workspace_ptr, workspace_len,
logits, kv_pair, n,
stream);
}
template <class Tval, class Tidx>
static cudaError radixSort(
void *workspace_ptr, size_t &workspace_len,
const Tval *key_in, Tval *key_out,
const Tidx *val_in, Tidx *val_out,
int n,
cudaStream_t stream) {
return cub::DeviceRadixSort::SortPairsDescending(
workspace_ptr, workspace_len,
key_in, key_out,
val_in, val_out,
n,
0, sizeof(Tval) * 8,
stream);
}
template <class T>
static cudaError inclusiveSum(
void *workspace_ptr, size_t &workspace_len,
T *data, int n,
cudaStream_t stream) {
return cub::DeviceScan::InclusiveSum(
workspace_ptr, workspace_len,
data, data, n,
stream);
}
// ↑↑↑ Re-wrap the cub API to reduce template parameters and simplify call sites
// ↓↓↓ Workspace size calculation
// Align addresses to 256 bytes
static constexpr size_t align256(size_t size) {
return (size + 255) & (~255);
}
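// Illustrative compile-time checks (an addition): the bit trick rounds up to
// the next multiple of 256 by adding 255 and clearing the low eight bits.
static_assert(align256(0) == 0, "already aligned");
static_assert(align256(1) == 256, "rounds up");
static_assert(align256(256) == 256, "multiples are unchanged");
static_assert(align256(257) == 512, "next multiple");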
template <class Tidx, class Tval>
utils::Result<size_t> calculateWorkspace(size_t n_) {
const auto n = static_cast<int>(n_);
size_t argmax;
CHECK_CUDA(argMax_<Tval>(
nullptr, nullptr, n,
nullptr, argmax,
nullptr));
// the first 256 bytes hold the kv pair
argmax += 256;
// indices
size_t size_random = align256(sizeof(Tidx) * n);
// sorted
size_random += align256(sizeof(Tval) * n);
// indices_out
size_random += align256(sizeof(Tidx) * n);
// cub device api
size_t size_radix_sort;
CHECK_CUDA((radixSort<Tval, Tidx>(
nullptr, size_radix_sort,
nullptr, nullptr,
nullptr, nullptr,
n,
nullptr)));
size_t size_inclusive_sum;
CHECK_CUDA(inclusiveSum<Tval>(
nullptr, size_inclusive_sum,
nullptr, n,
nullptr));
size_random += cub::Max()(size_radix_sort, size_inclusive_sum);
return utils::Result<size_t>(cub::Max()(argmax, size_random));
}
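// Illustrative summary of the layout computed above (an addition). For the
// random path with n elements the single buffer is carved as:
// [indices: align256(sizeof(Tidx)*n)] [sorted: align256(sizeof(Tval)*n)]
// [indices_out: align256(sizeof(Tidx)*n)] [cub scratch: max(radix sort, inclusive sum)]
// For the argmax path: [kv pair: 256 bytes] [cub scratch for ArgMax].
// The returned size is the max of the two paths, so one buffer serves both.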
// ↑↑↑ Workspace size calculation
// ↓↓↓ Map fp16_t to half via template specialization
template <class Tval>
struct CudaTval {
using Type = Tval;
};
template <>
struct CudaTval<fp16_t> {
using Type = half;
};
// ↑↑↑ Map fp16_t to half via template specialization
// ↓↓↓ Small kernels used by the sampling pipeline
// The cub::DeviceReduce::ArgMax shipped with CUDA Toolkit 11.x only accepts a cub::KeyValuePair<int, Tval> output.
// This kernel extracts the index from the pair.
template <class Tidx, class Tval>
static __global__ void castIdx(Tidx *result, const cub::KeyValuePair<int, Tval> *kv_pair) {
*result = kv_pair->key;
}
// Fill the index array required by the sort
template <class Tidx>
static __global__ void fillIndices(Tidx *indices, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) {
indices[i] = i;
}
}
// The softmax used by random sample reduces to a simple element-wise mapping
// Since the data is already sorted, the maximum is the first element
// The first element is read by multiple blocks and must not be written
template <class T>
static __global__ void partialSoftmaxKernel(
T *__restrict__ data, int n,
float temperature) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (0 < i && i < n) {
float max = __ldg(data);
data[i] = (T)expf(((float)data[i] - max) / temperature);
}
}
// Write the first element as 1, i.e. exp(0)
template <class T>
static __global__ void setSoftmaxMaxKernel(
T *__restrict__ data) {
*data = 1;
}
// Sample with a plain for loop over the prefix sums
// This kernel exists only to avoid copying the data back to the CPU
template <class Tval, class Tidx>
static __global__ void randomSampleKernel(
Tidx *__restrict__ result,
const Tval *__restrict__ sorted,
const Tidx *__restrict__ indices_out,
size_t n,
float random, float topp, size_t topk) {
topk = cub::Min()(topk, n);
auto p = (Tval)(random * cub::Min()(topp * (float)sorted[n - 1], (float)sorted[topk - 1]));
for (size_t i = 0;; ++i) {
if ((sorted[i]) >= p) {
*result = indices_out[i];
return;
}
}
}
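// Illustrative host-side reference of the same selection rule (an addition,
// not part of the original source): `sorted` holds inclusive prefix sums of
// exp((x - max) / T) in descending-probability order, so sorted[n - 1] is the
// total mass and sorted[topk - 1] the top-k mass; the first prefix sum that
// reaches the blended top-p/top-k threshold selects the sample.
template <class Tidx>
static Tidx referenceSample(const float *cum, const Tidx *indices, size_t n,
float random, float topp, size_t topk) {
topk = topk < n ? topk : n;
float p = random * fminf(topp * cum[n - 1], cum[topk - 1]);
for (size_t i = 0;; ++i) {
if (cum[i] >= p) {
return indices[i];
}
}
}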
// ↑↑↑ Small kernels used by the sampling pipeline
struct Algo {
int block_size;
template <class Tidx, class Tval_>
infiniStatus_t argmax(
void *workspace, size_t workspace_size,
void *result, const void *probs, size_t n,
void *stream_) const {
using Tval = typename CudaTval<Tval_>::Type;
auto stream = (cudaStream_t)stream_;
auto logits = (Tval *)probs;
auto kv_pair = (cub::KeyValuePair<int, Tval> *)workspace;
workspace = (void *)((char *)workspace + 256);
workspace_size -= 256;
argMax_(
kv_pair,
logits,
n,
workspace,
workspace_size, stream);
castIdx<<<1, 1, 0, stream>>>((Tidx *)result, kv_pair);
return INFINI_STATUS_SUCCESS;
}
template <class Tidx, class Tval_>
infiniStatus_t random(
void *workspace_, size_t workspace_size,
void *result_, const void *probs, size_t n,
float random_val, float topp, int topk, float temperature,
void *stream_) const {
using Tval = typename CudaTval<Tval_>::Type;
auto stream = (cudaStream_t)stream_;
auto logits = (Tval *)probs;
auto result = (Tidx *)result_;
auto workspace = reinterpret_cast<size_t>(workspace_);
auto workspace_end = workspace + workspace_size;
auto indices = reinterpret_cast<Tidx *>(workspace);
workspace += align256(sizeof(Tidx) * n);
auto sorted = reinterpret_cast<Tval *>(workspace);
workspace += align256(sizeof(Tval) * n);
auto indices_out = reinterpret_cast<Tidx *>(workspace);
workspace += align256(sizeof(Tidx) * n);
workspace_ = reinterpret_cast<void *>(workspace);
workspace_size = workspace_end - workspace;
auto block = cub::Min()((size_t)block_size, n);
auto grid = (n + block - 1) / block;
// sort
fillIndices<<<grid, block, 0, stream>>>(indices, n);
CHECK_CUDA(radixSort(
workspace_, workspace_size,
logits, sorted,
indices, indices_out,
n,
stream));
// softmax
partialSoftmaxKernel<<<grid, block, 0, stream>>>(sorted, n, temperature);
setSoftmaxMaxKernel<<<1, 1, 0, stream>>>(sorted);
// sum
CHECK_CUDA(inclusiveSum(
workspace_, workspace_size,
sorted, n,
stream));
// sample
randomSampleKernel<<<1, 1, 0, stream>>>(
result,
sorted, indices_out, n,
random_val, topp, topk);
return INFINI_STATUS_SUCCESS;
}
};
} // namespace op::random_sample::cuda
#ifndef __RANDOM_SAMPLE_INFO_H__
#define __RANDOM_SAMPLE_INFO_H__
#include "../../../utils.h"
#include "../../tensor.h"
namespace op::random_sample {
struct RandomSampleInfo {
infiniDtype_t dt_i, dt_p;
size_t n;
static utils::Result<RandomSampleInfo> create(
infiniopTensorDescriptor_t result_desc,
infiniopTensorDescriptor_t probs_desc) {
auto dt_i = result_desc->dtype();
auto dt_p = probs_desc->dtype();
CHECK_DTYPE_ANY_INT(dt_i);
CHECK_DTYPE(dt_p, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_OR_RETURN(result_desc->ndim() == 0, INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_OR_RETURN(probs_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_OR_RETURN(probs_desc->stride(0) == 1, INFINI_STATUS_BAD_TENSOR_STRIDES);
return utils::Result<RandomSampleInfo>({dt_i, dt_p, probs_desc->dim(0)});
}
};
} // namespace op::random_sample
#endif // __RANDOM_SAMPLE_INFO_H__
@@ -5,6 +5,9 @@
#ifdef ENABLE_CPU_API
#include "cpu/random_sample_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/random_sample_cuda.cuh"
#endif
__C infiniStatus_t infiniopCreateRandomSampleDescriptor(
infiniopHandle_t handle,
@@ -25,6 +28,9 @@ __C infiniStatus_t infiniopCreateRandomSampleDescriptor(
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -38,9 +44,10 @@ __C infiniStatus_t infiniopGetRandomSampleWorkspaceSize(
size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
case CASE: { \
using Ptr = const op::random_sample::NAMESPACE::Descriptor *; \
*size = reinterpret_cast<Ptr>(desc)->minWorkspaceSize(); \
} \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
@@ -48,6 +55,9 @@ __C infiniStatus_t infiniopGetRandomSampleWorkspaceSize(
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
GET(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -82,6 +92,9 @@ __C infiniStatus_t infiniopRandomSample(
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -103,6 +116,9 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor(
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
#ifndef __RANDOM_SAMPLE_H__
#define __RANDOM_SAMPLE_H__
#include "../../../utils.h"
#include "../../operator.h"
#include "info.h"
#define DESCRIPTOR(NAMESPACE) \
\
@@ -11,22 +11,18 @@
struct Opaque; \
Opaque *_opaque; \
\
infiniDtype_t _dt_i, _dt_p; \
size_t _n, _min_workspace_size; \
RandomSampleInfo _info; \
size_t _min_workspace_size; \
\
Descriptor( \
infiniDtype_t dt_i, \
infiniDtype_t dt_p, \
size_t n, \
RandomSampleInfo info, \
size_t min_workspace_size, \
Opaque *opaque, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_dt_i(dt_i), \
_dt_p(dt_p), \
_n(n), \
_info(info), \
_min_workspace_size(min_workspace_size) {} \
\
public: \
@@ -53,4 +49,96 @@
}; \
}
namespace op::random_sample {
struct CalculateArgs {
void *workspace;
size_t workspace_size;
void *result;
const void *probs;
float random_val, topp, temperature;
int topk;
void *stream;
};
class Calculate {
template <class Tidx, class Tval, class Algo>
static void switch_f(Algo algo, size_t n, CalculateArgs args) {
if (args.random_val == 0 || args.topp == 0 || args.topk == 1 || args.temperature == 0) {
algo.template argmax<Tidx, Tval>(
args.workspace, args.workspace_size,
args.result, args.probs, n,
args.stream);
} else {
algo.template random<Tidx, Tval>(
args.workspace, args.workspace_size,
args.result, args.probs, n,
args.random_val, args.topp, args.topk, args.temperature,
args.stream);
}
}
template <class Tidx, class Algo>
static void switch_val(
Algo algo,
infiniDtype_t dt_p, size_t n, CalculateArgs args) {
switch (dt_p) {
case INFINI_DTYPE_F16:
switch_f<Tidx, fp16_t>(algo, n, args);
break;
case INFINI_DTYPE_F32:
switch_f<Tidx, float>(algo, n, args);
break;
case INFINI_DTYPE_F64:
switch_f<Tidx, double>(algo, n, args);
break;
default:
// unreachable
std::abort();
}
}
public:
template <class Algo>
static infiniStatus_t calculate(
Algo algo,
RandomSampleInfo info,
void *workspace, size_t workspace_size,
void *result, const void *probs,
float random_val, float topp, int topk, float temperature,
void *stream) {
#define CASE(DT_VAL, DT_TYP) \
case DT_VAL: \
switch_val<DT_TYP>( \
algo, info.dt_p, info.n, \
{workspace, workspace_size, \
result, probs, \
random_val, topp, temperature, topk, \
stream}); \
break
switch (info.dt_i) {
CASE(INFINI_DTYPE_I8, int8_t);
CASE(INFINI_DTYPE_I16, int16_t);
CASE(INFINI_DTYPE_I32, int32_t);
CASE(INFINI_DTYPE_I64, int64_t);
CASE(INFINI_DTYPE_U8, uint8_t);
CASE(INFINI_DTYPE_U16, uint16_t);
CASE(INFINI_DTYPE_U32, uint32_t);
CASE(INFINI_DTYPE_U64, uint64_t);
default:
// unreachable
std::abort();
}
#undef CASE
return INFINI_STATUS_SUCCESS;
}
};
} // namespace op::random_sample
#endif // __RANDOM_SAMPLE_H__
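// Illustrative skeleton (an addition, not part of the original source) of the
// policy type Calculate::calculate expects: each backend supplies an `Algo`
// whose templated argmax/random methods match these signatures, and the
// dispatcher above handles every dtype combination.
// struct MyAlgo {
//     template <class Tidx, class Tval>
//     infiniStatus_t argmax(void *workspace, size_t workspace_size,
//                           void *result, const void *probs, size_t n,
//                           void *stream) const;
//     template <class Tidx, class Tval>
//     infiniStatus_t random(void *workspace, size_t workspace_size,
//                           void *result, const void *probs, size_t n,
//                           float random_val, float topp, int topk,
//                           float temperature, void *stream) const;
// };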
#include "rearrange_ascend.h"
#include "../../../devices/ascend/common_ascend.h"
#include <aclnnop/aclnn_copy.h>
namespace op::rearrange::ascend {
struct Descriptor::Opaque {
aclnnTensorDescriptor_t dst;
aclnnTensorDescriptor_t src;
void *workspace; // aclnnInplaceCopy workspace
uint64_t workspace_size;
~Opaque() {
delete dst;
delete src;
aclrtFree(workspace);
}
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc) {
auto handle = reinterpret_cast<device::ascend::Handle *>(handle_);
auto dtype = y_desc->dtype();
auto ndim = y_desc->ndim();
auto shape = y_desc->shape();
CHECK_API_OR(x_desc->dtype(), dtype, return INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_API_OR(x_desc->ndim(), ndim, return INFINI_STATUS_BAD_TENSOR_SHAPE);
for (size_t i = 0; i < ndim; ++i) {
CHECK_API_OR(x_desc->shape()[i], shape[i], return INFINI_STATUS_BAD_TENSOR_SHAPE);
}
auto dst_strides = y_desc->strides();
auto src_strides = x_desc->strides();
auto element_size = infiniSizeOf(dtype);
auto result = utils::RearrangeMeta::create(shape.data(), dst_strides.data(), src_strides.data(), ndim, element_size);
CHECK_RESULT(result);
aclnnTensorDescriptor_t dst = new aclnnTensorDescriptor(y_desc);
aclnnTensorDescriptor_t src = new aclnnTensorDescriptor(x_desc);
uint64_t workspace_size = 0;
aclOpExecutor *executor = nullptr;
void *workspace = nullptr;
aclnnInplaceCopyGetWorkspaceSize(dst->tensor, src->tensor,
&workspace_size, &executor);
if (workspace_size != 0) {
CHECK_ACL(aclrtMalloc(&workspace, workspace_size, ACL_MEM_MALLOC_HUGE_FIRST));
}
*desc_ptr = new Descriptor(
result.take(),
new Opaque{
dst,
src,
workspace,
workspace_size},
handle->device,
handle->device_id);
// Destroy the executor; it is not needed after the workspace query
aclDestroyAclOpExecutor(executor);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *y,
const void *x,
void *stream) const {
auto tdst = _opaque->dst->tensor;
auto tsrc = _opaque->src->tensor;
uint64_t workspace_size = 0;
aclOpExecutor *executor = nullptr;
// The executor must exist before tensor addresses can be bound to it
CHECK_ACL(aclnnInplaceCopyGetWorkspaceSize(tdst, tsrc, &workspace_size, &executor));
AclSetTensorAddr(executor, 0, tdst, y);
AclSetTensorAddr(executor, 1, tsrc, (void *)x);
// Execute InplaceCopy
CHECK_ACL(aclnnInplaceCopy(_opaque->workspace, _opaque->workspace_size,
executor, stream));
return INFINI_STATUS_SUCCESS;
}
} // namespace op::rearrange::ascend
#ifndef __REARRANGE_ASCEND_H__
#define __REARRANGE_ASCEND_H__
#include "../rearrange.h"
DESCRIPTOR(ascend)
#endif // __REARRANGE_ASCEND_H__
@@ -15,20 +15,18 @@ infiniStatus_t Descriptor::create(
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto dtype = y_desc->dtype();
auto ndim = y_desc->ndim();
auto shape = y_desc->shape().data();
CHECK_API_OR(x_desc->dtype(), dtype, return INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_API_OR(x_desc->ndim(), ndim, return INFINI_STATUS_BAD_TENSOR_SHAPE);
auto y_shape = y_desc->shape();
auto x_shape = x_desc->shape();
CHECK_OR_RETURN(x_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_OR_RETURN(x_desc->ndim() == ndim, INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_SAME_SHAPE(x_shape, y_shape);
for (size_t i = 0; i < ndim; ++i) {
CHECK_API_OR(x_desc->shape()[i], shape[i], return INFINI_STATUS_BAD_TENSOR_SHAPE);
}
auto dst_strides = y_desc->strides().data();
auto src_strides = x_desc->strides().data();
auto dst_strides = y_desc->strides();
auto src_strides = x_desc->strides();
auto element_size = infiniSizeOf(dtype);
auto result = utils::RearrangeMeta::create(shape, dst_strides, src_strides, ndim, element_size);
auto result = utils::RearrangeMeta::create(y_shape.data(), dst_strides.data(), src_strides.data(), ndim, element_size);
CHECK_RESULT(result);
*desc_ptr = new Descriptor(
......
#include "../../../devices/cuda/cuda_common.cuh"
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include "../../../tensor.h"
#include "rearrange_cuda.cuh"
#include "rearrange_kernel.cuh"
#include <algorithm>
#include <cmath>
#include <memory>
#include <stdint.h>
#include <vector>
namespace op::rearrange::cuda {
struct Descriptor::Opaque {
std::shared_ptr<device::cuda::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc) {
auto dtype = y_desc->dtype();
auto ndim = y_desc->ndim();
CHECK_OR_RETURN(x_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_OR_RETURN(x_desc->ndim() == ndim, INFINI_STATUS_BAD_TENSOR_SHAPE);
// Keep the temporary vector objects alive
auto x_shape = x_desc->shape();
auto y_shape = y_desc->shape();
auto y_strides = y_desc->strides();
auto x_strides = x_desc->strides();
CHECK_SAME_SHAPE(x_shape, y_shape);
auto meta = utils::RearrangeMeta::create(
y_shape.data(),
y_strides.data(),
x_strides.data(),
ndim,
infiniSizeOf(dtype));
CHECK_RESULT(meta);
*desc_ptr = new Descriptor(
std::move(*meta),
new Opaque{reinterpret_cast<device::cuda::Handle *>(handle)->internal()},
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
// Per-dimension information
struct Dim {
size_t len;
ARRAY_TYPE_STRIDE src_stride;
ARRAY_TYPE_STRIDE dst_stride;
};
// Information about a dimension split between block and grid
struct SplitDim {
size_t choose_idx;
size_t num_per_block;
size_t num_per_grid;
int array_struct_idx_block;
int array_struct_idx_grid;
size_t dim_len;
};
/**
* Prepare tensor-rearrange parameters from the given metadata. The function mainly:
* 1. Adjusts the unit size from the original metadata to one better suited to GPU processing.
* 2. Distributes dimensions between CUDA block and grid dimensions:
* this is the core step; the goal is to give each block as much relatively contiguous data as possible,
* splitting dimensions that cannot fit entirely into a block and recording the split info (used to keep
* the kernel from accessing out of bounds), maximizing memory locality and compute efficiency.
*/
utils::Result<RearrangeParams> prepareRearrangeParams(const utils::RearrangeMeta &original_meta, int max_threads) {
RearrangeParams params;
// Pick a unit size better suited to the GPU; powers of two are used here
auto meta_result = original_meta.distributeUnit({32, 16, 8, 4, 2, 1});
CHECK_RESULT(meta_result);
const utils::RearrangeMeta &meta = meta_result.take();
// Basic dimension info
const size_t ndim = meta.ndim();
const size_t unit = meta.unit();
// Special case: no dimensions, a plain copy suffices
if (ndim == 0) {
params.block_dim = 0;
params.block_len_total = 1;
params.block_len = {static_cast<ARRAY_TYPE_SIZE>(1)};
params.src_block_stride = {static_cast<ARRAY_TYPE_STRIDE>(0)};
params.dst_block_stride = {static_cast<ARRAY_TYPE_STRIDE>(0)};
params.grid_len = {static_cast<ARRAY_TYPE_SIZE>(1)};
params.src_grid_stride = {static_cast<ARRAY_TYPE_STRIDE>(0)};
params.dst_grid_stride = {static_cast<ARRAY_TYPE_STRIDE>(0)};
params.unit_size = unit;
return utils::Result<RearrangeParams>(params);
}
// Extract the required information from the metadata
const ptrdiff_t *idx_strides = meta.idx_strides();
const ptrdiff_t *dst_strides = meta.dst_strides();
const ptrdiff_t *src_strides = meta.src_strides();
// Prepare per-dimension info
std::vector<Dim> dims;
std::vector<size_t> shape;
dims.reserve(ndim);
shape.reserve(ndim);
auto prev_idx_stride = meta.count();
for (size_t i = 0; i < ndim; ++i) {
size_t len = prev_idx_stride / idx_strides[i];
shape.push_back(len);
dims.push_back({len, src_strides[i], dst_strides[i]});
prev_idx_stride = idx_strides[i];
}
// Indices that sort src_strides in descending order, mirroring src_strides_desc_idx in the Rust version
std::vector<size_t> src_strides_desc_idx(ndim);
for (size_t i = 0; i < ndim; ++i) {
src_strides_desc_idx[i] = i;
}
std::sort(src_strides_desc_idx.begin(), src_strides_desc_idx.end(),
[&dims](size_t a, size_t b) {
return std::abs(dims[a].src_stride) > std::abs(dims[b].src_stride);
});
// Choose block and grid dimensions based on the maximum thread count
const size_t block_size = max_threads;
std::vector<bool> block_dim_choose(ndim, false);
// Initialize counters
size_t block_elements = 1;
size_t block_src_elements = 1;
size_t block_dst_elements = 1;
size_t src_choose_idx = ndim;
size_t dst_choose_idx = ndim;
// Storage for split-dimension info
std::vector<SplitDim> split_dims;
// Dimension-selection loop
while (src_choose_idx > 0 && dst_choose_idx > 0) {
// Indices of the dimensions currently under consideration
size_t src_idx = src_strides_desc_idx[src_choose_idx - 1];
size_t dst_idx = dst_choose_idx - 1;
if (src_idx == dst_idx) {
// Source and destination pick the same dimension; handle it once
size_t idx = src_idx;
size_t len = shape[idx];
// Check whether this dimension fits entirely into the block
if (block_elements * len <= block_size) {
// Select this dimension
block_dim_choose[idx] = true;
block_elements *= len;
block_src_elements *= len;
block_dst_elements *= len;
src_choose_idx--;
dst_choose_idx--;
} else {
// This dimension must be split
size_t num_per_block = block_size / block_elements;
// Ensure num_per_block > 0 and len >= num_per_block
if (num_per_block > 0 && len >= num_per_block && num_per_block > 1) {
size_t num_per_grid = (len + num_per_block - 1) / num_per_block; // round up
SplitDim split_dim = {
idx, // choose_idx
num_per_block, // num_per_block
num_per_grid, // num_per_grid
0, // array_struct_idx_block (updated later)
0, // array_struct_idx_grid (updated later)
len // original dimension length
};
split_dims.push_back(split_dim);
}
break;
}
} else {
// Source and destination pick different dimensions; handle them separately
// Compute the block ratio
double src_div_dst = static_cast<double>(block_src_elements) / block_dst_elements;
double src_num_per_block = std::sqrt(block_size / (double)block_elements / src_div_dst);
double dst_num_per_block = src_num_per_block * src_div_dst;
size_t src_current_dim_len = shape[src_idx];
size_t dst_current_dim_len = shape[dst_idx];
if (static_cast<double>(src_current_dim_len) < src_num_per_block) {
// The source dimension fits entirely into the block
block_dim_choose[src_idx] = true;
block_elements *= src_current_dim_len;
block_src_elements *= src_current_dim_len;
src_choose_idx--;
} else if (static_cast<double>(dst_current_dim_len) < dst_num_per_block) {
// The destination dimension fits entirely into the block
block_dim_choose[dst_idx] = true;
block_elements *= dst_current_dim_len;
block_dst_elements *= dst_current_dim_len;
dst_choose_idx--;
} else {
// Both the source and destination dimensions must be split
size_t src_num_per_block_int = static_cast<size_t>(std::floor(src_num_per_block));
size_t dst_num_per_block_int = static_cast<size_t>(std::floor(dst_num_per_block));
// Compute the grid sizes
size_t src_num_per_grid = (src_current_dim_len + src_num_per_block_int - 1) / src_num_per_block_int; // round up
size_t dst_num_per_grid = (dst_current_dim_len + dst_num_per_block_int - 1) / dst_num_per_block_int; // round up
// Handle the source dimension
if (src_num_per_block_int > 1) {
if (src_num_per_grid == 1) {
// Fits entirely into the block
block_dim_choose[src_idx] = true;
block_elements *= src_current_dim_len;
block_src_elements *= src_current_dim_len;
src_choose_idx--;
} else {
// Must be split
SplitDim split_dim = {
src_idx, // choose_idx
src_num_per_block_int, // num_per_block
src_num_per_grid, // num_per_grid
0, // array_struct_idx_block (updated later)
0, // array_struct_idx_grid (updated later)
src_current_dim_len // original dimension length
};
split_dims.push_back(split_dim);
}
}
// Handle the destination dimension
if (dst_num_per_block_int > 1) {
if (dst_num_per_grid == 1) {
// Fits entirely into the block
block_dim_choose[dst_idx] = true;
block_elements *= dst_current_dim_len;
block_dst_elements *= dst_current_dim_len;
dst_choose_idx--;
} else {
// Must be split
SplitDim split_dim = {
dst_idx, // choose_idx
dst_num_per_block_int, // num_per_block
dst_num_per_grid, // num_per_grid
0, // array_struct_idx_block (updated later)
0, // array_struct_idx_grid (updated later)
dst_current_dim_len // original dimension length
};
split_dims.push_back(split_dim);
}
}
break;
}
}
}
// Prepare the block-dimension parameters
size_t block_dim = 0;
size_t block_len_total = 1;
std::vector<ARRAY_TYPE_SIZE> block_len;
std::vector<ARRAY_TYPE_STRIDE> src_block_stride;
std::vector<ARRAY_TYPE_STRIDE> dst_block_stride;
std::vector<ARRAY_TYPE_SIZE> grid_len;
std::vector<ARRAY_TYPE_STRIDE> src_grid_stride;
std::vector<ARRAY_TYPE_STRIDE> dst_grid_stride;
// Process block dimensions, filling block_len and the block strides
for (size_t i = 0; i < ndim; ++i) {
if (block_dim_choose[i]) {
block_len.push_back(shape[i]);
src_block_stride.push_back(dims[i].src_stride);
dst_block_stride.push_back(dims[i].dst_stride);
block_dim += 1;
block_len_total *= shape[i];
}
// Handle the block part of split dimensions
for (size_t j = 0; j < split_dims.size(); ++j) {
if (i == split_dims[j].choose_idx) {
block_len.push_back(split_dims[j].num_per_block);
src_block_stride.push_back(dims[i].src_stride);
dst_block_stride.push_back(dims[i].dst_stride);
split_dims[j].array_struct_idx_block = block_dim;
block_dim += 1;
block_len_total *= split_dims[j].num_per_block;
}
}
}
// Process grid dimensions, filling grid_len and the grid strides
for (size_t i = 0; i < ndim; ++i) {
if (!block_dim_choose[i]) {
bool is_split = false;
// Check whether this is a split dimension
for (size_t j = 0; j < split_dims.size(); ++j) {
if (i == split_dims[j].choose_idx) {
is_split = true;
grid_len.push_back(split_dims[j].num_per_grid);
src_grid_stride.push_back(dims[i].src_stride * split_dims[j].num_per_block);
dst_grid_stride.push_back(dims[i].dst_stride * split_dims[j].num_per_block);
split_dims[j].array_struct_idx_grid = grid_len.size() - 1;
}
}
// Otherwise treat it as a full grid dimension
if (!is_split) {
grid_len.push_back(shape[i]);
src_grid_stride.push_back(dims[i].src_stride);
dst_grid_stride.push_back(dims[i].dst_stride);
}
}
}
// If grid_len is empty, add a default entry
if (grid_len.empty()) {
grid_len.push_back(1);
src_grid_stride.push_back(0);
dst_grid_stride.push_back(0);
}
// Handle constraints, using logic similar to the Rust version
std::vector<Constraint<ARRAY_TYPE_SIZE>> constraints;
// At most two constraints are supported
for (size_t i = 0; i < split_dims.size(); ++i) {
if (split_dims[i].dim_len % split_dims[i].num_per_block == 0) {
continue;
}
Constraint<ARRAY_TYPE_SIZE> constraint;
constraint.grid_idx = split_dims[i].array_struct_idx_grid;
constraint.block_idx = split_dims[i].array_struct_idx_block;
constraint.grid_div_block = split_dims[i].num_per_block;
constraint.total_len = split_dims[i].dim_len;
constraints.push_back(constraint);
}
// Fill in the parameters
params.block_dim = block_dim;
params.block_len_total = block_len_total;
params.block_len = block_len;
params.src_block_stride = src_block_stride;
params.dst_block_stride = dst_block_stride;
params.grid_len = grid_len;
params.src_grid_stride = src_grid_stride;
params.dst_grid_stride = dst_grid_stride;
params.constraints = constraints;
params.unit_size = unit;
return utils::Result<RearrangeParams>(params);
}
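// Worked example (an addition, for illustration): with shape [32, 32, 32],
// matching src/dst dimension order and block_size = 256, the innermost
// dimension (len 32) fits into the block; adding the next one would need
// 32 * 32 = 1024 > 256 threads, so it is split with num_per_block = 256 / 32 = 8
// and num_per_grid = ceil(32 / 8) = 4. The block then covers 32 * 8 = 256
// elements and grid_len ends up as [32, 4]; since 32 % 8 == 0, no constraint
// is generated.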
// Kernel-launch helper template with constraint support
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
void *y,
const void *x,
size_t grid_size,
const RearrangeParams &params,
size_t unit_size,
cudaStream_t stream) {
// Look up the kernel function
RearrangeParams params_copy = params; // make a non-const copy
auto kernel_func_result = getRearrangeKernel(params_copy);
CHECK_RESULT(kernel_func_result);
auto kernel_func = kernel_func_result.take();
// Non-const temporaries for the kernel argument array
size_t block_dim = params.block_dim;
size_t block_len_total = params.block_len_total;
// Sanity-check the vector sizes
if (params.block_len.size() < block_dim || params.src_block_stride.size() < block_dim || params.dst_block_stride.size() < block_dim) {
return INFINI_STATUS_BAD_PARAM;
}
if (params.grid_len.empty() || params.src_grid_stride.empty() || params.dst_grid_stride.empty()) {
return INFINI_STATUS_BAD_PARAM;
}
const Constraint<ARRAY_TYPE_SIZE> *constraints_data;
auto empty_constraints = Constraint<ARRAY_TYPE_SIZE>();
if (params.constraints.empty()) {
constraints_data = &empty_constraints;
} else {
constraints_data = params.constraints.data();
}
void *args[]
= {
&y, &x,
&block_dim,
&block_len_total,
const_cast<void *>(static_cast<const void *>(params.block_len.data())),
const_cast<void *>(static_cast<const void *>(params.src_block_stride.data())),
const_cast<void *>(static_cast<const void *>(params.dst_block_stride.data())),
const_cast<void *>(static_cast<const void *>(params.grid_len.data())),
const_cast<void *>(static_cast<const void *>(params.src_grid_stride.data())),
const_cast<void *>(static_cast<const void *>(params.dst_grid_stride.data())),
const_cast<void *>(static_cast<const void *>(constraints_data))};
CHECK_OR_RETURN(cudaLaunchKernel(
kernel_func,
grid_size, BLOCK_SIZE,
args, 0, stream)
== cudaSuccess,
INFINI_STATUS_INTERNAL_ERROR);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *y,
const void *x,
void *stream) const {
auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
// No dimensions: a single async copy suffices
if (_meta.ndim() == 0) {
CHECK_OR_RETURN(cudaMemcpyAsync(y, x, _meta.unit(), cudaMemcpyDeviceToDevice, cuda_stream) == cudaSuccess,
INFINI_STATUS_INTERNAL_ERROR);
return INFINI_STATUS_SUCCESS;
}
// Query device properties
int max_threads = _opaque->internal->maxThreadsPerBlock();
// Prepare the launch parameters
auto params_result = prepareRearrangeParams(_meta, std::min(CUDA_BLOCK_SIZE_1024, max_threads));
CHECK_RESULT(params_result);
auto params = params_result.take();
// Compute the grid size
size_t grid_size = 1;
for (size_t i = 0; i < params.grid_len.size(); ++i) {
grid_size *= params.grid_len[i];
}
// Guard against a zero grid size
if (grid_size == 0) {
return INFINI_STATUS_BAD_PARAM;
}
// Choose a kernel variant appropriate for the device
infiniStatus_t status = INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
size_t block_size = params.block_len_total;
if (block_size <= CUDA_BLOCK_SIZE_512) {
status = launchKernel<CUDA_BLOCK_SIZE_512>(y, x, grid_size, params, _meta.unit(), cuda_stream);
} else if (block_size <= CUDA_BLOCK_SIZE_1024) {
status = launchKernel<CUDA_BLOCK_SIZE_1024>(y, x, grid_size, params, _meta.unit(), cuda_stream);
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return status;
}
} // namespace op::rearrange::cuda
#ifndef __REARRANGE_CUDA_H__
#define __REARRANGE_CUDA_H__
#include "../rearrange.h"
DESCRIPTOR(cuda)
#endif // __REARRANGE_CUDA_H__
#ifndef __REARRANGE_CUDA_KERNEL_H__
#define __REARRANGE_CUDA_KERNEL_H__
#include "../../../devices/cuda/cuda_common.cuh"
#define ARRAY_TYPE_STRIDE ptrdiff_t
#define ARRAY_TYPE_SIZE size_t
// Coupled with DEFINE_KERNELS_BY_CONSTRAINT; change both together
#define MAX_BLOCK_ARRAY_SIZE 5
#define MAX_GRID_ARRAY_SIZE 5
template <int ArrSize, typename ArrayType>
struct ArrayStruct {
ArrayType a[ArrSize];
};
// Fields: [grid_idx, block_idx, how many block strides one grid step covers, total length limit]
template <typename ElementType>
struct Constraint {
ElementType grid_idx;
ElementType block_idx;
ElementType grid_div_block;
ElementType total_len;
};
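// Example (an addition, for illustration): a dimension of length 10 split with
// num_per_block = 4 gives num_per_grid = 3. The covered index is
// grid_idx * 4 + block_idx, so the constraint {grid_idx, block_idx, 4, 10}
// lets the kernel discard the two out-of-range slots (10 and 11) in the last
// grid step.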
#define IF_CONSTRAINT_0 , const ArrayStruct<1, Constraint<ARRAY_TYPE_SIZE>> constraints
#define IF_CONSTRAINT_1 , const ArrayStruct<1, Constraint<ARRAY_TYPE_SIZE>> constraints
#define IF_CONSTRAINT_2 , const ArrayStruct<2, Constraint<ARRAY_TYPE_SIZE>> constraints
// Macro that generates the kernel functions
#define DEFINE_REARRANGE_KERNEL(Tmem_type, constraint_num, block_array_size, grid_array_size) \
extern "C" __global__ void rearrange_unit_##Tmem_type##_block_##block_array_size##_grid_##grid_array_size##_constrain_##constraint_num( \
void *__restrict__ dst, \
const void *__restrict__ src, \
const size_t block_dim, \
const size_t block_len_total, \
const ArrayStruct<block_array_size, ARRAY_TYPE_SIZE> block_len, \
const ArrayStruct<block_array_size, ARRAY_TYPE_STRIDE> src_block_stride, /* strides in bytes */ \
const ArrayStruct<block_array_size, ARRAY_TYPE_STRIDE> dst_block_stride, /* strides in bytes */ \
const ArrayStruct<grid_array_size, ARRAY_TYPE_SIZE> grid_len, \
const ArrayStruct<grid_array_size, ARRAY_TYPE_STRIDE> src_grid_stride, /* strides in bytes */ \
const ArrayStruct<grid_array_size, ARRAY_TYPE_STRIDE> dst_grid_stride /* strides in bytes */ \
IF_CONSTRAINT_##constraint_num) { \
size_t remaining = threadIdx.x; \
if (remaining >= block_len_total) { \
return; \
} \
\
/* Shared memory declarations */ \
__shared__ ptrdiff_t shared_src_offset; \
__shared__ ptrdiff_t shared_dst_offset; \
\
if (constraint_num > 0) { \
__shared__ ARRAY_TYPE_SIZE shared_constraints_grid_idx_multiple[constraint_num > 0 ? constraint_num : 1]; \
\
if (threadIdx.x == 0) { /* only thread 0 computes */ \
/* Compute the base byte offsets into src and dst for this block's data */ \
ptrdiff_t src_offset = 0; \
ptrdiff_t dst_offset = 0; \
ARRAY_TYPE_SIZE constraints_grid_idx_multiple[constraint_num > 0 ? constraint_num : 1]; \
\
size_t remaining \
= blockIdx.x; \
\
for (ssize_t i = grid_array_size - 1; i >= 0; i--) { \
size_t idx = remaining % grid_len.a[i]; \
remaining /= grid_len.a[i]; \
src_offset += idx * src_grid_stride.a[i]; \
dst_offset += idx * dst_grid_stride.a[i]; \
if (constraint_num > 0) { \
for (ssize_t j = 0; j < constraint_num; j++) { \
if (i == constraints.a[j].grid_idx) { \
constraints_grid_idx_multiple[j] = idx * constraints.a[j].grid_div_block; \
} \
} \
} \
} \
\
/* Store the results in shared memory */ \
shared_src_offset = src_offset; \
shared_dst_offset = dst_offset; \
for (ssize_t j = 0; j < constraint_num; j++) { \
shared_constraints_grid_idx_multiple[j] = constraints_grid_idx_multiple[j]; \
} \
} \
\
/* Make the shared values visible to all threads */ \
__syncthreads(); \
\
/* All threads use the precomputed offsets */ \
ptrdiff_t src_offset = shared_src_offset; \
ptrdiff_t dst_offset = shared_dst_offset; \
ARRAY_TYPE_SIZE constraints_grid_idx_multiple[constraint_num > 0 ? constraint_num : 1]; \
for (ssize_t j = 0; j < constraint_num; j++) { \
constraints_grid_idx_multiple[j] = shared_constraints_grid_idx_multiple[j]; \
} \
\
for (ssize_t i = block_array_size - 1; i >= 0; i--) { \
size_t idx = remaining % block_len.a[i]; \
remaining /= block_len.a[i]; \
/* Accumulate the offsets */ \
src_offset += idx * src_block_stride.a[i]; \
dst_offset += idx * dst_block_stride.a[i]; \
if (constraint_num > 0) { \
for (ssize_t j = 0; j < constraint_num; j++) { \
if (i == constraints.a[j].block_idx) { \
if (constraints_grid_idx_multiple[j] + idx >= constraints.a[j].total_len) { \
return; \
} \
} \
} \
} \
} \
\
src_offset += remaining * src_block_stride.a[0]; \
dst_offset += remaining * dst_block_stride.a[0]; \
for (ssize_t j = 0; j < constraint_num; j++) { \
if (0 == constraints.a[j].block_idx) { \
if (constraints_grid_idx_multiple[j] + remaining >= constraints.a[j].total_len) { \
return; \
} \
} \
} \
\
/* Perform the copy; the offsets are already in bytes */ \
*reinterpret_cast<Tmem_type *>(reinterpret_cast<char *>(dst) + dst_offset) = *reinterpret_cast<const Tmem_type *>(reinterpret_cast<const char *>(src) + src_offset); \
\
} else { \
if (threadIdx.x == 0) { /* only thread 0 computes */ \
/* Compute the base byte offsets into src and dst for this block's data */ \
ptrdiff_t src_offset = 0; \
ptrdiff_t dst_offset = 0; \
size_t remaining = blockIdx.x; \
\
for (ssize_t i = grid_array_size - 1; i >= 0; i--) { \
size_t idx = remaining % grid_len.a[i]; \
remaining /= grid_len.a[i]; \
src_offset += idx * src_grid_stride.a[i]; \
dst_offset += idx * dst_grid_stride.a[i]; \
} \
\
/* Store the results in shared memory */ \
shared_src_offset = src_offset; \
shared_dst_offset = dst_offset; \
} \
\
/* Make the shared values visible to all threads */ \
__syncthreads(); \
\
/* All threads use the precomputed offsets */ \
ptrdiff_t src_offset = shared_src_offset; \
ptrdiff_t dst_offset = shared_dst_offset; \
\
for (ssize_t i = block_array_size - 1; i > 0; i--) { \
size_t idx = remaining % block_len.a[i]; \
remaining /= block_len.a[i]; \
/* Accumulate the offsets */ \
src_offset += idx * src_block_stride.a[i]; \
dst_offset += idx * dst_block_stride.a[i]; \
} \
\
src_offset += remaining * src_block_stride.a[0]; \
dst_offset += remaining * dst_block_stride.a[0]; \
\
/* Perform the copy; the offsets are already in bytes */ \
*reinterpret_cast<Tmem_type *>(reinterpret_cast<char *>(dst) + dst_offset) = *reinterpret_cast<const Tmem_type *>(reinterpret_cast<const char *>(src) + src_offset); \
} \
}
// Supported constraint-count combinations
#define DEFINE_KERNELS_BY_CONSTRAINT(block_array_size, grid_array_size) \
DEFINE_KERNELS_BY_TYPE(0, block_array_size, grid_array_size) \
DEFINE_KERNELS_BY_TYPE(1, block_array_size, grid_array_size) \
DEFINE_KERNELS_BY_TYPE(2, block_array_size, grid_array_size)
// Supported unit types
#define DEFINE_KERNELS_BY_TYPE(constraint_num, block_array_size, grid_array_size) \
DEFINE_REARRANGE_KERNEL(uchar1, constraint_num, block_array_size, grid_array_size) \
DEFINE_REARRANGE_KERNEL(uchar2, constraint_num, block_array_size, grid_array_size) \
DEFINE_REARRANGE_KERNEL(float1, constraint_num, block_array_size, grid_array_size) \
DEFINE_REARRANGE_KERNEL(float2, constraint_num, block_array_size, grid_array_size) \
DEFINE_REARRANGE_KERNEL(float4, constraint_num, block_array_size, grid_array_size) \
DEFINE_REARRANGE_KERNEL(double4, constraint_num, block_array_size, grid_array_size)
// Coupled with MAX_BLOCK_ARRAY_SIZE and MAX_GRID_ARRAY_SIZE; change both together
// Generate kernels for every combination of 1-5 block dims and 1-5 grid dims
DEFINE_KERNELS_BY_CONSTRAINT(1, 1)
DEFINE_KERNELS_BY_CONSTRAINT(1, 2)
DEFINE_KERNELS_BY_CONSTRAINT(1, 3)
DEFINE_KERNELS_BY_CONSTRAINT(1, 4)
DEFINE_KERNELS_BY_CONSTRAINT(1, 5)
DEFINE_KERNELS_BY_CONSTRAINT(2, 1)
DEFINE_KERNELS_BY_CONSTRAINT(2, 2)
DEFINE_KERNELS_BY_CONSTRAINT(2, 3)
DEFINE_KERNELS_BY_CONSTRAINT(2, 4)
DEFINE_KERNELS_BY_CONSTRAINT(2, 5)
DEFINE_KERNELS_BY_CONSTRAINT(3, 1)
DEFINE_KERNELS_BY_CONSTRAINT(3, 2)
DEFINE_KERNELS_BY_CONSTRAINT(3, 3)
DEFINE_KERNELS_BY_CONSTRAINT(3, 4)
DEFINE_KERNELS_BY_CONSTRAINT(3, 5)
DEFINE_KERNELS_BY_CONSTRAINT(4, 1)
DEFINE_KERNELS_BY_CONSTRAINT(4, 2)
DEFINE_KERNELS_BY_CONSTRAINT(4, 3)
DEFINE_KERNELS_BY_CONSTRAINT(4, 4)
DEFINE_KERNELS_BY_CONSTRAINT(4, 5)
DEFINE_KERNELS_BY_CONSTRAINT(5, 1)
DEFINE_KERNELS_BY_CONSTRAINT(5, 2)
DEFINE_KERNELS_BY_CONSTRAINT(5, 3)
DEFINE_KERNELS_BY_CONSTRAINT(5, 4)
DEFINE_KERNELS_BY_CONSTRAINT(5, 5)
// Parameter struct for kernel preparation
struct RearrangeParams {
std::vector<ARRAY_TYPE_SIZE> block_len;
std::vector<ARRAY_TYPE_STRIDE> src_block_stride;
std::vector<ARRAY_TYPE_STRIDE> dst_block_stride;
std::vector<ARRAY_TYPE_SIZE> grid_len;
std::vector<ARRAY_TYPE_STRIDE> src_grid_stride;
std::vector<ARRAY_TYPE_STRIDE> dst_grid_stride;
size_t block_dim;
size_t block_len_total;
std::vector<Constraint<ARRAY_TYPE_SIZE>> constraints;
size_t unit_size;
};
utils::Result<void *> getRearrangeKernel(const RearrangeParams &params) {
auto grid_num = params.grid_len.size();
auto block_num = params.block_len.size();
auto constraint_num = params.constraints.size();
auto unit_size = params.unit_size;
CHECK_OR_RETURN(grid_num <= MAX_GRID_ARRAY_SIZE && grid_num != 0, INFINI_STATUS_BAD_PARAM);
CHECK_OR_RETURN(block_num <= MAX_BLOCK_ARRAY_SIZE && block_num != 0, INFINI_STATUS_BAD_PARAM);
CHECK_OR_RETURN(constraint_num <= 2, INFINI_STATUS_BAD_PARAM);
auto block_len = params.block_len.data();
auto src_block_stride = params.src_block_stride.data();
auto dst_block_stride = params.dst_block_stride.data();
auto grid_len = params.grid_len.data();
auto src_grid_stride = params.src_grid_stride.data();
auto dst_grid_stride = params.dst_grid_stride.data();
auto constrain = params.constraints.data();
void *kernel_func = nullptr;
#define GET_REARRANGE_KERNEL(Tmem_type, block_array_size, grid_array_size, constraint_num) \
kernel_func = (void *)rearrange_unit_##Tmem_type##_block_##block_array_size##_grid_##grid_array_size##_constrain_##constraint_num;
#define GET_REARRANGE_KERNEL_BY_TYPE(block_array_size, grid_array_size, constraint_num) \
switch (unit_size) { \
case 1: \
GET_REARRANGE_KERNEL(uchar1, block_array_size, grid_array_size, constraint_num); \
break; \
case 2: \
GET_REARRANGE_KERNEL(uchar2, block_array_size, grid_array_size, constraint_num); \
break; \
case 4: \
GET_REARRANGE_KERNEL(float1, block_array_size, grid_array_size, constraint_num); \
break; \
case 8: \
GET_REARRANGE_KERNEL(float2, block_array_size, grid_array_size, constraint_num); \
break; \
case 16: \
GET_REARRANGE_KERNEL(float4, block_array_size, grid_array_size, constraint_num); \
break; \
case 32: \
GET_REARRANGE_KERNEL(double4, block_array_size, grid_array_size, constraint_num); \
break; \
default: \
return INFINI_STATUS_BAD_PARAM; \
}
#define GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, grid_array_size) \
switch (constraint_num) { \
case 0: \
GET_REARRANGE_KERNEL_BY_TYPE(block_array_size, grid_array_size, 0); \
break; \
case 1: \
GET_REARRANGE_KERNEL_BY_TYPE(block_array_size, grid_array_size, 1); \
break; \
case 2: \
GET_REARRANGE_KERNEL_BY_TYPE(block_array_size, grid_array_size, 2); \
break; \
}
#define GET_REARRANGE_KERNEL_BY_GRID_NUM(block_array_size) \
switch (grid_num) { \
case 1: \
GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 1); \
break; \
case 2: \
GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 2); \
break; \
case 3: \
GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 3); \
break; \
case 4: \
GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 4); \
break; \
case 5: \
GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 5); \
break; \
}
#define GET_REARRANGE_KERNEL_BY_BLOCK_NUM \
switch (block_num) { \
case 1: \
GET_REARRANGE_KERNEL_BY_GRID_NUM(1); \
break; \
case 2: \
GET_REARRANGE_KERNEL_BY_GRID_NUM(2); \
break; \
case 3: \
GET_REARRANGE_KERNEL_BY_GRID_NUM(3); \
break; \
case 4: \
GET_REARRANGE_KERNEL_BY_GRID_NUM(4); \
break; \
case 5: \
GET_REARRANGE_KERNEL_BY_GRID_NUM(5); \
break; \
}
GET_REARRANGE_KERNEL_BY_BLOCK_NUM
return utils::Result<void *>(kernel_func);
}
#endif // __REARRANGE_CUDA_KERNEL_H__
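// Illustrative dispatch example (an addition): for a 4-byte unit, two block
// dims, three grid dims and one constraint, getRearrangeKernel resolves to
// rearrange_unit_float1_block_2_grid_3_constrain_1, which launchKernel then
// starts via cudaLaunchKernel with the packed ArrayStruct arguments.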
@@ -5,6 +5,13 @@
#ifdef ENABLE_CPU_API
#include "cpu/rearrange_cpu.h"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/rearrange_ascend.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/rearrange_cuda.cuh"
#endif
__C infiniStatus_t infiniopCreateRearrangeDescriptor(
infiniopHandle_t handle,
@@ -25,6 +32,13 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor(
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_ASCEND_API
CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -49,6 +63,13 @@ __C infiniStatus_t infiniopRearrange(
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_ASCEND_API
CALCULATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -70,6 +91,13 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor(
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_ASCEND_API
DELETE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_CUDA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
@@ -5,18 +5,19 @@
namespace op::rms_norm::ascend {
struct Descriptor::Opaque {
mutable aclOpExecutor *executor;
aclnnTensorDescriptor_t y;
aclnnTensorDescriptor_t x;
aclnnTensorDescriptor_t w;
aclnnTensorDescriptor_t rstd;
size_t workspaceSize;
aclOpExecutor *executor;
~Opaque() {
delete y;
delete x;
delete w;
delete rstd;
aclDestroyAclOpExecutor(executor);
}
};
@@ -69,7 +70,7 @@ infiniStatus_t Descriptor::create(
auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);
size_t all_workspace_size = workspace_size + rstd->numel() * aclDataTypeSize(rstd->dataType);
*desc_ptr = new Descriptor(
new Opaque{executor, y, x, w, rstd, workspace_size},
new Opaque{y, x, w, rstd, workspace_size, executor},
std::move(info),
all_workspace_size,
handle_ascend->device, handle_ascend->device_id);
......