Commit 9b32b4b1 authored by Catheriany

Merge remote-tracking branch 'origin/main' into issue/150

parents 15bcbdfc 4799ddbf
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/clip.h"
#ifdef ENABLE_CPU_API
#include "cpu/clip_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/clip_cuda.cuh"
#endif
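// Dispatch pattern used by every entry point in this file: switch on the
// device type and forward to the matching backend namespace, each of which
// provides a Descriptor with the same create/calculate interface.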
__C infiniStatus_t infiniopCreateClipDescriptor(
infiniopHandle_t handle,
infiniopClipDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x,
infiniopTensorDescriptor_t min_val,
infiniopTensorDescriptor_t max_val) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::clip::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::clip::NAMESPACE::Descriptor **>(desc_ptr), \
y, \
{x, min_val, max_val})
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CREATE
}
__C infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::clip::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_CUDA_API
GET(INFINI_DEVICE_NVIDIA, cuda)
#endif
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniStatus_t infiniopClip(
infiniopClipDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *min_val,
const void *max_val,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::clip::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, y, {x, min_val, max_val}, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CALCULATE
}
__C infiniStatus_t
infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::clip::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DELETE
}
......@@ -3,6 +3,26 @@
#include <aclnnop/aclnn_matmul.h>
#include <aclnnop/level2/aclnn_gemm.h>
#include <cstring>
#include <unordered_map>
// Custom hash for the (alpha, beta) executor-cache key: packs the two 32-bit
// floats into one 64-bit value and hashes that.
struct FloatPairHash {
size_t operator()(const std::pair<float, float> &p) const {
static_assert(2 * sizeof(float) == sizeof(uint64_t), "two floats must fill a uint64_t");
// Normalize -0.0f to +0.0f so keys that compare equal under == hash equally.
float first = p.first + 0.0f, second = p.second + 0.0f;
uint64_t combined;
std::memcpy(reinterpret_cast<char *>(&combined), &first, sizeof(float));
std::memcpy(reinterpret_cast<char *>(&combined) + sizeof(float), &second, sizeof(float));
return std::hash<uint64_t>()(combined);
}
};
struct FloatPairEqual {
bool operator()(const std::pair<float, float> &a, const std::pair<float, float> &b) const {
return a.first == b.first && a.second == b.second;
}
};
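// The cache below keeps one repeatable aclOpExecutor per distinct
// (alpha, beta) pair, so repeated calculate() calls with the same scalars
// reuse an executor instead of rebuilding it.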
namespace op::gemm::ascend {
struct Descriptor::Opaque {
......@@ -11,11 +31,17 @@ struct Descriptor::Opaque {
// see doc:
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha002/apiref/appdevgapi/context/aclnnBatchMatMul.md
int8_t mt;
// executor cache keyed by (alpha, beta)
std::unordered_map<std::pair<float, float>, aclOpExecutor *, FloatPairHash, FloatPairEqual> lookup;
~Opaque() {
delete c;
delete a;
delete b;
for (auto &item : lookup) {
aclDestroyAclOpExecutor(item.second);
}
lookup.clear();
}
};
......@@ -54,15 +80,16 @@ infiniStatus_t Descriptor::create(
ta = a->tensor,
tb = b->tensor;
std::unordered_map<std::pair<float, float>, aclOpExecutor *, FloatPairHash, FloatPairEqual> lookup;
aclOpExecutor *executor = nullptr;
size_t workspace_size = 0;
// aclnnGemm computes C = alpha * A @ B + beta * C
// see
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md
// Pre-build repeatable executors for the two common (alpha, beta) pairs.
int8_t mt = 1;
CHECK_ACL(aclnnGemmGetWorkspaceSize(ta, tb, tc, 1., 0., 0, 0, tc, mt, &workspace_size, &executor));
CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));
lookup[std::make_pair(1.0f, 0.0f)] = executor;
CHECK_ACL(aclnnGemmGetWorkspaceSize(ta, tb, tc, 1., 1., 0, 0, tc, mt, &workspace_size, &executor));
CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));
lookup[std::make_pair(1.0f, 1.0f)] = executor;
*desc_ptr = new Descriptor(
dtype, info, workspace_size,
......@@ -71,11 +98,9 @@ infiniStatus_t Descriptor::create(
a,
b,
mt,
std::move(lookup)},
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
......@@ -93,16 +118,22 @@ infiniStatus_t Descriptor::calculate(
ta = _opaque->a->tensor,
tb = _opaque->b->tensor;
size_t workspace_size = _workspace_size;
aclOpExecutor *executor;
auto key = std::make_pair(alpha, beta);
if (auto it = _opaque->lookup.find(key); it != _opaque->lookup.end()) {
executor = it->second;
} else {
CHECK_ACL(aclnnGemmGetWorkspaceSize(
ta, tb, tc, alpha, beta, 0, 0, tc, _opaque->mt,
&workspace_size, &executor));
CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));
_opaque->lookup[key] = executor;
}
if (_workspace_size < workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
auto unit = infiniSizeOf(_dtype);
for (size_t i = 0; i < _info.batch; ++i) {
......
......@@ -62,7 +62,7 @@ infiniStatus_t calculate(
(kunlunStream_t)stream,
[&](xdnnHandle_t handle) {
for (size_t i = 0; i < info.batch; i++) {
CHECK_KUNLUN((xdnn::fc_fusion<Tdata, Tdata, Tdata, int16_t>(
handle,
(Tdata *)((char *)a + i * info.a_matrix.stride * unit),
(Tdata *)((char *)b + i * info.b_matrix.stride * unit),
......
#include "random_sample_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../../devices/cpu/cpu_handle.h"
#include "../../../tensor.h"
#include "../info.h"
#include "infinicore.h"
#include <algorithm>
namespace op::random_sample::cpu {
......@@ -15,29 +15,14 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t probs_desc) {
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto result = RandomSampleInfo::create(result_desc, probs_desc);
CHECK_RESULT(result);
*desc_ptr = new Descriptor(
result.take(),
0,
nullptr,
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
......@@ -55,36 +40,42 @@ struct ComputeType<fp16_t> {
using type = float;
};
struct Algo {
template <class Tidx, class Tval>
static auto get(void const *ptr, size_t i) {
return utils::cast<typename ComputeType<Tval>::type, Tval>(reinterpret_cast<Tval const *>(ptr)[i]);
}
template <class Tidx, class Tval>
infiniStatus_t argmax(
void *workspace, size_t workspace_size,
void *result, void const *probs, size_t n,
void *stream) {
auto idx = reinterpret_cast<Tidx *>(result);
*idx = 0;
auto max_val = get<Tidx, Tval>(probs, 0);
for (size_t i = 0; i < n; i++) {
if (auto val = get<Tidx, Tval>(probs, i); val > max_val) {
max_val = val;
*idx = static_cast<Tidx>(i);
}
}
return INFINI_STATUS_SUCCESS;
}
template <class Tidx, class Tval>
infiniStatus_t random(
void *workspace, size_t workspace_size,
void *result, void const *probs, size_t n,
float random_val, float topp, int topk, float temperature,
void *stream) {
struct KVPair {
Tidx idx;
typename ComputeType<Tval>::type val;
bool operator<(const KVPair &other) const {
return val > other.val;
......@@ -95,7 +86,7 @@ struct Scheme {
// build & sort
std::vector<KVPair> pairs(n);
for (size_t i = 0; i < n; i++) {
pairs[i] = {static_cast<Tidx>(i), get<Tidx, Tval>(probs, i)};
}
std::sort(pairs.begin(), pairs.end());
// softmax & sum
......@@ -115,68 +106,10 @@ struct Scheme {
break;
}
}
return INFINI_STATUS_SUCCESS;
}
};
infiniStatus_t Descriptor::calculate(
void *workspace,
......@@ -189,7 +122,11 @@ infiniStatus_t Descriptor::calculate(
float temperature,
void *stream) const {
Calculate::calculate<Algo>(
Algo{}, _info, workspace, workspace_size,
result, probs,
random_val, topp, topk, temperature,
stream);
return INFINI_STATUS_SUCCESS;
}
......
#include "../../../devices/cuda/cuda_handle.cuh"
#include "../info.h"
#include "random_sample_cuda.cuh"
#include "random_sample_kernel.cuh"
namespace op::random_sample::cuda {
struct Descriptor::Opaque {
std::shared_ptr<device::cuda::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t result_desc,
infiniopTensorDescriptor_t probs_desc) {
auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);
auto result = RandomSampleInfo::create(result_desc, probs_desc);
CHECK_RESULT(result);
auto info = result.take();
size_t workspace_size;
#define CASE_P(CASE, Tidx, Tval) \
case CASE: { \
auto workspace_result = calculateWorkspace<Tidx, Tval>(info.n); \
CHECK_RESULT(workspace_result); \
workspace_size = workspace_result.take(); \
} break
#define CASE_I(CASE, Tidx) \
case CASE: \
switch (info.dt_p) { \
CASE_P(INFINI_DTYPE_F16, Tidx, half); \
CASE_P(INFINI_DTYPE_F32, Tidx, float); \
CASE_P(INFINI_DTYPE_F64, Tidx, double); \
default: \
abort(); \
} \
break
switch (info.dt_i) {
CASE_I(INFINI_DTYPE_I8, int8_t);
CASE_I(INFINI_DTYPE_I16, int16_t);
CASE_I(INFINI_DTYPE_I32, int32_t);
CASE_I(INFINI_DTYPE_I64, int64_t);
CASE_I(INFINI_DTYPE_U8, uint8_t);
CASE_I(INFINI_DTYPE_U16, uint16_t);
CASE_I(INFINI_DTYPE_U32, uint32_t);
CASE_I(INFINI_DTYPE_U64, uint64_t);
default:
abort();
}
#undef CASE_I
#undef CASE_P
*desc_ptr = new Descriptor(
info,
workspace_size,
new Opaque{handle->internal()},
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
size_t Descriptor::minWorkspaceSize() const {
return _min_workspace_size;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *result,
const void *probs,
float random_val,
float topp,
int topk,
float temperature,
void *stream) const {
if (workspace_size < _min_workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
auto block_size = _opaque->internal->blockSizeX();
Calculate::calculate<Algo>(
Algo{block_size}, _info, workspace, workspace_size,
result, probs,
random_val, topp, topk, temperature,
stream);
return INFINI_STATUS_SUCCESS;
}
} // namespace op::random_sample::cuda
#ifndef __RANDOM_SAMPLE_CUDA_CUH__
#define __RANDOM_SAMPLE_CUDA_CUH__
#include "../random_sample.h"
DESCRIPTOR(cuda)
#endif // __RANDOM_SAMPLE_CUDA_CUH__
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include "infinicore.h"
#include <cub/device/device_radix_sort.cuh>
#include <cub/device/device_reduce.cuh>
#include <cub/device/device_scan.cuh>
namespace op::random_sample::cuda {
// ↓↓↓ Thin wrappers over the cub device API: fewer template parameters, easier to call
template <class T>
static cudaError argMax_(
cub::KeyValuePair<int, T> *kv_pair,
const T *logits,
int n,
void *workspace_ptr,
size_t &workspace_len,
cudaStream_t stream) {
return cub::DeviceReduce::ArgMax(
workspace_ptr, workspace_len,
logits, kv_pair, n,
stream);
}
template <class Tval, class Tidx>
static cudaError radixSort(
void *workspace_ptr, size_t &workspace_len,
const Tval *key_in, Tval *key_out,
const Tidx *val_in, Tidx *val_out,
int n,
cudaStream_t stream) {
return cub::DeviceRadixSort::SortPairsDescending(
workspace_ptr, workspace_len,
key_in, key_out,
val_in, val_out,
n,
0, sizeof(Tval) * 8,
stream);
}
template <class T>
static cudaError inclusiveSum(
void *workspace_ptr, size_t &workspace_len,
T *data, int n,
cudaStream_t stream) {
return cub::DeviceScan::InclusiveSum(
workspace_ptr, workspace_len,
data, data, n,
stream);
}
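// Note the two-phase cub convention used by these wrappers: when the
// temp-storage pointer is nullptr, the call only writes the required
// temp-storage size into workspace_len and does no work.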
// ↑↑↑ Thin wrappers over the cub device API
// ↓↓↓ Workspace size calculation
// Round sizes up to a 256-byte boundary
static constexpr size_t align256(size_t size) {
return (size + 255) & (~255);
}
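// e.g. align256(1) == 256, align256(256) == 256, align256(257) == 512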
template <class Tidx, class Tval>
utils::Result<size_t> calculateWorkspace(size_t n_) {
const auto n = static_cast<int>(n_);
size_t argmax;
CHECK_CUDA(argMax_<Tval>(
nullptr, nullptr, n,
nullptr, argmax,
nullptr));
// the first 256 bytes are reserved for the kv pair
argmax += 256;
// indices
size_t size_random = align256(sizeof(Tidx) * n);
// sorted
size_random += align256(sizeof(Tval) * n);
// indices_out
size_random += align256(sizeof(Tidx) * n);
// cub device api
size_t size_radix_sort;
CHECK_CUDA((radixSort<Tval, Tidx>(
nullptr, size_radix_sort,
nullptr, nullptr,
nullptr, nullptr,
n,
nullptr)));
size_t size_inclusive_sum;
CHECK_CUDA(inclusiveSum<Tval>(
nullptr, size_inclusive_sum,
nullptr, n,
nullptr));
size_random += cub::Max()(size_radix_sort, size_inclusive_sum);
return utils::Result<size_t>(cub::Max()(argmax, size_random));
}
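// Resulting layout (a sketch): the argmax path needs 256 bytes for the kv
// pair followed by cub temp storage; the random path needs three 256-aligned
// arrays (indices, sorted, indices_out) followed by the larger of the
// radix-sort and inclusive-sum temp storage. The returned size is the max of
// the two paths.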
// ↑↑↑ Workspace size calculation
// ↓↓↓ Map fp16_t to the device half type via specialization
template <class Tval>
struct CudaTval {
using Type = Tval;
};
template <>
struct CudaTval<fp16_t> {
using Type = half;
};
// ↑↑↑ Map fp16_t to the device half type via specialization
// ↓↓↓ Small kernels used by the sampling pass
// The cub::DeviceReduce::ArgMax shipped with CUDA Toolkit 11.x only accepts a
// cub::KeyValuePair<int, Tval> output; this kernel extracts the index from it.
template <class Tidx, class Tval>
static __global__ void castIdx(Tidx *result, const cub::KeyValuePair<int, Tval> *kv_pair) {
*result = kv_pair->key;
}
// Fill the index array that the sort expects
template <class Tidx>
static __global__ void fillIndices(Tidx *indices, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) {
indices[i] = i;
}
}
// The softmax used for random sampling reduces to a simple elementwise map:
// since the data is already sorted descending, the maximum is the first element.
// The first element is read by multiple blocks, so it must not be written here.
template <class T>
static __global__ void partialSoftmaxKernel(
T *__restrict__ data, int n,
float temperature) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (0 < i && i < n) {
float max = __ldg(data);
data[i] = (T)expf(((float)data[i] - max) / temperature);
}
}
// Write the first element as 1, i.e. exp(0)
template <class T>
static __global__ void setSoftmaxMaxKernel(
T *__restrict__ data) {
*data = 1;
}
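// e.g. with sorted = [4, 2, 1] and temperature = 1, the two kernels above
// leave [1, exp(-2), exp(-3)] in place.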
// Sample by scanning the sorted prefix sums with a plain for loop.
// This kernel exists only to avoid copying the data back to the CPU.
template <class Tval, class Tidx>
static __global__ void randomSampleKernel(
Tidx *__restrict__ result,
const Tval *__restrict__ sorted,
const Tidx *__restrict__ indices_out,
size_t n,
float random, float topp, size_t topk) {
topk = cub::Min()(topk, n);
auto p = (Tval)(random * cub::Min()(topp * (float)sorted[n - 1], (float)sorted[topk - 1]));
for (size_t i = 0;; ++i) {
if ((sorted[i]) >= p) {
*result = indices_out[i];
return;
}
}
}
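// After inclusiveSum, sorted[i] holds the prefix sum of the softmax weights,
// so the cutoff p = random * min(topp * sorted[n-1], sorted[topk-1]) applies
// top-p and top-k jointly; the first prefix sum >= p selects the sample.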
// ↑↑↑ Small kernels used by the sampling pass
struct Algo {
int block_size;
template <class Tidx, class Tval_>
infiniStatus_t argmax(
void *workspace, size_t workspace_size,
void *result, const void *probs, size_t n,
void *stream_) const {
using Tval = typename CudaTval<Tval_>::Type;
auto stream = (cudaStream_t)stream_;
auto logits = (Tval *)probs;
auto kv_pair = (cub::KeyValuePair<int, Tval> *)workspace;
workspace = (void *)((char *)workspace + 256);
workspace_size -= 256;
CHECK_CUDA(argMax_(
kv_pair,
logits,
n,
workspace,
workspace_size, stream));
castIdx<<<1, 1, 0, stream>>>((Tidx *)result, kv_pair);
return INFINI_STATUS_SUCCESS;
}
template <class Tidx, class Tval_>
infiniStatus_t random(
void *workspace_, size_t workspace_size,
void *result_, const void *probs, size_t n,
float random_val, float topp, int topk, float temperature,
void *stream_) const {
using Tval = typename CudaTval<Tval_>::Type;
auto stream = (cudaStream_t)stream_;
auto logits = (Tval *)probs;
auto result = (Tidx *)result_;
auto workspace = reinterpret_cast<size_t>(workspace_);
auto workspace_end = workspace + workspace_size;
auto indices = reinterpret_cast<Tidx *>(workspace);
workspace += align256(sizeof(Tidx) * n);
auto sorted = reinterpret_cast<Tval *>(workspace);
workspace += align256(sizeof(Tval) * n);
auto indices_out = reinterpret_cast<Tidx *>(workspace);
workspace += align256(sizeof(Tidx) * n);
workspace_ = reinterpret_cast<void *>(workspace);
workspace_size = workspace_end - workspace;
auto block = cub::Min()((size_t)block_size, n);
auto grid = (n + block - 1) / block;
// sort
fillIndices<<<grid, block, 0, stream>>>(indices, n);
CHECK_CUDA(radixSort(
workspace_, workspace_size,
logits, sorted,
indices, indices_out,
n,
stream));
// softmax
partialSoftmaxKernel<<<grid, block, 0, stream>>>(sorted, n, temperature);
setSoftmaxMaxKernel<<<1, 1, 0, stream>>>(sorted);
// sum
CHECK_CUDA(inclusiveSum(
workspace_, workspace_size,
sorted, n,
stream));
// sample
randomSampleKernel<<<1, 1, 0, stream>>>(
result,
sorted, indices_out, n,
random_val, topp, topk);
return INFINI_STATUS_SUCCESS;
}
};
} // namespace op::random_sample::cuda
#ifndef __RANDOM_SAMPLE_INFO_H__
#define __RANDOM_SAMPLE_INFO_H__
#include "../../../utils.h"
#include "../../tensor.h"
namespace op::random_sample {
struct RandomSampleInfo {
infiniDtype_t dt_i, dt_p;
size_t n;
static utils::Result<RandomSampleInfo> create(
infiniopTensorDescriptor_t result_desc,
infiniopTensorDescriptor_t probs_desc) {
auto dt_i = result_desc->dtype();
auto dt_p = probs_desc->dtype();
CHECK_DTYPE_ANY_INT(dt_i);
CHECK_DTYPE(dt_p, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_OR_RETURN(result_desc->ndim() == 0, INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_OR_RETURN(probs_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_OR_RETURN(probs_desc->stride(0) == 1, INFINI_STATUS_BAD_TENSOR_STRIDES);
return utils::Result<RandomSampleInfo>({dt_i, dt_p, probs_desc->dim(0)});
}
};
} // namespace op::random_sample
#endif // __RANDOM_SAMPLE_INFO_H__
......@@ -5,6 +5,9 @@
#ifdef ENABLE_CPU_API
#include "cpu/random_sample_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/random_sample_cuda.cuh"
#endif
__C infiniStatus_t infiniopCreateRandomSampleDescriptor(
infiniopHandle_t handle,
......@@ -25,6 +28,9 @@ __C infiniStatus_t infiniopCreateRandomSampleDescriptor(
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -38,9 +44,10 @@ __C infiniStatus_t infiniopGetRandomSampleWorkspaceSize(
size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: { \
using Ptr = const op::random_sample::NAMESPACE::Descriptor *; \
*size = reinterpret_cast<Ptr>(desc)->minWorkspaceSize(); \
} \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
......@@ -48,6 +55,9 @@ __C infiniStatus_t infiniopGetRandomSampleWorkspaceSize(
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
GET(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -82,6 +92,9 @@ __C infiniStatus_t infiniopRandomSample(
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -103,6 +116,9 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor(
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
#ifndef __RANDOM_SAMPLE_H__
#define __RANDOM_SAMPLE_H__
#include "../../../utils.h"
#include "../../operator.h"
#include "info.h"
#define DESCRIPTOR(NAMESPACE) \
\
......@@ -11,22 +11,18 @@
struct Opaque; \
Opaque *_opaque; \
\
RandomSampleInfo _info; \
size_t _min_workspace_size; \
\
Descriptor( \
RandomSampleInfo info, \
size_t min_workspace_size, \
Opaque *opaque, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_min_workspace_size(min_workspace_size) {} \
\
public: \
......@@ -53,4 +49,96 @@
}; \
}
namespace op::random_sample {
struct CalculateArgs {
void *workspace;
size_t workspace_size;
void *result;
const void *probs;
float random_val, topp, temperature;
int topk;
void *stream;
};
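// Bundling the per-call arguments keeps the dtype-dispatch helpers below down
// to a single extra parameter each.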
class Calculate {
template <class Tidx, class Tval, class Algo>
static void switch_f(Algo algo, size_t n, CalculateArgs args) {
if (args.random_val == 0 || args.topp == 0 || args.topk == 1 || args.temperature == 0) {
algo.template argmax<Tidx, Tval>(
args.workspace, args.workspace_size,
args.result, args.probs, n,
args.stream);
} else {
algo.template random<Tidx, Tval>(
args.workspace, args.workspace_size,
args.result, args.probs, n,
args.random_val, args.topp, args.topk, args.temperature,
args.stream);
}
}
template <class Tidx, class Algo>
static void switch_val(
Algo algo,
infiniDtype_t dt_p, size_t n, CalculateArgs args) {
switch (dt_p) {
case INFINI_DTYPE_F16:
switch_f<Tidx, fp16_t>(algo, n, args);
break;
case INFINI_DTYPE_F32:
switch_f<Tidx, float>(algo, n, args);
break;
case INFINI_DTYPE_F64:
switch_f<Tidx, double>(algo, n, args);
break;
default:
// unreachable
std::abort();
}
}
public:
template <class Algo>
static infiniStatus_t calculate(
Algo algo,
RandomSampleInfo info,
void *workspace, size_t workspace_size,
void *result, const void *probs,
float random_val, float topp, int topk, float temperature,
void *stream) {
#define CASE(DT_VAL, DT_TYP) \
case DT_VAL: \
switch_val<DT_TYP>( \
algo, info.dt_p, info.n, \
{workspace, workspace_size, \
result, probs, \
random_val, topp, temperature, topk, \
stream}); \
break
switch (info.dt_i) {
CASE(INFINI_DTYPE_I8, int8_t);
CASE(INFINI_DTYPE_I16, int16_t);
CASE(INFINI_DTYPE_I32, int32_t);
CASE(INFINI_DTYPE_I64, int64_t);
CASE(INFINI_DTYPE_U8, uint8_t);
CASE(INFINI_DTYPE_U16, uint16_t);
CASE(INFINI_DTYPE_U32, uint32_t);
CASE(INFINI_DTYPE_U64, uint64_t);
default:
// unreachable
std::abort();
}
#undef CASE
return INFINI_STATUS_SUCCESS;
}
};
} // namespace op::random_sample
#endif // __RANDOM_SAMPLE_H__
#include "rearrange_ascend.h"
#include "../../../devices/ascend/common_ascend.h"
#include <aclnnop/aclnn_copy.h>
namespace op::rearrange::ascend {
struct Descriptor::Opaque {
aclnnTensorDescriptor_t dst;
aclnnTensorDescriptor_t src;
void *workspace; // aclnnInplaceCopy workspace
uint64_t workspace_size;
~Opaque() {
delete dst;
delete src;
aclrtFree(workspace);
}
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc) {
auto handle = reinterpret_cast<device::ascend::Handle *>(handle_);
auto dtype = y_desc->dtype();
auto ndim = y_desc->ndim();
auto shape = y_desc->shape();
CHECK_API_OR(x_desc->dtype(), dtype, return INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_API_OR(x_desc->ndim(), ndim, return INFINI_STATUS_BAD_TENSOR_SHAPE);
for (size_t i = 0; i < ndim; ++i) {
CHECK_API_OR(x_desc->shape()[i], shape[i], return INFINI_STATUS_BAD_TENSOR_SHAPE);
}
auto dst_strides = y_desc->strides();
auto src_strides = x_desc->strides();
auto element_size = infiniSizeOf(dtype);
auto result = utils::RearrangeMeta::create(shape.data(), dst_strides.data(), src_strides.data(), ndim, element_size);
CHECK_RESULT(result);
aclnnTensorDescriptor_t dst = new aclnnTensorDescriptor(y_desc);
aclnnTensorDescriptor_t src = new aclnnTensorDescriptor(x_desc);
uint64_t workspace_size = 0;
aclOpExecutor *executor = nullptr;
void *workspace = nullptr;
CHECK_ACL(aclnnInplaceCopyGetWorkspaceSize(dst->tensor, src->tensor,
&workspace_size, &executor));
if (workspace_size != 0) {
CHECK_ACL(aclrtMalloc(&workspace, workspace_size, ACL_MEM_MALLOC_HUGE_FIRST));
}
*desc_ptr = new Descriptor(
result.take(),
new Opaque{
dst,
src,
workspace,
workspace_size},
handle->device,
handle->device_id);
// Delete useless executor
aclDestroyAclOpExecutor(executor);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *y,
const void *x,
void *stream) const {
auto tdst = _opaque->dst->tensor;
auto tsrc = _opaque->src->tensor;
uint64_t workspace_size = 0;
aclOpExecutor *executor = nullptr;
CHECK_ACL(aclnnInplaceCopyGetWorkspaceSize(tdst, tsrc, &workspace_size, &executor));
AclSetTensorAddr(executor, 0, tdst, y);
AclSetTensorAddr(executor, 1, tsrc, (void *)x);
// Execute InplaceCopy
CHECK_ACL(aclnnInplaceCopy(_opaque->workspace, _opaque->workspace_size,
executor, stream));
return INFINI_STATUS_SUCCESS;
}
} // namespace op::rearrange::ascend
#ifndef __REARRANGE_ASCEND_H__
#define __REARRANGE_ASCEND_H__
#include "../rearrange.h"
DESCRIPTOR(ascend)
#endif // __REARRANGE_ASCEND_H__
......@@ -5,6 +5,9 @@
#ifdef ENABLE_CPU_API
#include "cpu/rearrange_cpu.h"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/rearrange_ascend.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/rearrange_cuda.cuh"
......@@ -29,6 +32,9 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor(
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_ASCEND_API
CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
......@@ -57,6 +63,9 @@ __C infiniStatus_t infiniopRearrange(
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_ASCEND_API
CALCULATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
......@@ -82,6 +91,9 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor(
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_ASCEND_API
DELETE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_CUDA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
......
......@@ -10,12 +10,15 @@ struct Descriptor::Opaque {
aclnnTensorDescriptor_t w;
aclnnTensorDescriptor_t rstd;
size_t workspaceSize;
aclOpExecutor *executor;
~Opaque() {
delete y;
delete x;
delete w;
delete rstd;
aclDestroyAclOpExecutor(executor);
}
};
......@@ -62,17 +65,16 @@ infiniStatus_t Descriptor::create(
// Get WorkspaceSize and set executor
CHECK_ACL(aclnnRmsNormGetWorkspaceSize(tx, tw, static_cast<double>(epsilon), ty, trstd, &workspace_size, &executor));
CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));
auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);
size_t all_workspace_size = workspace_size + rstd->numel() * aclDataTypeSize(rstd->dataType);
*desc_ptr = new Descriptor(
new Opaque{y, x, w, rstd, workspace_size, executor},
std::move(info),
all_workspace_size,
handle_ascend->device, handle_ascend->device_id);
return INFINI_STATUS_SUCCESS;
}
......@@ -88,21 +90,16 @@ infiniStatus_t Descriptor::calculate(
auto tx = _opaque->x->tensor;
auto ty = _opaque->y->tensor;
auto trstd = _opaque->rstd->tensor;
void *rstdPtr = (void *)((uint8_t *)workspace + _opaque->workspaceSize);
auto unit = infiniSizeOf(_info.atype);
AclSetTensorAddr(_opaque->executor, 1, tw, (void *)w);
AclSetTensorAddr(_opaque->executor, 3, trstd, rstdPtr);
for (size_t i = 0; i < (_info.shape)[0]; ++i) {
AclSetTensorAddr(_opaque->executor, 0, tx, ((char *)x) + i * (_info.x_strides)[0] * unit);
AclSetTensorAddr(_opaque->executor, 2, ty, ((char *)y) + i * (_info.y_strides)[0] * unit);
CHECK_ACL(aclnnRmsNorm(workspace, _opaque->workspaceSize, _opaque->executor, stream));
}
return INFINI_STATUS_SUCCESS;
}
......
#ifndef __RMS_NORM_KUNLUN_KERNEL_XPU__
#define __RMS_NORM_KUNLUN_KERNEL_XPU__
#include "../../../devices/kunlun/kunlun_common.h"
#include "../../../devices/kunlun/kunlun_kernel_common.h"
#include "../../../reduce/kunlun/reduce_kunlun.h"
using namespace device::kunlun::kernel;
// Element wise mul used in x * w
static inline __device__ void elementwiseMulRms(float *x, float *w, float *y, int count, float rms) {
int remain = count % 16;
......
#include "rope_ascend.h"
#include "../../../devices/ascend/common_ascend.h"
namespace op::rope::ascend {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t pos_desc,
infiniopTensorDescriptor_t sin_desc,
infiniopTensorDescriptor_t cos_desc) {
auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);
auto result = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
CHECK_RESULT(result);
size_t workspace_size = 0;
*desc_ptr = new Descriptor(result.take(), workspace_size, nullptr, handle_ascend->device, handle_ascend->device_id);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *pos_ids,
const void *sin_table,
const void *cos_table,
void *stream) const {
CHECK_DTYPE(_info.data_type, INFINI_DTYPE_F32, INFINI_DTYPE_F16);
auto data_type = _info.data_type;
auto pos_type = _info.pos_type;
auto seq_len = _info.seqlen;
auto nhead = _info.nhead;
auto dhead = _info.dhead;
auto y_stride_seqlen = _info.y_stride_seqlen;
auto y_stride_nhead = _info.y_stride_nhead;
auto x_stride_seqlen = _info.x_stride_seqlen;
auto x_stride_nhead = _info.x_stride_nhead;
return rope_kernel_launch(y, (void *)x, (void *)pos_ids, (void *)sin_table, (void *)cos_table, seq_len, nhead, dhead, data_type, pos_type, y_stride_seqlen, y_stride_nhead, x_stride_seqlen, x_stride_nhead, stream);
}
} // namespace op::rope::ascend
#ifndef __ACLNN_ROPE_H__
#define __ACLNN_ROPE_H__
#include "../rope.h"
extern "C" infiniStatus_t rope_kernel_launch(
void *y,
void *x,
void *pos,
void *sin,
void *cos,
size_t seq_len,
size_t nhead,
size_t dhead,
infiniDtype_t data_type,
infiniDtype_t pos_type,
ptrdiff_t y_stride_seqlen,
ptrdiff_t y_stride_nhead,
ptrdiff_t x_stride_seqlen,
ptrdiff_t x_stride_nhead,
void *stream);
DESCRIPTOR(ascend)
#endif // __ACLNN_ROPE_H__
#include "../../../devices/ascend/ascend_kernel_common.h"
using namespace AscendC;
template <typename T, typename U>
class RoPEKernel {
public:
__aicore__ inline RoPEKernel() {}
// Init op
// pos position vector
// x input tensor
// y output tensor
// tensor shape [nt, nh, dh]
// make block_num = nh, tile_len = dh
__aicore__ inline void init(GM_ADDR y,
GM_ADDR x,
GM_ADDR pos,
GM_ADDR sin,
GM_ADDR cos,
size_t dh,
ptrdiff_t st_ynt,
ptrdiff_t st_ynh,
ptrdiff_t st_xnt,
ptrdiff_t st_xnh);
__aicore__ inline void process(size_t seq_len);
private:
// Copy a tile into UB
__aicore__ inline void copyIn(size_t i);
__aicore__ inline void compute(size_t i);
__aicore__ inline void copyOut(size_t i);
private:
TPipe pipe;
TQue<QuePosition::VECIN, BUFFER_NUM> _in_que;
TQue<QuePosition::VECIN, BUFFER_NUM> _sin_que;
TQue<QuePosition::VECIN, BUFFER_NUM> _cos_que;
TQue<QuePosition::VECOUT, BUFFER_NUM> _out_que;
TBuf<TPosition::VECCALC> _tmp_odd_buf;
TBuf<TPosition::VECCALC> _tmp_even_buf;
TBuf<TPosition::VECCALC> _tmp_odd_buf1;
TBuf<TPosition::VECCALC> _tmp_odd_buf2;
TBuf<TPosition::VECCALC> _tmp_even_buf1;
TBuf<TPosition::VECCALC> _tmp_even_buf2;
GlobalTensor<T> _x_gm, _y_gm;
GlobalTensor<U> _p_gm;
GlobalTensor<T> _sin_gm;
GlobalTensor<T> _cos_gm;
size_t _block_idx;
size_t _tile_len;
size_t _copy_len;
size_t _half_copy_len;
// stridey[_st_ynt, _st_ynh, 1]
ptrdiff_t _st_ynt;
ptrdiff_t _st_ynh;
// stridex[_st_xnt, _st_xnh, 1]
ptrdiff_t _st_xnt;
ptrdiff_t _st_xnh;
};
template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::init(GM_ADDR y,
GM_ADDR x,
GM_ADDR pos,
GM_ADDR sin,
GM_ADDR cos,
size_t dh,
ptrdiff_t st_ynt,
ptrdiff_t st_ynh,
ptrdiff_t st_xnt,
ptrdiff_t st_xnh) {
this->_tile_len = dh;
this->_st_ynt = st_ynt;
this->_st_ynh = st_ynh;
this->_st_xnt = st_xnt;
this->_st_xnh = st_xnh;
_copy_len = alignTileLen<T>(dh, BYTE_ALIGN);
_half_copy_len = alignTileLen<T>(dh / 2, BYTE_ALIGN);
_block_idx = GetBlockIdx();
// Init global buffer
_x_gm.SetGlobalBuffer((__gm__ T *)x);
_p_gm.SetGlobalBuffer((__gm__ U *)pos);
_sin_gm.SetGlobalBuffer((__gm__ T *)sin);
_cos_gm.SetGlobalBuffer((__gm__ T *)cos);
_y_gm.SetGlobalBuffer((__gm__ T *)y);
// Init Queue buffer
pipe.InitBuffer(_in_que, BUFFER_NUM, _copy_len * sizeof(T));
pipe.InitBuffer(_out_que, BUFFER_NUM, _tile_len * sizeof(T));
pipe.InitBuffer(_sin_que, BUFFER_NUM, _half_copy_len * sizeof(T));
pipe.InitBuffer(_cos_que, BUFFER_NUM, _half_copy_len * sizeof(T));
pipe.InitBuffer(_tmp_odd_buf, _tile_len / 2 * sizeof(T));
pipe.InitBuffer(_tmp_even_buf, _tile_len / 2 * sizeof(T));
pipe.InitBuffer(_tmp_odd_buf1, _tile_len / 2 * sizeof(T));
pipe.InitBuffer(_tmp_odd_buf2, _tile_len / 2 * sizeof(T));
pipe.InitBuffer(_tmp_even_buf1, _tile_len / 2 * sizeof(T));
pipe.InitBuffer(_tmp_even_buf2, _tile_len / 2 * sizeof(T));
}
template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::copyIn(size_t i) {
LocalTensor<T> input_ub = _in_que.AllocTensor<T>();
LocalTensor<T> sin_ub = _sin_que.AllocTensor<T>();
LocalTensor<T> cos_ub = _cos_que.AllocTensor<T>();
// Get idx of current tile in total input
auto idx = i * _st_xnt + _block_idx * _st_xnh;
// Copy tile current tile into UB
DataCopy(input_ub, _x_gm[idx], _copy_len);
// Copy sin cos tile
auto pos_idx = _p_gm(i);
DataCopy(sin_ub, _sin_gm[pos_idx * _tile_len / 2], _half_copy_len);
DataCopy(cos_ub, _cos_gm[pos_idx * _tile_len / 2], _half_copy_len);
// Push in operands
_in_que.EnQue(input_ub);
_sin_que.EnQue(sin_ub);
_cos_que.EnQue(cos_ub);
}
template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::compute(size_t i) {
LocalTensor<T> input_ub = _in_que.DeQue<T>();
LocalTensor<T> sin_ub = _sin_que.DeQue<T>();
LocalTensor<T> cos_ub = _cos_que.DeQue<T>();
LocalTensor<T> output_ub = _out_que.AllocTensor<T>();
LocalTensor<T> tmp_odd = _tmp_odd_buf.Get<T>();
LocalTensor<T> tmp_even = _tmp_even_buf.Get<T>();
LocalTensor<T> tmp_odd1 = _tmp_odd_buf1.Get<T>();
LocalTensor<T> tmp_odd2 = _tmp_odd_buf2.Get<T>();
LocalTensor<T> tmp_even1 = _tmp_even_buf1.Get<T>();
LocalTensor<T> tmp_even2 = _tmp_even_buf2.Get<T>();
// separate odd and even bit elements
uint64_t rsvdCnt = 0;
GatherMaskParams gMaskParams = {
1,
static_cast<uint16_t>((_tile_len * sizeof(T) + 255) / 256), // repeatTimes, must not exceed 255
8,
8,
};
GatherMask<T>(tmp_odd, input_ub, 1, false, 0, gMaskParams, rsvdCnt);
GatherMask<T>(tmp_even, input_ub, 2, false, 0, gMaskParams, rsvdCnt);
PipeBarrier<PIPE_V>();
// compute odd bit elements
// y_odd = x_odd * cos - x_even * sin
Mul<T>(tmp_odd1, tmp_odd, cos_ub, _tile_len / 2);
Mul<T>(tmp_odd2, tmp_even, sin_ub, _tile_len / 2);
PipeBarrier<PIPE_V>();
Sub<T>(tmp_odd1, tmp_odd1, tmp_odd2, _tile_len / 2);
// compute even bit elements
// y_even = x_odd * sin + x_even * cos
Mul<T>(tmp_even1, tmp_odd, sin_ub, _tile_len / 2);
Mul<T>(tmp_even2, tmp_even, cos_ub, _tile_len / 2);
PipeBarrier<PIPE_V>();
Add<T>(tmp_even1, tmp_even1, tmp_even2, _tile_len / 2);
// combine odd and even bit elements
for (uint32_t j = 0; j < _tile_len / 2; j += 1) {
output_ub(j * 2) = tmp_odd1(j);
output_ub(j * 2 + 1) = tmp_even1(j);
}
_out_que.EnQue<T>(output_ub);
_in_que.FreeTensor(input_ub);
_sin_que.FreeTensor(sin_ub);
_cos_que.FreeTensor(cos_ub);
}
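// For one (odd, even) element pair at sin/cos table row p, e.g. for j = 0:
//   y[0] = x[0] * cos[p][0] - x[1] * sin[p][0]
//   y[1] = x[0] * sin[p][0] + x[1] * cos[p][0]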
template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::copyOut(size_t i) {
LocalTensor<T> output_ub = _out_que.DeQue<T>();
auto idy = i * _st_ynt + _block_idx * _st_ynh;
DataCopyExtParams params = {1, static_cast<uint32_t>(_tile_len * sizeof(T)), 0, 0, 0};
DataCopyPad(_y_gm[idy], output_ub, params);
_out_que.FreeTensor(output_ub);
}
template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::process(size_t seq_len) {
for (size_t i = 0; i < seq_len; ++i) {
copyIn(i);
compute(i);
copyOut(i);
}
}
#define ROPE_KERNEL_INIT_ARGS y, x, pos, sin, cos, dhead, \
y_stride_seqlen, y_stride_nhead, \
x_stride_seqlen, x_stride_nhead
#define CASE_POSTYPE(POS_TYPE_ENUM, TYPE, POS_T) \
case POS_TYPE_ENUM: { \
RoPEKernel<TYPE, POS_T> op; \
op.init(ROPE_KERNEL_INIT_ARGS); \
op.process(seq_len); \
break; \
}
#define ROPE_KERNEL(TYPE, POSTYPE) \
switch (POSTYPE) { \
CASE_POSTYPE(INFINI_DTYPE_I8, TYPE, int8_t) \
CASE_POSTYPE(INFINI_DTYPE_I16, TYPE, int16_t) \
CASE_POSTYPE(INFINI_DTYPE_I32, TYPE, int32_t) \
CASE_POSTYPE(INFINI_DTYPE_I64, TYPE, int64_t) \
CASE_POSTYPE(INFINI_DTYPE_U8, TYPE, uint8_t) \
CASE_POSTYPE(INFINI_DTYPE_U16, TYPE, uint16_t) \
CASE_POSTYPE(INFINI_DTYPE_U32, TYPE, uint32_t) \
CASE_POSTYPE(INFINI_DTYPE_U64, TYPE, uint64_t) \
default: \
break; \
}
#define DEFINE_ROPE_KERNEL(KERNEL_NAME, TYPE) \
__global__ __aicore__ void KERNEL_NAME(GM_ADDR y, \
GM_ADDR x, \
GM_ADDR pos, \
GM_ADDR sin, \
GM_ADDR cos, \
size_t seq_len, \
size_t dhead, \
ptrdiff_t y_stride_seqlen, \
ptrdiff_t y_stride_nhead, \
ptrdiff_t x_stride_seqlen, \
ptrdiff_t x_stride_nhead, \
int32_t pos_type) { \
ROPE_KERNEL(TYPE, pos_type) \
}
DEFINE_ROPE_KERNEL(rope_kernel_float, float)
DEFINE_ROPE_KERNEL(rope_kernel_half, half)
#undef DEFINE_ROPE_KERNEL
#undef ROPE_KERNEL
#undef CASE_POSTYPE
#undef ROPE_KERNEL_INIT_ARGS
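// Launch configuration: one AscendC block per attention head (block_num ==
// nhead); each block walks the sequence dimension inside process().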
extern "C" infiniStatus_t rope_kernel_launch(
void *y,
void *x,
void *pos,
void *sin,
void *cos,
size_t seq_len,
size_t nhead,
size_t dhead,
infiniDtype_t dtype,
infiniDtype_t pos_type,
ptrdiff_t y_stride_seqlen,
ptrdiff_t y_stride_nhead,
ptrdiff_t x_stride_seqlen,
ptrdiff_t x_stride_nhead,
void *stream) {
#define LAUNCH_ROPE_KERNEL(DTYPE_ENUM, KERNEL_NAME) \
case DTYPE_ENUM: \
KERNEL_NAME<<<nhead, nullptr, stream>>>(y, x, pos, sin, cos, \
seq_len, \
dhead, \
y_stride_seqlen, \
y_stride_nhead, \
x_stride_seqlen, \
x_stride_nhead, \
pos_type); \
return INFINI_STATUS_SUCCESS;
switch (dtype) {
LAUNCH_ROPE_KERNEL(INFINI_DTYPE_F16, rope_kernel_half)
LAUNCH_ROPE_KERNEL(INFINI_DTYPE_F32, rope_kernel_float)
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
......@@ -8,6 +8,9 @@
#ifdef ENABLE_CUDA_API
#include "cuda/rope_cuda.cuh"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/rope_ascend.h"
#endif
__C infiniStatus_t infiniopCreateRoPEDescriptor(
infiniopHandle_t handle,
......@@ -43,12 +46,8 @@ __C infiniStatus_t infiniopCreateRoPEDescriptor(
pos_ids, sin_table, cos_table);
}
#endif
#ifdef ENABLE_ASCEND_API
CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
......@@ -90,10 +89,8 @@ __C infiniStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc,
return bangGetRoPEWorkspaceSize((RoPEBangDescriptor_t)desc, size);
}
#endif
#ifdef ENABLE_ASCEND_API
GET(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
......@@ -141,12 +138,8 @@ __C infiniStatus_t infiniopRoPE(
t, pos_ids, sin_table, cos_table, stream);
}
#endif
#ifdef ENABLE_ASCEND_API
CALCULATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
......@@ -187,10 +180,8 @@ infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) {
return bangDestroyRoPEDescriptor((RoPEBangDescriptor_t)desc);
}
#endif
#ifdef ENABLE_ASCEND_API
DELETE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
......
#include "swiglu_ascend.h"
#include "../../../devices/ascend/common_ascend.h"
namespace op::swiglu::ascend {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(infiniopHandle_t handle, Descriptor **desc_ptr,
infiniopTensorDescriptor_t c_desc,
std::vector<infiniopTensorDescriptor_t> input_descs) {
auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);
auto dtype = c_desc->dtype();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
const auto &a_desc = input_descs[0];
const auto &b_desc = input_descs[1];
auto result = SwigluInfo::create(c_desc, a_desc, b_desc);
CHECK_RESULT(result);
SwigluInfo info = result.take();
// https://www.hiascend.com/document/detail/zh/canncommercial/800/apiref/ascendcopapi/atlasascendc_api_07_0777.html
size_t workspace_size = 0;
*desc_ptr = new Descriptor(std::move(info), workspace_size, handle_ascend->device, handle_ascend->device_id);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(void *workspace,
size_t workspace_size,
void *c,
std::vector<const void *> inputs,
void *stream) const {
auto batch = _info.ndim == 2 ? 1 : _info.shape[0];
auto seq_len = _info.ndim == 2 ? _info.shape[0] : _info.shape[1];
auto hidden_size = _info.shape[_info.ndim - 1];
auto stride_batch_c = _info.ndim == 2 ? 1 : _info.c_strides[0];
auto stride_batch_a = _info.ndim == 2 ? 1 : _info.a_strides[0];
auto stride_batch_b = _info.ndim == 2 ? 1 : _info.b_strides[0];
auto stride_seq_c = _info.ndim == 2 ? _info.c_strides[0] : _info.c_strides[1];
auto stride_seq_a = _info.ndim == 2 ? _info.a_strides[0] : _info.a_strides[1];
auto stride_seq_b = _info.ndim == 2 ? _info.b_strides[0] : _info.b_strides[1];
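// For 2-D inputs ([seq_len, hidden]) the tensors are treated as a single
// batch: the batch stride degenerates to 1 and the leading stride becomes the
// sequence stride; for 3-D inputs the leading stride is the batch stride.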
auto status = swiglu_kernel_launch(c, (void *)inputs[0], (void *)inputs[1], _info.dtype, batch, seq_len, hidden_size, stride_batch_c, stride_batch_a, stride_batch_b, stride_seq_c, stride_seq_a, stride_seq_b, stream);
return status;
}
} // namespace op::swiglu::ascend