Unverified Commit 784139b9 authored by thatPepe's avatar thatPepe Committed by GitHub
Browse files

Merge pull request #990 from InfiniTensor/demo131

Demo-131 Cuda graph with optimized paged attention
parents 3c8fb3c0 1d6527cb
......@@ -140,20 +140,9 @@ void cutlass_int8_scaled_mm(
typename Gemm::Arguments args{
{m, n, k}, {a_ptr, lda}, {b_ptr, ldb}, {b_s_ptr, 0}, {a_s_ptr, 0}, {bias_ptr, ldc}, {o_ptr, ldd}, visitor_args};
/* 需要先看看是否需要workspace */
// auto workspace = torch::empty(
// gemm_op.get_workspace_size(args), torch::TensorOptions().dtype(torch::kUInt8).device(mat_a.device()));
// auto can_implement = gemm_op.can_implement(args);
check_cutlass_status(gemm_op.can_implement(args));
// TORCH_CHECK(
// can_implement == cutlass::Status::kSuccess,
// "gemm cannot implement, error: ",
// cutlassGetStatusString(can_implement));
auto status = gemm_op(args, nullptr, (cudaStream_t)stream);
check_cutlass_status(status);
// TORCH_CHECK(status == cutlass::Status::kSuccess, "gemm executioin failed, error: ", cutlassGetStatusString(status));
}
template <typename ElementOutput, typename ArchTag, typename InstructionShape>
......
#ifdef ENABLE_CUTLASS_API
#include "../../../devices/nvidia/nvidia_handle.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#ifdef ENABLE_CUTLASS_API
#include "int8_gemm_kernel.cuh"
#endif
#include "../cuda/per_channel_dequant_int8.cuh"
#include "int8_gemm_nvidia.cuh"
// Post-processing kernel entry (bias variant): wraps postSymKernel, which
// converts the int32 GEMM accumulator y_packed into the Tdata output y using
// x_scale / w_scale and adds bias. Launched by launchKernel with a 2D grid
// covering the M x N output (see launchKernel below).
// NOTE(review): the exact per-element dequant formula lives in postSymKernel
// (not visible here) — confirm scale layout (per-row x_scale, per-col w_scale).
template <typename Tdata>
INFINIOP_CUDA_KERNEL postSym(
    Tdata *y, int32_t *y_packed, const Tdata *bias, const int8_t *x_packed, const float *x_scale, const int8_t *w_packed, const float *w_scale, int M, int K, int N) {
    postSymKernel<Tdata>(y, y_packed, bias, x_packed, x_scale, w_packed, w_scale, M, K, N);
}
// Post-processing kernel entry (no-bias variant): same as the overload above
// but forwards to the bias-less postSymKernel overload. Selected by
// launchKernel when bias == nullptr.
template <typename Tdata>
INFINIOP_CUDA_KERNEL postSym(
    Tdata *y, int32_t *y_packed, const int8_t *x_packed, const float *x_scale, const int8_t *w_packed, const float *w_scale, int M, int K, int N) {
    postSymKernel<Tdata>(y, y_packed, x_packed, x_scale, w_packed, w_scale, M, K, N);
}
namespace op::i8gemm::nvidia {
struct Descriptor::Opaque {
......@@ -14,6 +28,7 @@ Descriptor::~Descriptor() {
delete _opaque;
}
#ifdef ENABLE_NVIDIA_API
inline int getSMVersion() {
int device{-1};
CHECK_CUDA(cudaGetDevice(&device));
......@@ -23,6 +38,7 @@ inline int getSMVersion() {
CHECK_CUDA(cudaDeviceGetAttribute(&sm_minor, cudaDevAttrComputeCapabilityMinor, device));
return sm_major * 10 + sm_minor;
}
#endif
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
......@@ -40,14 +56,65 @@ infiniStatus_t Descriptor::create(
auto result = I8GemmInfo::create(out_desc, a_desc, b_desc, MatrixLayout::COL_MAJOR);
CHECK_RESULT(result);
size_t workspace_size = out_desc->dim(0) * out_desc->dim(1) * sizeof(int32_t);
*desc_ptr = new Descriptor(
new Opaque{handle->internal()},
result.take(), 0, dtype,
result.take(), workspace_size, dtype,
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
#ifdef ENABLE_QY_API
// QY-path int8 GEMM: runs a cuBLAS int8 x int8 -> int32 GEMM into the
// caller-provided workspace (y_packed), then launches postSym to produce the
// Tdata output y from y_packed using x_scale / w_scale (and bias when
// non-null).
// NOTE(review): the BLOCK_SIZE template parameter is unused — a fixed 32x32
// thread block is launched below; confirm whether it is still needed.
// NOTE(review): no cudaGetLastError() after the postSym launch, so
// launch-configuration errors would go unnoticed here — confirm intentional.
template <unsigned int BLOCK_SIZE, typename Tdata>
infiniStatus_t Descriptor::launchKernel(const I8GemmInfo &info, Tdata *y, const Tdata *bias, const int8_t *x_packed, const float *x_scale, const int8_t *w_packed, const float *w_scale, void *stream_, void *workspace) const {
    cudaStream_t stream = (cudaStream_t)stream_;
    int M = (int)info.m;
    int K = (int)info.k;
    int N = (int)info.n;
    // The int32 accumulator (M x N) lives at the start of the workspace.
    char *workspace_ptr = reinterpret_cast<char *>(workspace);
    int32_t *y_packed = reinterpret_cast<int32_t *>(workspace_ptr);
    // Pure integer GEMM: y_packed = 1 * (w^T x) + 0, accumulated in int32.
    const int32_t alpha_I = 1;
    const int32_t beta_I = 0;
    int lda = K; // w_packed is column-major [K, N]
    int ldb = K; // x_packed is row-major [M, K]
    int ldc = N; // y_packed is row-major [M, N]
    CHECK_STATUS(this->_opaque->internal->useCublas(
        stream,
        [&](cublasHandle_t handle) {
            // cuBLAS is column-major; computing w^T * x here yields the
            // row-major [M, N] result without an extra transpose pass.
            CHECK_CUBLAS(cublasGemmEx(
                handle,
                CUBLAS_OP_T, // A = w_packed^T : [N, K]
                CUBLAS_OP_N, // B = x_packed^T viewed column-major : [K, M]
                N,           // m
                M,           // n
                K,           // k
                &alpha_I,
                w_packed, CUDA_R_8I, lda,
                x_packed, CUDA_R_8I, ldb,
                &beta_I,
                y_packed, CUDA_R_32I, ldc,
                CUBLAS_COMPUTE_32I,
                CUBLAS_GEMM_DEFAULT));
            return INFINI_STATUS_SUCCESS;
        }));
    // Dequant pass: one thread per element of the M x N output.
    constexpr unsigned int BLOCK_SIZE_x = 32;
    constexpr unsigned int BLOCK_SIZE_y = 32;
    int num_block_x = (N + BLOCK_SIZE_x - 1) / BLOCK_SIZE_x;
    int num_block_y = (M + BLOCK_SIZE_y - 1) / BLOCK_SIZE_y;
    dim3 block_dim(BLOCK_SIZE_x, BLOCK_SIZE_y, 1);
    dim3 grid_dim(num_block_x, num_block_y, 1);
    if (bias == nullptr) {
        postSym<Tdata><<<grid_dim, block_dim, 0, stream>>>(y, y_packed, x_packed, x_scale, w_packed, w_scale, M, K, N);
    } else {
        postSym<Tdata><<<grid_dim, block_dim, 0, stream>>>(y, y_packed, bias, x_packed, x_scale, w_packed, w_scale, M, K, N);
    }
    return INFINI_STATUS_SUCCESS;
}
#endif
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
......@@ -58,6 +125,7 @@ infiniStatus_t Descriptor::calculate(
const void *b,
const void *b_scale,
void *stream) const {
#if defined(ENABLE_NVIDIA_API) && defined(ENABLE_CUTLASS_API)
auto sm_version = getSMVersion();
if (sm_version >= 75 && sm_version < 80) {
CHECK_DTYPE(this->_out_dtype, INFINI_DTYPE_F16);
......@@ -111,7 +179,30 @@ infiniStatus_t Descriptor::calculate(
} else {
return INFINI_STATUS_NOT_IMPLEMENTED;
}
#elif defined ENABLE_QY_API
#define CALCULATE_LINEAR(BLOCK_SIZE, TDATA) \
launchKernel<BLOCK_SIZE, TDATA>(_info, (TDATA *)out, (const TDATA *)bias, (const int8_t *)a, (const float *)a_scale, (const int8_t *)b, (const float *)b_scale, stream, workspace)
#define CALCULATE_LINEAR_WITH_BLOCK_SIZE(BLOCK_SIZE) \
{ \
if (this->_out_dtype == INFINI_DTYPE_F16) \
return CALCULATE_LINEAR(BLOCK_SIZE, half); \
else if (this->_out_dtype == INFINI_DTYPE_F32) \
return CALCULATE_LINEAR(BLOCK_SIZE, float); \
else if (this->_out_dtype == INFINI_DTYPE_BF16) \
return CALCULATE_LINEAR(BLOCK_SIZE, __nv_bfloat16); \
else \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
CALCULATE_LINEAR_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024)
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
CALCULATE_LINEAR_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512)
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
CALCULATE_LINEAR_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
#endif
return INFINI_STATUS_SUCCESS;
}
} // namespace op::i8gemm::nvidia
#endif
\ No newline at end of file
......@@ -2,10 +2,14 @@
#include "../../handle.h"
#include "infiniop/ops/int8_gemm.h"
#if defined(ENABLE_NVIDIA_API) && defined(ENABLE_CUTLASS_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
#include "nvidia/int8_gemm_nvidia.cuh"
#endif
#if defined(ENABLE_MOORE_API)
#include "moore/int8_gemm_moore.h"
#endif
__C infiniStatus_t infiniopCreateI8GemmDescriptor(infiniopHandle_t handle,
infiniopI8GemmDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t out_desc,
......@@ -26,8 +30,14 @@ __C infiniStatus_t infiniopCreateI8GemmDescriptor(infiniopHandle_t handle,
b_desc, \
b_scale_desc);
switch (handle->device) {
#if defined(ENABLE_NVIDIA_API) && defined(ENABLE_CUTLASS_API)
#if defined(ENABLE_NVIDIA_API)
CREATE(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#if defined(ENABLE_QY_API)
CREATE(INFINI_DEVICE_QY, nvidia)
#endif
#if defined(ENABLE_MOORE_API)
CREATE(INFINI_DEVICE_MOORE, moore)
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -41,8 +51,14 @@ __C infiniStatus_t infiniopGetI8GemmWorkspaceSize(infiniopI8GemmDescriptor_t des
case CASE: \
*size = reinterpret_cast<op::i8gemm::NAMESPACE::Descriptor *>(desc)->minWorkspaceSize(); \
return INFINI_STATUS_SUCCESS;
#if defined(ENABLE_NVIDIA_API) && defined(ENABLE_CUTLASS_API)
#if defined(ENABLE_NVIDIA_API)
GET(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#if defined(ENABLE_QY_API)
GET(INFINI_DEVICE_QY, nvidia)
#endif
#if defined(ENABLE_MOORE_API)
GET(INFINI_DEVICE_MOORE, moore)
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -65,8 +81,14 @@ __C infiniStatus_t infiniopI8Gemm(infiniopI8GemmDescriptor_t desc,
return reinterpret_cast<op::i8gemm::NAMESPACE::Descriptor *>(desc)->calculate( \
workspace, workspace_size, out, bias, a, a_scale, b, b_scale, stream);
switch (desc->device_type) {
#if defined(ENABLE_NVIDIA_API) && defined(ENABLE_CUTLASS_API)
#if defined(ENABLE_NVIDIA_API)
CACULATE(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#if defined(ENABLE_QY_API)
CACULATE(INFINI_DEVICE_QY, nvidia)
#endif
#if defined(ENABLE_MOORE_API)
CACULATE(INFINI_DEVICE_MOORE, moore)
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -80,8 +102,14 @@ __C infiniStatus_t infiniopDestroyI8GemmDescriptor(infiniopI8GemmDescriptor_t de
delete reinterpret_cast<op::i8gemm::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#if defined(ENABLE_NVIDIA_API) && defined(ENABLE_CUTLASS_API)
#if defined(ENABLE_NVIDIA_API)
DESTROY(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#if defined(ENABLE_QY_API)
DESTROY(INFINI_DEVICE_QY, nvidia)
#endif
#if defined(ENABLE_MOORE_API)
DESTROY(INFINI_DEVICE_MOORE, moore)
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/sigmoid_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/sigmoid_nvidia.cuh"
#endif
......@@ -34,6 +34,12 @@ __C infiniStatus_t infiniopCreateSigmoidDescriptor(
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_ALI_API
CREATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -58,6 +64,12 @@ __C infiniStatus_t infiniopGetSigmoidWorkspaceSize(infiniopSigmoidDescriptor_t d
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_ALI_API
GET(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -91,7 +103,12 @@ __C infiniStatus_t infiniopSigmoid(
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_ALI_API
CALCULATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
......@@ -118,7 +135,12 @@ infiniopDestroySigmoidDescriptor(infiniopSigmoidDescriptor_t desc) {
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_ALI_API
DELETE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
......
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/silu_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_ALI_API)
#include "nvidia/silu_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
......@@ -46,6 +46,9 @@ __C infiniStatus_t infiniopCreateSiluDescriptor(
#ifdef ENABLE_MOORE_API
CREATE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_ALI_API
CREATE(INFINI_DEVICE_ALI, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -77,6 +80,10 @@ __C infiniStatus_t infiniopGetSiluWorkspaceSize(infiniopSiluDescriptor_t desc, s
#ifdef ENABLE_MOORE_API
GET(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_ALI_API
GET(INFINI_DEVICE_ALI, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
......@@ -115,6 +122,9 @@ __C infiniStatus_t infiniopSilu(
#ifdef ENABLE_MOORE_API
CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_ALI_API
CALCULATE(INFINI_DEVICE_ALI, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -148,6 +158,9 @@ infiniopDestroySiluDescriptor(infiniopSiluDescriptor_t desc) {
#ifdef ENABLE_MOORE_API
DELETE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_ALI_API
DELETE(INFINI_DEVICE_ALI, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
#ifndef __SILU_AND_MUL_INFO_H__
#define __SILU_AND_MUL_INFO_H__
#include "../../../utils.h"
#include "../../tensor.h"
#include <vector>
namespace op::silu_and_mul {
// Validated launch parameters for the silu_and_mul op.
class SiluAndMulInfo {
    SiluAndMulInfo() = default;

public:
    infiniDtype_t dtype;    // element dtype of the output tensor y
    size_t batch_size;      // product of all leading (non-last) dims
    size_t out_hidden_dim;  // size of y's last dim; x's last dim is 2x this

    // Validates the y/x descriptor pair and packs the launch info.
    // Requirements: x and y have the same ndim (>= 1), identical leading
    // dims, and x's last dim == 2 * y's last dim.
    // Returns BAD_PARAM / BAD_TENSOR_SHAPE statuses on violation.
    static utils::Result<SiluAndMulInfo> create(infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc) {
        auto dtype = y_desc->dtype();
        auto x_shape = x_desc->shape();
        auto y_shape = y_desc->shape();
        auto ndim = x_desc->ndim();
        if (ndim != y_desc->ndim()) {
            return INFINI_STATUS_BAD_PARAM;
        }
        // Bug fix: reject 0-d tensors before indexing shape[ndim - 1],
        // which would otherwise underflow the unsigned index.
        if (ndim == 0) {
            return INFINI_STATUS_BAD_TENSOR_SHAPE;
        }
        if (x_shape[ndim - 1] != 2 * y_shape[ndim - 1]) {
            return INFINI_STATUS_BAD_TENSOR_SHAPE;
        }
        size_t batch = 1;
        for (int i = 0; i < (int)ndim - 1; ++i) {
            if (x_shape[i] != y_shape[i]) {
                return INFINI_STATUS_BAD_TENSOR_SHAPE;
            }
            batch *= y_shape[i];
        }
        return utils::Result<SiluAndMulInfo>(SiluAndMulInfo{
            dtype,
            batch,
            y_shape[ndim - 1]});
    }

private:
    SiluAndMulInfo(infiniDtype_t dtype, size_t batch, size_t hidden)
        : dtype(dtype), batch_size(batch), out_hidden_dim(hidden) {}
};
} // namespace op::silu_and_mul
#endif // __SILU_AND_MUL_INFO_H__
#ifndef __SILU_ADN_MUL_MOORE_API_H__
#define __SILU_ADN_MUL_MOORE_API_H__
#include "../silu_and_mul.h"
DESCRIPTOR(moore)
#endif // __SILU_ADN_MUL_MOORE_API_H__
#include "../../../devices/moore/moore_common.h"
#include "../../../devices/moore/moore_handle.h"
#include "silu_and_mul_moore.h"
#include <musa_bf16.h>
#include <memory>
namespace op::silu_and_mul::moore {
// Backend-private state for the Moore descriptor: holds the shared handle
// internals used to obtain the muDNN handle at calculate() time.
struct Descriptor::Opaque {
    std::shared_ptr<device::moore::Handle::Internal> internal;
};
// Releases the backend-private state owned by this descriptor.
Descriptor::~Descriptor() {
    delete _opaque;
}
// Builds a Moore silu_and_mul descriptor.
// Validates the output pointer, checks that both tensors share a supported
// dtype (F16 / F32 / BF16), derives the launch info from the shapes, and
// hands ownership of the new descriptor to the caller via desc_ptr.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc) {
    if (desc_ptr == nullptr) {
        return INFINI_STATUS_BAD_PARAM;
    }

    auto moore_handle = reinterpret_cast<device::moore::Handle *>(handle_);

    // Output dtype drives the computation; input must match it exactly.
    auto out_dtype = y_desc->dtype();
    CHECK_DTYPE(out_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
    if (x_desc->dtype() != out_dtype) {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

    // Shape validation + batch/hidden extraction happens in the info factory.
    auto info_result = SiluAndMulInfo::create(y_desc, x_desc);
    CHECK_RESULT(info_result);

    *desc_ptr = new Descriptor(
        new Opaque{moore_handle->internal()},
        info_result.take(),
        /*workspace_size=*/0,
        moore_handle->device, moore_handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// Runs SwiGLU through muDNN for one concrete element type T.
// Computes y = swiglu(x) where each input row of length 2*h is split by
// muDNN into 'gate' (first h elements) and 'up' (last h elements) halves.
// NOTE(review): the results of SetType/SetAddr/SetNdInfo and swiglu.Run are
// ignored — confirm whether muDNN reports failures through these returns.
template <typename T>
infiniStatus_t calculate_impl(
    const SiluAndMulInfo &info,
    std::shared_ptr<device::moore::Handle::Internal> &internal,
    void *y,
    const void *x,
    void *stream) {
    return internal->useMudnn(
        (musaStream_t)stream,
        [&](::musa::dnn::Handle &mudnn_handle) -> infiniStatus_t {
            ::musa::dnn::Tensor x_t, y_t;
            // Map the template type onto the matching muDNN tensor type.
            if constexpr (std::is_same_v<T, half>) {
                x_t.SetType(::musa::dnn::Tensor::Type::HALF);
                y_t.SetType(::musa::dnn::Tensor::Type::HALF);
            } else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
                x_t.SetType(::musa::dnn::Tensor::Type::BFLOAT16);
                y_t.SetType(::musa::dnn::Tensor::Type::BFLOAT16);
            } else {
                x_t.SetType(::musa::dnn::Tensor::Type::FLOAT);
                y_t.SetType(::musa::dnn::Tensor::Type::FLOAT);
            }
            x_t.SetAddr(const_cast<void *>(x));
            y_t.SetAddr(y);
            // --- Construct 2D dimension information ---
            // Explicitly distinguish between Batch and Hidden dimensions
            int64_t b = static_cast<int64_t>(info.batch_size);
            int64_t h = static_cast<int64_t>(info.out_hidden_dim);
            // Input x logical shape is [batch, 2 * hidden]
            std::array<int64_t, 2> x_dims = {b, h * 2};
            std::array<int64_t, 2> x_strides = {h * 2, 1};
            // Output y logical shape is [batch, hidden]
            std::array<int64_t, 2> y_dims = {b, h};
            std::array<int64_t, 2> y_strides = {h, 1};
            x_t.SetNdInfo(2, x_dims.data(), x_strides.data());
            y_t.SetNdInfo(2, y_dims.data(), y_strides.data());
            // Invoke muDNN SwiGLU
            // muDNN will split each row (length 2*h) internally,
            // muDNN treats the first h elements of input x as the 'gate'
            // and the following h elements as the 'up' projection.
            ::musa::dnn::SwiGlu swiglu;
            swiglu.Run(mudnn_handle, y_t, x_t);
            return INFINI_STATUS_SUCCESS;
        });
}
// Dispatches the SwiGLU computation to the typed muDNN implementation that
// matches the dtype recorded at descriptor-creation time.
// workspace / workspace_size are accepted for interface uniformity and are
// not used by this backend.
infiniStatus_t Descriptor::calculate(
    void *workspace, size_t workspace_size,
    void *y, const void *x,
    void *stream) const {
    const infiniDtype_t dt = _info.dtype;
    if (dt == INFINI_DTYPE_F16) {
        return calculate_impl<half>(_info, _opaque->internal, y, x, stream);
    }
    if (dt == INFINI_DTYPE_F32) {
        return calculate_impl<float>(_info, _opaque->internal, y, x, stream);
    }
    if (dt == INFINI_DTYPE_BF16) {
        return calculate_impl<__mt_bfloat16>(_info, _opaque->internal, y, x, stream);
    }
    return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} // namespace op::silu_and_mul::moore
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/silu_and_mul.h"
#ifdef ENABLE_MOORE_API
#include "moore/silu_and_mul_moore.h"
#endif
// C ABI entry point: creates a SiluAndMul descriptor with the backend that
// matches the handle's device type.
__C infiniStatus_t infiniopCreateSiluAndMulDescriptor(
    infiniopHandle_t handle,
    infiniopSiluAndMulDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc) {

// Expands to one switch case forwarding to the backend's create().
#define CREATE_CASE(CASE, NAMESPACE)                                                \
    case CASE:                                                                      \
        return op::silu_and_mul::NAMESPACE::Descriptor::create(                     \
            handle,                                                                 \
            reinterpret_cast<op::silu_and_mul::NAMESPACE::Descriptor **>(desc_ptr), \
            y_desc,                                                                 \
            x_desc);

    switch (handle->device) {
#ifdef ENABLE_MOORE_API
        CREATE_CASE(INFINI_DEVICE_MOORE, moore);
#endif
    }

#undef CREATE_CASE
    // No enabled backend handles this device type.
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// C ABI entry point: reports the workspace size required by the descriptor's
// backend implementation.
__C infiniStatus_t infiniopGetSiluAndMulWorkspaceSize(infiniopSiluAndMulDescriptor_t desc, size_t *size) {

// Expands to one switch case reading the backend descriptor's workspace size.
#define GET_CASE(CASE, NAMESPACE)                                                                     \
    case CASE:                                                                                        \
        *size = reinterpret_cast<const op::silu_and_mul::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_MOORE_API
        GET_CASE(INFINI_DEVICE_MOORE, moore);
#endif
    }

#undef GET_CASE
    // No enabled backend handles this device type.
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// C ABI entry point: runs silu_and_mul (y = silu_and_mul(x)) through the
// backend that created the descriptor.
__C infiniStatus_t infiniopSiluAndMul(
    infiniopSiluAndMulDescriptor_t desc,
    void *workspace, size_t workspace_size,
    void *y,
    const void *x,
    void *stream) {

// Expands to one switch case forwarding to the backend's calculate().
#define CALCULATE_CASE(CASE, NAMESPACE)                                                        \
    case CASE:                                                                                 \
        return reinterpret_cast<const op::silu_and_mul::NAMESPACE::Descriptor *>(desc)->calculate( \
            workspace, workspace_size, y, x, stream);

    switch (desc->device_type) {
#ifdef ENABLE_MOORE_API
        CALCULATE_CASE(INFINI_DEVICE_MOORE, moore);
#endif
    }

#undef CALCULATE_CASE
    // No enabled backend handles this device type.
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// C ABI entry point: destroys a SiluAndMul descriptor via the backend that
// created it (runs the backend destructor and frees its opaque state).
__C infiniStatus_t infiniopDestroySiluAndMulDescriptor(infiniopSiluAndMulDescriptor_t desc) {

// Expands to one switch case deleting the backend's concrete descriptor.
#define DESTROY_CASE(CASE, NAMESPACE)                                              \
    case CASE:                                                                     \
        delete reinterpret_cast<const op::silu_and_mul::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_MOORE_API
        DESTROY_CASE(INFINI_DEVICE_MOORE, moore);
#endif
    }

#undef DESTROY_CASE
    // No enabled backend handles this device type.
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#ifndef SILU_AND_MUL_H
#define SILU_AND_MUL_H
#include "../../operator.h"
#include "info.h"
// Generates the per-backend Descriptor class for silu_and_mul inside
// namespace op::silu_and_mul::<NAMESPACE>. Each backend header invokes this
// macro and then supplies out-of-line definitions for ~Descriptor(),
// create() and calculate(). `_opaque` holds backend-private state (freed by
// the backend's ~Descriptor), `_info` is the validated SiluAndMulInfo, and
// `_workspace_size` is what workspaceSize() reports to callers.
// (Comments are kept outside the macro body: a // comment before a
// line-continuation backslash would swallow the continuation.)
#define DESCRIPTOR(NAMESPACE) \
                              \
    namespace op::silu_and_mul::NAMESPACE { \
    class Descriptor final : public InfiniopDescriptor { \
        struct Opaque; \
        Opaque *_opaque; \
        SiluAndMulInfo _info; \
        size_t _workspace_size; \
                              \
        Descriptor( \
            Opaque *opaque, \
            SiluAndMulInfo info, \
            size_t workspace_size, \
            infiniDevice_t device_type, \
            int device_id) \
            : InfiniopDescriptor{device_type, device_id}, \
              _opaque(opaque), \
              _info(info), \
              _workspace_size(workspace_size) {} \
                              \
    public: \
        ~Descriptor(); \
                              \
        size_t workspaceSize() const { return _workspace_size; } \
                              \
        static infiniStatus_t create( \
            infiniopHandle_t handle, \
            Descriptor **desc_ptr, \
            infiniopTensorDescriptor_t y_desc, \
            infiniopTensorDescriptor_t x_desc); \
                              \
        infiniStatus_t calculate( \
            void *workspace, size_t workspace_size, \
            void *y, \
            const void *x, \
            void *stream) const; \
    }; \
    }
#endif // SILU_AND_MUL_H
......@@ -128,6 +128,9 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
y, x, _info.dtype, _info.othersize, _info.dimsize, _info.stride, stream));
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_2048>(
y, x, _info.dtype, _info.othersize, _info.dimsize, _info.stride, stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
......
......@@ -2,7 +2,7 @@
#include "../../handle.h"
#include "infiniop/ops/softmax.h"
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API)
#include "nvidia/softmax_nvidia.cuh"
#endif
......@@ -33,6 +33,9 @@ __C infiniStatus_t infiniopCreateSoftmaxDescriptor(
#endif
#ifdef ENABLE_HYGON_API
CREATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_ALI_API
CREATE(INFINI_DEVICE_ALI, nvidia);
#endif
}
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -57,6 +60,9 @@ __C infiniStatus_t infiniopGetSoftmaxWorkspaceSize(infiniopSoftmaxDescriptor_t d
#endif
#ifdef ENABLE_HYGON_API
GET(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_ALI_API
GET(INFINI_DEVICE_ALI, nvidia);
#endif
}
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -86,6 +92,9 @@ __C infiniStatus_t infiniopSoftmax(
#endif
#ifdef ENABLE_HYGON_API
CALCULATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_ALI_API
CALCULATE(INFINI_DEVICE_ALI, nvidia);
#endif
}
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -110,6 +119,9 @@ __C infiniStatus_t infiniopDestroySoftmaxDescriptor(infiniopSoftmaxDescriptor_t
#endif
#ifdef ENABLE_HYGON_API
DESTROY(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_ALI_API
DESTROY(INFINI_DEVICE_ALI, nvidia);
#endif
}
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/softplus_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
#include "nvidia/softplus_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
......@@ -49,6 +49,10 @@ __C infiniStatus_t infiniopCreateSoftplusDescriptor(
#ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_ALI_API
CREATE(INFINI_DEVICE_ALI, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
......@@ -82,6 +86,10 @@ __C infiniStatus_t infiniopGetSoftplusWorkspaceSize(infiniopSoftplusDescriptor_t
#ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_ALI_API
GET(INFINI_DEVICE_ALI, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
......@@ -123,6 +131,10 @@ __C infiniStatus_t infiniopSoftplus(
#ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_ALI_API
CALCULATE(INFINI_DEVICE_ALI, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
......@@ -158,6 +170,10 @@ infiniopDestroySoftplusDescriptor(infiniopSoftplusDescriptor_t desc) {
#ifdef ENABLE_KUNLUN_API
DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_ALI_API
DELETE(INFINI_DEVICE_ALI, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
......
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/sub_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
#include "nvidia/sub_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
......@@ -51,6 +51,9 @@ __C infiniStatus_t infiniopCreateSubDescriptor(
#ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_ALI_API
CREATE(INFINI_DEVICE_ALI, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -85,6 +88,9 @@ __C infiniStatus_t infiniopGetSubWorkspaceSize(infiniopSubDescriptor_t desc, siz
#ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_ALI_API
GET(INFINI_DEVICE_ALI, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -128,6 +134,9 @@ __C infiniStatus_t infiniopSub(
#ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_ALI_API
CALCULATE(INFINI_DEVICE_ALI, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -164,6 +173,9 @@ infiniopDestroySubDescriptor(infiniopSubDescriptor_t desc) {
#ifdef ENABLE_KUNLUN_API
DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_ALI_API
DELETE(INFINI_DEVICE_ALI, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
import ninetoothed
from . import swiglu
import infiniop.ninetoothed.build
def build():
    """Pre-build the ninetoothed SwiGLU kernel for every supported
    (ndim, dtype) combination, writing outputs to the shared build dir."""
    max_ndim = 5
    # One kernel variant per (rank, dtype); block_size is fixed at 1024.
    param_grid = {
        "ndim": range(1, max_ndim + 1),
        "dtype": (
            ninetoothed.float16,
            ninetoothed.bfloat16,
            ninetoothed.float32,
        ),
        "block_size": (1024,),
    }
    infiniop.ninetoothed.build.build(
        swiglu.premake,
        param_grid,
        caller="cuda",
        op_name="swiglu",
        output_dir=infiniop.ninetoothed.build.BUILD_DIRECTORY_PATH,
    )
#ifndef SWIGLU_H
#define SWIGLU_H
#include "../../../handle.h"
#include "../../../operator.h"
#include "../../../tensor.h"
#include "../../../../../build/ninetoothed/swiglu.h"
#include "../../../ninetoothed/utils.h"
namespace op::swiglu::ninetoothed {
// ninetoothed-backed SwiGLU descriptor: captures shapes/strides of the
// output and of the two inputs (inputs[0] = up projection, inputs[1] = gate)
// at creation time and launches the pre-built ninetoothed kernel.
class Descriptor final : public InfiniopDescriptor {
public:
    // Records tensor metadata; assumes input_desc_vec has (at least) two
    // entries — enforced by create() below.
    Descriptor(
        infiniopHandle_t handle,
        infiniopTensorDescriptor_t out_desc,
        std::vector<infiniopTensorDescriptor_t> input_desc_vec) : InfiniopDescriptor{handle->device, handle->device_id},
                                                                  out_shape_{out_desc->shape()},
                                                                  out_strides_{out_desc->strides()},
                                                                  up_shape_{input_desc_vec[0]->shape()},
                                                                  up_strides_{input_desc_vec[0]->strides()},
                                                                  gate_shape_{input_desc_vec[1]->shape()},
                                                                  gate_strides_{input_desc_vec[1]->strides()},
                                                                  dtype_{out_desc->dtype()} {}

    ~Descriptor() = default;

    // This backend needs no scratch memory.
    size_t workspaceSize() const {
        return 0;
    }

    // Factory: validates arguments and hands a new descriptor to the caller.
    static infiniStatus_t create(
        infiniopHandle_t handle,
        Descriptor **desc_ptr,
        infiniopTensorDescriptor_t out_desc,
        std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
        // Bug fix: guard before the constructor indexes input_desc_vec[0]
        // and [1] (previously unchecked — UB on a short vector), and reject
        // a null output pointer like the sibling backends do.
        if (!desc_ptr || !out_desc || input_desc_vec.size() < 2) {
            return INFINI_STATUS_BAD_PARAM;
        }
        *desc_ptr = new Descriptor(handle, out_desc, input_desc_vec);
        return INFINI_STATUS_SUCCESS;
    }

    // Launches the pre-built ninetoothed SwiGLU kernel:
    // output = silu(gate) * up, with inputs[0] = up and inputs[1] = gate.
    infiniStatus_t calculate(
        void *workspace,
        size_t workspace_size,
        void *output,
        std::vector<const void *> inputs,
        void *stream) const {
        auto out_nt{::ninetoothed::Tensor(output, out_shape_, out_strides_)};
        auto up_nt{::ninetoothed::Tensor(inputs[0], up_shape_, up_strides_)};
        auto gate_nt{::ninetoothed::Tensor(inputs[1], gate_shape_, gate_strides_)};
        // launch_swiglu returns non-zero when no pre-built variant matches
        // the requested rank/dtype/block size.
        if (launch_swiglu(stream,
                          out_nt,
                          up_nt,
                          gate_nt,
                          out_shape_.size(),
                          dtype_,
                          1024)) {
            return INFINI_STATUS_NOT_IMPLEMENTED;
        }
        return INFINI_STATUS_SUCCESS;
    }

private:
    using Size = ::ninetoothed::Tensor<>::Size;
    using Stride = ::ninetoothed::Tensor<>::Stride;
    std::vector<Size> out_shape_;
    std::vector<Stride> out_strides_;
    std::vector<Size> up_shape_;
    std::vector<Stride> up_strides_;
    std::vector<Size> gate_shape_;
    std::vector<Stride> gate_strides_;
    infiniDtype_t dtype_;
};
} // namespace op::swiglu::ninetoothed
#endif // SWIGLU_H
import functools
import ninetoothed.language as ntl
from ninetoothed import Tensor
from ntops.kernels.element_wise import arrangement
def application(output, up, gate):
    # SwiGLU element-wise body: output = silu(gate) * up, with the sigmoid
    # computed in float32 for accuracy.
    # NOTE(review): the assignment to `output` looks dead in plain Python
    # (hence the noqa), but ninetoothed appears to compile this function
    # symbolically, where writing to the `output` parameter is the store —
    # do not "simplify" this away; confirm against the ninetoothed docs.
    output = ntl.sigmoid(ntl.cast(gate, ntl.float32)) * gate * up  # noqa: F841
def premake(ndim, dtype=None, block_size=None):
    """Return the (arrangement, application, tensors) triple that the
    ninetoothed build pipeline needs to compile the SwiGLU kernel for the
    given tensor rank, dtype, and block size."""
    bound_arrangement = functools.partial(arrangement, block_size=block_size)
    # One symbolic tensor per kernel argument: output, up, gate.
    tensors = tuple(Tensor(ndim, dtype=dtype) for _ in range(3))
    return bound_arrangement, application, tensors
......@@ -5,15 +5,23 @@
#ifdef ENABLE_CPU_API
#include "cpu/swiglu_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API)
#if defined(ENABLE_NINETOOTHED)
#include "ninetoothed/swiglu.h"
#else
#include "nvidia/swiglu_nvidia.cuh"
#endif
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/swiglu_kunlun.h"
#endif
#ifdef ENABLE_METAX_API
#if defined(ENABLE_NINETOOTHED)
#include "ninetoothed/swiglu.h"
#else
#include "metax/swiglu_metax.h"
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
#include "bang/swiglu_bang.h"
#endif
......@@ -46,11 +54,22 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
CREATE(INFINI_DEVICE_NVIDIA, ninetoothed);
#else
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
CREATE(INFINI_DEVICE_ILUVATAR, ninetoothed);
#else
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#endif
#ifdef ENABLE_ALI_API
CREATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
......@@ -61,8 +80,12 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
CREATE(INFINI_DEVICE_METAX, ninetoothed);
#else
CREATE(INFINI_DEVICE_METAX, metax);
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
CREATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
......@@ -92,11 +115,22 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
GET(INFINI_DEVICE_NVIDIA, ninetoothed);
#else
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
GET(INFINI_DEVICE_ILUVATAR, ninetoothed);
#else
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#endif
#ifdef ENABLE_ALI_API
GET(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
#endif
......@@ -107,8 +141,12 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
GET(INFINI_DEVICE_METAX, ninetoothed);
#else
GET(INFINI_DEVICE_METAX, metax);
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
GET(INFINI_DEVICE_CAMBRICON, bang);
#endif
......@@ -145,11 +183,22 @@ __C infiniStatus_t infiniopSwiGLU(
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
CALCULATE(INFINI_DEVICE_NVIDIA, ninetoothed);
#else
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
CALCULATE(INFINI_DEVICE_ILUVATAR, ninetoothed);
#else
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#endif
#ifdef ENABLE_ALI_API
CALCULATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
......@@ -160,8 +209,12 @@ __C infiniStatus_t infiniopSwiGLU(
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
CALCULATE(INFINI_DEVICE_METAX, ninetoothed);
#else
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
......@@ -193,11 +246,22 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
DELETE(INFINI_DEVICE_NVIDIA, ninetoothed);
#else
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
DELETE(INFINI_DEVICE_ILUVATAR, ninetoothed);
#else
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#endif
#ifdef ENABLE_ALI_API
DELETE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, nvidia);
#endif
......@@ -208,8 +272,12 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
DELETE(INFINI_DEVICE_METAX, ninetoothed);
#else
DELETE(INFINI_DEVICE_METAX, metax);
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
DELETE(INFINI_DEVICE_CAMBRICON, bang);
#endif
......
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/tanh_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
#include "nvidia/tanh_nvidia.cuh"
#endif
// #ifdef ENABLE_METAX_API
......@@ -40,6 +40,10 @@ __C infiniStatus_t infiniopCreateTanhDescriptor(
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_ALI_API
CREATE(INFINI_DEVICE_ALI, nvidia);
#endif
// #ifdef ENABLE_METAX_API
// CREATE(INFINI_DEVICE_METAX, metax);
// #endif
......@@ -71,6 +75,10 @@ __C infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, s
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_ALI_API
GET(INFINI_DEVICE_ALI, nvidia);
#endif
// #ifdef ENABLE_METAX_API
// GET(INFINI_DEVICE_METAX, metax);
// #endif
......@@ -109,6 +117,10 @@ __C infiniStatus_t infiniopTanh(
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_ALI_API
CALCULATE(INFINI_DEVICE_ALI, nvidia);
#endif
// #ifdef ENABLE_METAX_API
// CALCULATE(INFINI_DEVICE_METAX, metax);
// #endif
......@@ -142,6 +154,10 @@ infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc) {
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_ALI_API
DELETE(INFINI_DEVICE_ALI, nvidia);
#endif
// #ifdef ENABLE_METAX_API
// DELETE(INFINI_DEVICE_METAX, metax);
// #endif
......
......@@ -27,6 +27,15 @@ struct CustomLess {
}
};
// Warp-level sum reduction for the Hygon platform (used where cub::WarpReduce
// is unavailable). Butterfly (XOR-shuffle) reduction: after
// log2(warp_threads) steps every participating lane holds the total.
// Note: the loop variable `mask` is the XOR lane offset, not the sync mask.
// NOTE(review): the sync mask is hard-coded to 0xffffffff (all 32 lanes)
// even though warp_threads is a template parameter — confirm all 32 lanes
// reach this call when warp_threads < 32, otherwise the mask should cover
// only the participating lanes.
template <int warp_threads>
__inline__ __device__ float WarpSum(float val) {
    for (int mask = warp_threads / 2; mask > 0; mask /= 2) {
        val += __shfl_xor_sync(0xffffffff, val, mask);
    }
    return val;
}
template <typename T, int BLOCK_THREADS = 256>
__global__ void topkrouter_kernel(float *values_topk, // 输出数据, 形状[N, topk]
int *indices_topk, // 输出索引, 形状[N, topk]
......@@ -137,12 +146,19 @@ __global__ void topkrouter_kernel(float *values_topk, // 输出数
value = sigmoid_func(data_input[index]);
}
{
#ifdef ENABLE_HYGON_API
float warp_sum = WarpSum<warp_threads>(value);
if (0 == tid) {
share_sum = warp_sum + 1e-9f;
}
#else
typedef cub::WarpReduce<float, warp_threads> WarpReduce;
__shared__ typename WarpReduce::TempStorage temp_storage;
float warp_sum = WarpReduce(temp_storage).Sum(value);
if (0 == tid) {
share_sum = warp_sum + 1e-9f;
}
#endif
}
__syncwarp();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment