Unverified Commit 9ad23fad authored by blkmjsian, committed by GitHub

[T2-2-3] blkmjsian

- dequantize AWQ
- RoPE v2
parent b3170335
#ifndef __INFINIOP_ROPE_V2_CPU_H__
#define __INFINIOP_ROPE_V2_CPU_H__
#include "../rope_v2.h"
DESCRIPTOR(cpu)
#endif // __INFINIOP_ROPE_V2_CPU_H__
#ifndef __INFINIOP_ROPE_V2_CUDA_KERNEL_CUH__
#define __INFINIOP_ROPE_V2_CUDA_KERNEL_CUH__
template <typename Tdata, typename Tindex, typename Tangle>
__device__ void ropeThreadPerItemBlock(
Tdata *y_,
const Tdata *x_,
const Tindex *__restrict__ pos_ids,
const Tangle *__restrict__ sin_table,
const Tangle *__restrict__ cos_table,
size_t table_dim,
ptrdiff_t y_stride_seqlen,
ptrdiff_t y_stride_nhead,
ptrdiff_t x_stride_seqlen,
ptrdiff_t x_stride_nhead) {
auto y_offset = blockIdx.x * y_stride_seqlen + blockIdx.y * y_stride_nhead;
auto x_offset = blockIdx.x * x_stride_seqlen + blockIdx.y * x_stride_nhead;
size_t pos_id = size_t(pos_ids[blockIdx.x]);
auto table_offset = pos_id * table_dim;
const size_t half_dim = table_dim; // Head dimension = 2 * table_dim
for (size_t i = threadIdx.x; i < table_dim; i += blockDim.x) {
Tangle sin__ = sin_table[table_offset + i];
Tangle cos__ = cos_table[table_offset + i];
// Calculate positions in first and second halves
size_t pos0 = i;
size_t pos1 = i + half_dim;
if constexpr (std::is_same<Tdata, half>::value) {
Tangle x0 = __half2float(x_[x_offset + pos0]);
Tangle x1 = __half2float(x_[x_offset + pos1]);
Tangle y0 = x0 * cos__ - x1 * sin__;
Tangle y1 = x0 * sin__ + x1 * cos__;
y_[y_offset + pos0] = __float2half(y0);
y_[y_offset + pos1] = __float2half(y1);
} else if constexpr (std::is_same<Tdata, cuda_bfloat16>::value) {
Tangle x0 = __bfloat162float(x_[x_offset + pos0]);
Tangle x1 = __bfloat162float(x_[x_offset + pos1]);
Tangle y0 = x0 * cos__ - x1 * sin__;
Tangle y1 = x0 * sin__ + x1 * cos__;
y_[y_offset + pos0] = __float2bfloat16(y0);
y_[y_offset + pos1] = __float2bfloat16(y1);
} else {
Tangle x0 = x_[x_offset + pos0];
Tangle x1 = x_[x_offset + pos1];
y_[y_offset + pos0] = x0 * cos__ - x1 * sin__;
y_[y_offset + pos1] = x0 * sin__ + x1 * cos__;
}
}
}
#endif // __INFINIOP_ROPE_V2_CUDA_KERNEL_CUH__
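// For reference, a minimal host-side sketch of the same rotate-half transform the
// kernel above applies per (position, head) pair. The function name and the
// std::vector interface are illustrative, not part of the library.
#include <cstddef>
#include <vector>

inline void rope_rotate_half_reference(std::vector<float> &y,
                                       const std::vector<float> &x,
                                       const std::vector<float> &sin_row,  // sin_table[pos_id]
                                       const std::vector<float> &cos_row)  // cos_table[pos_id]
{
    const std::size_t half = sin_row.size(); // table_dim; dhead == 2 * table_dim
    for (std::size_t i = 0; i < half; ++i) {
        const float x0 = x[i];
        const float x1 = x[i + half];
        y[i] = x0 * cos_row[i] - x1 * sin_row[i];        // first half of the head
        y[i + half] = x0 * sin_row[i] + x1 * cos_row[i]; // second half of the head
    }
}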
#ifndef __INFINIOP_ROPE_V2_METAX_H__
#define __INFINIOP_ROPE_V2_METAX_H__
#include "../rope_v2.h"
DESCRIPTOR(metax)
#endif // __INFINIOP_ROPE_V2_METAX_H__
#include "../../../devices/metax/metax_common.h"
#include "rope_metax.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include "../cuda/kernel.cuh"
template <typename Tdata, typename Tindex, typename Tangle>
INFINIOP_METAX_KERNEL ropeThreadPerItemKernel(
Tdata *y_,
const Tdata *x_,
const Tindex *__restrict__ pos_ids,
const Tangle *__restrict__ sin_table,
const Tangle *__restrict__ cos_table,
size_t table_dim,
ptrdiff_t y_stride_seqlen,
ptrdiff_t y_stride_nhead,
ptrdiff_t x_stride_seqlen,
ptrdiff_t x_stride_nhead) {
ropeThreadPerItemBlock(
y_, x_, pos_ids,
sin_table, cos_table,
table_dim,
y_stride_seqlen, y_stride_nhead,
x_stride_seqlen, x_stride_nhead);
}
namespace op::rope_v2::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::metax::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t pos_desc,
infiniopTensorDescriptor_t sin_desc,
infiniopTensorDescriptor_t cos_desc) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto info = RoPEv2Info::createRoPEv2Info(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
CHECK_RESULT(info);
// Create descriptor
*desc_ptr = new Descriptor(
info.take(),
0,
new Opaque{handle->internal()},
handle->device,
handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <typename Tdata, typename Tindex>
infiniStatus_t calculateRoPE(const RoPEv2Info &info,
int block_size,
Tdata *y,
const Tdata *x,
const Tindex *pos_ids,
const Tdata *sin_table,
const Tdata *cos_table,
hcStream_t stream) {
// One block per (sequence position, head); threads stride over table_dim
auto dimx = uint32_t(info.seqlen),
     dimy = uint32_t(info.nhead);
int nthreads = std::min(int(info.table_dim), block_size); // never exceed the device's block limit
ropeThreadPerItemKernel<<<dim3(dimx, dimy), nthreads, 0, stream>>>(
y, x, pos_ids, sin_table, cos_table, info.table_dim,
info.y_stride_seqlen, info.y_stride_nhead, info.x_stride_seqlen, info.x_stride_nhead);
return INFINI_STATUS_SUCCESS;
}
#define CALCULATE_ROPE(TDATA, TINDEX) \
calculateRoPE(_info, \
_opaque->internal->maxThreadsPerBlock(), \
(TDATA *)y, \
(const TDATA *)x, \
(const TINDEX *)pos_ids, \
(const TDATA *)sin_table, \
(const TDATA *)cos_table, \
(hcStream_t)stream)
#define ROPE_TYPE(TDATA) \
switch (_info.pos_type) { \
case INFINI_DTYPE_U8: \
return CALCULATE_ROPE(TDATA, uint8_t); \
case INFINI_DTYPE_U16: \
return CALCULATE_ROPE(TDATA, uint16_t); \
case INFINI_DTYPE_U32: \
return CALCULATE_ROPE(TDATA, uint32_t); \
case INFINI_DTYPE_U64: \
return CALCULATE_ROPE(TDATA, uint64_t); \
case INFINI_DTYPE_I8: \
return CALCULATE_ROPE(TDATA, int8_t); \
case INFINI_DTYPE_I16: \
return CALCULATE_ROPE(TDATA, int16_t); \
case INFINI_DTYPE_I32: \
return CALCULATE_ROPE(TDATA, int32_t); \
case INFINI_DTYPE_I64: \
return CALCULATE_ROPE(TDATA, int64_t); \
default: \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *pos_ids,
const void *sin_table,
const void *cos_table,
void *stream) const {
switch (_info.data_type) {
case INFINI_DTYPE_F16:
ROPE_TYPE(half);
case INFINI_DTYPE_BF16:
ROPE_TYPE(cuda_bfloat16);
case INFINI_DTYPE_F32:
ROPE_TYPE(float);
case INFINI_DTYPE_F64:
ROPE_TYPE(double);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
#undef ROPE_TYPE
#undef CALCULATE_ROPE
} // namespace op::rope::metax
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "rope_v2_nvidia.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
namespace op::rope_v2::nvidia {
template <typename Tdata, typename Tindex, typename Tangle>
INFINIOP_CUDA_KERNEL ropev2ThreadPerItemKernel(
Tdata *y_,
const Tdata *x_,
const Tindex *__restrict__ pos_ids,
const Tangle *__restrict__ sin_table,
const Tangle *__restrict__ cos_table,
size_t table_dim,
ptrdiff_t y_stride_seqlen,
ptrdiff_t y_stride_nhead,
ptrdiff_t x_stride_seqlen,
ptrdiff_t x_stride_nhead) {
ropeThreadPerItemBlock(
y_, x_, pos_ids,
sin_table, cos_table,
table_dim,
y_stride_seqlen, y_stride_nhead,
x_stride_seqlen, x_stride_nhead);
}
struct Descriptor::Opaque {
std::shared_ptr<device::nvidia::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t pos_desc,
infiniopTensorDescriptor_t sin_desc,
infiniopTensorDescriptor_t cos_desc) {
auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
auto info = RoPEv2Info::createRoPEv2Info(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
CHECK_RESULT(info);
// Create descriptor
*desc_ptr = new Descriptor(
info.take(),
0,
new Opaque{handle->internal()},
handle->device,
handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <typename Tdata, typename Tindex>
infiniStatus_t calculateRoPEv2(const RoPEv2Info &info,
int block_size,
Tdata *y,
const Tdata *x,
const Tindex *pos_ids,
const Tdata *sin_table,
const Tdata *cos_table,
cudaStream_t stream) {
// One block per (sequence position, head); threads stride over table_dim
auto dimx = uint32_t(info.seqlen),
     dimy = uint32_t(info.nhead);
int nthreads = std::min(int(info.table_dim), block_size); // never exceed the device's block limit
ropev2ThreadPerItemKernel<<<dim3(dimx, dimy), nthreads, 0, stream>>>(
y, x, pos_ids, sin_table, cos_table, info.table_dim,
info.y_stride_seqlen, info.y_stride_nhead, info.x_stride_seqlen, info.x_stride_nhead);
return INFINI_STATUS_SUCCESS;
}
#define CALCULATE_ROPE_V2(TDATA, TINDEX) \
calculateRoPEv2(_info, \
_opaque->internal->maxThreadsPerBlock(), \
(TDATA *)y, \
(const TDATA *)x, \
(const TINDEX *)pos_ids, \
(const TDATA *)sin_table, \
(const TDATA *)cos_table, \
(cudaStream_t)stream)
#define ROPE_TYPE(TDATA) \
switch (_info.pos_type) { \
case INFINI_DTYPE_U8: \
return CALCULATE_ROPE_V2(TDATA, uint8_t); \
case INFINI_DTYPE_U16: \
return CALCULATE_ROPE_V2(TDATA, uint16_t); \
case INFINI_DTYPE_U32: \
return CALCULATE_ROPE_V2(TDATA, uint32_t); \
case INFINI_DTYPE_U64: \
return CALCULATE_ROPE_V2(TDATA, uint64_t); \
case INFINI_DTYPE_I8: \
return CALCULATE_ROPE_V2(TDATA, int8_t); \
case INFINI_DTYPE_I16: \
return CALCULATE_ROPE_V2(TDATA, int16_t); \
case INFINI_DTYPE_I32: \
return CALCULATE_ROPE_V2(TDATA, int32_t); \
case INFINI_DTYPE_I64: \
return CALCULATE_ROPE_V2(TDATA, int64_t); \
default: \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *pos_ids,
const void *sin_table,
const void *cos_table,
void *stream) const {
switch (_info.data_type) {
case INFINI_DTYPE_F16:
ROPE_TYPE(half);
case INFINI_DTYPE_BF16:
ROPE_TYPE(cuda_bfloat16);
case INFINI_DTYPE_F32:
ROPE_TYPE(float);
case INFINI_DTYPE_F64:
ROPE_TYPE(double);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
#undef ROPE_TYPE
#undef CALCULATE_ROPE_V2
} // namespace op::rope_v2::nvidia
#ifndef __INFINIOP_ROPE_V2_CUDA_H__
#define __INFINIOP_ROPE_V2_CUDA_H__
#include "../rope_v2.h"
DESCRIPTOR(nvidia)
#endif // __INFINIOP_ROPE_V2_CUDA_H__
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/rope_v2.h"
#ifdef ENABLE_CPU_API
#include "cpu/rope_v2_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/rope_v2_nvidia.cuh"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/rope_v2_ascend.h"
#endif
#ifdef ENABLE_CAMBRICON_API
#include "bang/rope_v2_bang.h"
#endif
#ifdef ENABLE_METAX_API
#include "metax/rope_v2_metax.h"
#endif
__C infiniStatus_t infiniopCreateRoPEv2Descriptor(
infiniopHandle_t handle,
infiniopRoPEv2Descriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x,
infiniopTensorDescriptor_t pos_ids,
infiniopTensorDescriptor_t sin_table,
infiniopTensorDescriptor_t cos_table) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::rope_v2::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::rope_v2::NAMESPACE::Descriptor **>(desc_ptr), \
y, \
x, \
pos_ids, \
sin_table, \
cos_table)
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_ASCEND_API
CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_CAMBRICON_API
CREATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
    // NOTE: the legacy MUSA RoPE API rotates a single tensor in place;
    // y is passed as that tensor here.
    return musaCreateRoPEDescriptor((MusaHandle_t)handle,
                                    (RoPEMusaDescriptor_t *)desc_ptr, y,
                                    pos_ids, sin_table, cos_table);
}
#endif
}
#undef CREATE
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniStatus_t infiniopGetRoPEv2WorkspaceSize(infiniopRoPEv2Descriptor_t desc,
size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<const op::rope_v2::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_CAMBRICON_API
GET(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
GET(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaGetRoPEWorkspaceSize((RoPEMusaDescriptor_t)desc, size);
}
#endif
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniStatus_t infiniopRoPEv2(
infiniopRoPEv2Descriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *pos_ids,
const void *sin_table,
const void *cos_table,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::rope_v2::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, y, x, pos_ids, sin_table, cos_table, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_CAMBRICON_API
CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
CALCULATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
    // NOTE: legacy in-place MUSA API; y is used as the single tensor argument.
    return musaRoPE((RoPEMusaDescriptor_t)desc, workspace, workspace_size,
                    y, pos_ids, sin_table, cos_table, stream);
}
#endif
}
#undef CALCULATE
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniStatus_t
infiniopDestroyRoPEv2Descriptor(infiniopRoPEv2Descriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::rope_v2::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_CAMBRICON_API
DELETE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
DELETE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaDestroyRoPEDescriptor((RoPEMusaDescriptor_t)desc);
}
#endif
}
#undef DELETE
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#ifndef __ROPE_V2_H__
#define __ROPE_V2_H__
#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::rope_v2::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
RoPEv2Info _info; \
size_t _workspace_size; \
\
Descriptor( \
RoPEv2Info info, \
size_t workspace_size_, \
Opaque *opaque, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size_) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t y_desc, \
infiniopTensorDescriptor_t x_desc, \
infiniopTensorDescriptor_t pos_desc, \
infiniopTensorDescriptor_t sin_desc, \
infiniopTensorDescriptor_t cos_desc); \
\
infiniStatus_t calculate( \
void *workspace, \
size_t workspace_size, \
void *y, \
const void *x, \
const void *pos_ids, \
const void *sin_table, \
const void *cos_table, \
void *stream) const; \
}; \
}
class RoPEv2Info {
private:
RoPEv2Info() = default;
public:
infiniDtype_t data_type, pos_type;
size_t seqlen, nhead, dhead, table_len, table_dim;
ptrdiff_t
y_stride_seqlen,
y_stride_nhead,
x_stride_seqlen,
x_stride_nhead;
static utils::Result<RoPEv2Info> createRoPEv2Info(
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t pos_desc,
infiniopTensorDescriptor_t sin_desc,
infiniopTensorDescriptor_t cos_desc) {
CHECK_OR_RETURN(
    y_desc != nullptr && x_desc != nullptr && pos_desc != nullptr && sin_desc != nullptr && cos_desc != nullptr,
    INFINI_STATUS_NULL_POINTER);
const infiniDtype_t data_type = y_desc->dtype();
const infiniDtype_t pos_type = pos_desc->dtype();
CHECK_OR_RETURN(data_type == x_desc->dtype() && data_type == sin_desc->dtype() && data_type == cos_desc->dtype(),
INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_DTYPE_ANY_INT(pos_type);
CHECK_OR_RETURN(y_desc->ndim() == 3
&& x_desc->ndim() == 3
&& pos_desc->ndim() == 1
&& sin_desc->ndim() == 2
&& cos_desc->ndim() == 2,
INFINI_STATUS_BAD_TENSOR_SHAPE);
const auto seqlen = y_desc->dim(0),
nhead = y_desc->dim(1),
dhead = y_desc->dim(2),
table_len = sin_desc->dim(0),
table_dim = sin_desc->dim(1);
CHECK_OR_RETURN(seqlen == x_desc->dim(0)
&& seqlen == pos_desc->dim(0)
&& nhead == x_desc->dim(1) && dhead == x_desc->dim(2)
&& table_len == cos_desc->dim(0) && table_dim == cos_desc->dim(1),
INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_OR_RETURN(dhead == table_dim * 2, INFINI_STATUS_BAD_TENSOR_SHAPE);
// Last dimension of x and y must be contiguous
CHECK_OR_RETURN(y_desc->stride(2) == 1 && x_desc->stride(2) == 1, INFINI_STATUS_BAD_TENSOR_STRIDES);
// sin table and cos table must be totally contiguous
CHECK_OR_RETURN(sin_desc->isContiguous() && cos_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES);
return utils::Result<RoPEv2Info>(RoPEv2Info{
data_type,
pos_type,
seqlen,
nhead,
dhead,
table_len,
table_dim,
y_desc->stride(0),
y_desc->stride(1),
x_desc->stride(0),
x_desc->stride(1),
});
}
};
#endif // __ROPE_V2_H__
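// A worked example of the shape contract enforced above, with hypothetical sizes
// chosen purely for illustration: y and x are [seqlen, nhead, dhead] with a
// contiguous last dimension, pos_ids is [seqlen] of any integer dtype, and
// sin/cos are fully contiguous [table_len, table_dim].
#include <cstddef>
namespace rope_v2_shape_example {
constexpr std::size_t seqlen = 16, nhead = 8, dhead = 128; // y, x: [16, 8, 128]
constexpr std::size_t table_len = 4096, table_dim = 64;    // sin, cos: [4096, 64]
static_assert(seqlen > 0 && nhead > 0 && table_len > 0, "dimensions must be non-zero");
static_assert(dhead == 2 * table_dim, "createRoPEv2Info requires dhead == 2 * table_dim");
} // namespace rope_v2_shape_example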
#include "topkrouter_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../../reduce/cpu/reduce.h"
namespace op::topkrouter::cpu {
Descriptor::~Descriptor() {}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t correction_bias_desc) {
return INFINI_STATUS_NOT_IMPLEMENTED;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
float *values, int *indices, void *x, float *correction_bias,
float routed_scaling_factor,
size_t topk,
void *stream) const {
return INFINI_STATUS_NOT_IMPLEMENTED;
}
} // namespace op::topkrouter::cpu
#ifndef __Topkrouter_CPU_H__
#define __Topkrouter_CPU_H__
#include "../topkrouter.h"
DESCRIPTOR(cpu)
#endif
#ifndef _Topkrouter_KERNEL_CUH__
#define _Topkrouter_KERNEL_CUH__
#include <cfloat>
#include <cub/block/block_load.cuh>
#include <cub/block/block_radix_sort.cuh>
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_store.cuh>
#include <cub/cub.cuh>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
template <typename T>
inline __device__ float exp_func(T x) {
float data;
if constexpr (std::is_same_v<T, float>) {
data = x;
} else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
data = __bfloat162float(x);
} else if constexpr (std::is_same_v<T, half>) {
data = __half2float(x);
}
return __expf(data);
}
template <typename T>
inline __device__ float sigmoid_func(T x) {
    // sigmoid(x) = 1 / (1 + exp(-x)), computed in float regardless of T
    return 1.0f / (1.0f + exp_func<T>(-x));
}
struct CustomLess {
    // Returns "greater than" so that Sort() produces a descending order
    template <typename DataType>
    __device__ bool operator()(const DataType &lhs, const DataType &rhs) {
        return lhs > rhs;
    }
};
//
// DeepSeek-style top-k expert routing
//
template <typename T, int BLOCK_THREADS = 256>
__global__ void topkrouter_kernel(float *values_topk,        // output values, shape [N, topk]
                                  int *indices_topk,         // output indices, shape [N, topk]
                                  T *input,                  // input logits, shape [N, width]
                                  float *d_correction_bias,  // correction bias, shape [width]
                                  float routed_scaling_factor, // scale applied to the normalized weights
                                  const size_t N,     // number of rows (tokens)
                                  const size_t width, // elements per row (number of experts)
                                  const size_t topk
) {
const int bid = blockIdx.x;
if (bid >= N) {
return;
}
const int tid = threadIdx.x;
const T *data_input = input + bid * width;
float *values_topk_output = values_topk + bid * topk;
int *indices_topk_output = indices_topk + bid * topk;
constexpr int warp_threads = 32;
constexpr int block_threads = BLOCK_THREADS; // kernel assumes blockDim.x == BLOCK_THREADS (256 here)
constexpr int warps_per_block = block_threads / warp_threads;
const int warp_id = tid / warp_threads;
const int lane_id = tid % warp_threads;
__shared__ float share_data[256];
__shared__ float share_data_group[8];
__shared__ float share_data_group_mask[8]; // mask of the selected (valid) groups
__shared__ float share_sum;
if (tid < 8) {
share_data_group_mask[tid] = 0.0f;
}
// ------------------------------------------------------ //
// Apply sigmoid to the input                              //
// ------------------------------------------------------ //
float value = sigmoid_func(data_input[tid]);
// ------------------------------------------------------ //
// Add the correction bias                                 //
// ------------------------------------------------------ //
value += d_correction_bias[tid];
// ------------------------------------------------------------- //
// One warp per group (8 groups); find each group's top-2 values  //
// ------------------------------------------------------------- //
float thread_values[1] = {value};
int thread_indices[1] = {tid};
using WarpMergeSortT = cub::WarpMergeSort<float, 1, warp_threads, int>;
{
__shared__ typename WarpMergeSortT::TempStorage temp_storage[warps_per_block];
WarpMergeSortT(temp_storage[warp_id]).Sort(thread_values, thread_indices, CustomLess());
}
__syncthreads();
share_data[tid] = thread_values[0];
// ------------------------------------------------------------- //
// Per-group score: sum of the top-2 values in each group         //
// ------------------------------------------------------------- //
__syncthreads();
if (0 == lane_id) {
share_data_group[warp_id] = share_data[warp_id * warp_threads] + share_data[warp_id * warp_threads + 1];
}
__syncthreads();
// ------------------------------------------------------------- //
// Select the top-4 groups                                        //
// ------------------------------------------------------------- //
if (0 == warp_id) {
float thread_values[1] = {-FLT_MAX};
int thread_indices[1] = {-1};
if (lane_id < 8) {
thread_values[0] = share_data_group[lane_id];
thread_indices[0] = lane_id;
}
__shared__ typename WarpMergeSortT::TempStorage temp_storage[1];
WarpMergeSortT(temp_storage[0]).Sort(thread_values, thread_indices, CustomLess());
if (lane_id < 4) {
int indices = thread_indices[0];
share_data_group_mask[indices] = 1.0f;
}
}
__syncthreads();
// ------------------------------------------------------------- //
// Final top-k over the group-masked values                       //
// ------------------------------------------------------------- //
value = value * share_data_group_mask[warp_id];
thread_values[0] = value;
thread_indices[0] = tid;
{
typedef cub::BlockRadixSort<float, BLOCK_THREADS, 1, int> BlockRadixSort;
__shared__ typename BlockRadixSort::TempStorage temp_storage;
BlockRadixSort(temp_storage).SortDescending(thread_values, thread_indices);
}
__syncthreads();
// ------------------------------------------------------------- //
// Normalize the selected weights                                 //
// ------------------------------------------------------------- //
if (0 == warp_id) {
value = 0.0f;
if (tid < 8) { // assumes topk == 8
int index = thread_indices[0];
value = sigmoid_func(data_input[index]);
}
typedef cub::WarpReduce<float, warp_threads> WarpReduce;
__shared__ typename WarpReduce::TempStorage temp_storage;
// partial reduction over the selected entries (threads 8..31 contribute 0)
float warp_sum = WarpReduce(temp_storage).Sum(value);
if (0 == tid) {
share_sum = warp_sum + 1e-20f; // guard against division by zero
}
__syncwarp();
if (tid < 8) { // write the topk (== 8) outputs
int index = thread_indices[0];
indices_topk_output[tid] = index;
values_topk_output[tid] = routed_scaling_factor * value / share_sum;
}
}
}
#endif // _Topkrouter_KERNEL_CUH__
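// For reference, a minimal host-side sketch of the routing math above for a single
// token, under the kernel's fixed geometry (width == 256, 8 groups of 32 experts,
// top-4 groups, topk == 8). It mirrors the kernel's multiplicative group masking
// and its normalization by the unbiased sigmoid scores; illustrative only.
#include <algorithm>
#include <array>
#include <cmath>
#include <functional>
#include <utility>

inline void topkrouter_reference(const std::array<float, 256> &logits,
                                 const std::array<float, 256> &bias,
                                 float routed_scaling_factor,
                                 std::array<float, 8> &values,
                                 std::array<int, 8> &indices) {
    std::array<float, 256> s, b;
    for (int i = 0; i < 256; ++i) {
        s[i] = 1.0f / (1.0f + std::exp(-logits[i])); // sigmoid score
        b[i] = s[i] + bias[i];                       // biased score (used for selection only)
    }
    // Rank the 8 groups by the sum of their top-2 biased scores; keep the best 4.
    std::array<std::pair<float, int>, 8> groups;
    for (int g = 0; g < 8; ++g) {
        std::array<float, 32> w;
        std::copy(b.begin() + g * 32, b.begin() + (g + 1) * 32, w.begin());
        std::partial_sort(w.begin(), w.begin() + 2, w.end(), std::greater<float>());
        groups[g] = {w[0] + w[1], g};
    }
    std::partial_sort(groups.begin(), groups.begin() + 4, groups.end(),
                      [](const auto &l, const auto &r) { return l.first > r.first; });
    std::array<float, 8> mask{};
    for (int k = 0; k < 4; ++k) {
        mask[groups[k].second] = 1.0f;
    }
    // Final top-8 over the group-masked biased scores.
    std::array<int, 256> order;
    for (int i = 0; i < 256; ++i) {
        order[i] = i;
    }
    std::partial_sort(order.begin(), order.begin() + 8, order.end(),
                      [&](int l, int r) { return b[l] * mask[l / 32] > b[r] * mask[r / 32]; });
    // Normalize the selected experts' unbiased sigmoid scores.
    float sum = 1e-20f;
    for (int k = 0; k < 8; ++k) {
        sum += s[order[k]];
    }
    for (int k = 0; k < 8; ++k) {
        indices[k] = order[k];
        values[k] = routed_scaling_factor * s[order[k]] / sum;
    }
}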
#ifndef __topkrouter_INFO_H__
#define __topkrouter_INFO_H__
#include "../../../utils.h"
#include "../../tensor.h"
#include <vector>
namespace op::topkrouter {
class TopkrouterInfo {
TopkrouterInfo() = default;
public:
infiniDtype_t xtype;
std::vector<size_t> shape;
std::vector<ptrdiff_t> x_strides;
size_t N;
size_t width;
public:
size_t ndim() const { return shape.size(); }
size_t dim() const { return shape[ndim() - 1]; }
static utils::Result<TopkrouterInfo> create(infiniopTensorDescriptor_t x_desc) {
auto xtype = x_desc->dtype();
if ((xtype != infiniDtype_t::INFINI_DTYPE_F32) && (xtype != infiniDtype_t::INFINI_DTYPE_F16) && (xtype != infiniDtype_t::INFINI_DTYPE_BF16)) {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
if (x_desc->ndim() != 2) { // check rank before indexing shape()
    return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
size_t N = x_desc->shape()[0];     // number of tokens
size_t width = x_desc->shape()[1]; // number of experts
return utils::Result<TopkrouterInfo>(TopkrouterInfo{xtype, x_desc->shape(), x_desc->strides(), N, width});
}
};
} // namespace op::topkrouter
#endif // __topkrouter_INFO_H__
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
#include "topkrouter_nvidia.cuh"
#include <cub/block/block_reduce.cuh>
namespace op::topkrouter::nvidia {
struct Descriptor::Opaque {
std::shared_ptr<device::nvidia::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t correction_bias_desc) {
auto result = TopkrouterInfo::create(x_desc);
CHECK_RESULT(result);
auto info = result.take();
if (info.x_strides[1] != 1) {
return INFINI_STATUS_BAD_TENSOR_STRIDES;
}
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
std::move(info),
0,
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
namespace {
template <int BLOCK_SIZE = 128>
infiniStatus_t launch_topkrouter(float *d_values_out, int *d_indices_out, void *d_input, float *d_correction_bias, float routed_scaling_factor,
size_t N, size_t width, size_t topk, infiniDtype_t xtype, cudaStream_t stream) {
const int block_threads = BLOCK_SIZE;
dim3 blocks(N);
dim3 threads(block_threads);
if (xtype == INFINI_DTYPE_F32) {
topkrouter_kernel<float, BLOCK_SIZE><<<blocks, threads, 0, stream>>>(d_values_out, d_indices_out, (float *)d_input, d_correction_bias, routed_scaling_factor, N, width, topk);
} else if (xtype == INFINI_DTYPE_F16) {
topkrouter_kernel<half, BLOCK_SIZE><<<blocks, threads, 0, stream>>>(d_values_out, d_indices_out, (half *)d_input, d_correction_bias, routed_scaling_factor, N, width, topk);
} else if (xtype == INFINI_DTYPE_BF16) {
topkrouter_kernel<__nv_bfloat16, BLOCK_SIZE><<<blocks, threads, 0, stream>>>(d_values_out, d_indices_out, (__nv_bfloat16 *)d_input, d_correction_bias, routed_scaling_factor, N, width, topk);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace
infiniStatus_t Descriptor::calculate(
void *workspace, size_t workspace_size,
float *values, int *indices, void *x, float *correction_bias, float routed_scaling_factor, size_t topk, void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
size_t N = _info.N;
size_t width = _info.width; // 256
// size_t n_routed_experts = 256;
// size_t n_group = 8;
// size_t topk_group = 4;
auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
if (256 == width) {
    // Propagate the launch status (e.g. unsupported dtype) rather than dropping it
    return launch_topkrouter<256>(values, indices, x, correction_bias, routed_scaling_factor, N, width, topk, _info.xtype, cuda_stream);
} else {
return INFINI_STATUS_INTERNAL_ERROR;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::topkrouter::nvidia
#ifndef __Topkrouter_CUDA_H__
#define __Topkrouter_CUDA_H__
#include "../topkrouter.h"
DESCRIPTOR(nvidia)
#endif
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/topkrouter.h"
#ifdef ENABLE_CPU_API
#include "cpu/topkrouter_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API)
#include "nvidia/topkrouter_nvidia.cuh"
#endif
__C infiniStatus_t infiniopCreateTopkrouterDescriptor(
infiniopHandle_t handle,
infiniopTopkrouterDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t correction_bias_desc) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::topkrouter::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor **>(desc_ptr), \
x_desc, correction_bias_desc)
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
}
#undef CREATE
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniStatus_t infiniopGetTopkrouterWorkspaceSize(infiniopTopkrouterDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniStatus_t infiniopTopkrouter(infiniopTopkrouterDescriptor_t desc, void *workspace, size_t workspace_size,
void *values, void *indices, void *x, void *correction_bias, float routed_scaling_factor, size_t topk, void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor *>(desc)->calculate( \
workspace, workspace_size, (float *)values, (int *)indices, x, (float *)correction_bias, routed_scaling_factor, topk, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
}
#undef CALCULATE
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniStatus_t infiniopDestroyTopkrouterDescriptor(infiniopTopkrouterDescriptor_t desc) {
#define DESTROY(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DESTROY(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DESTROY(INFINI_DEVICE_NVIDIA, nvidia);
#endif
}
#undef DESTROY
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#ifndef _Topkrouter_H_
#define _Topkrouter_H_
#include "../../operator.h"
#include "info.h"
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::topkrouter::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
TopkrouterInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
Opaque *opaque, \
TopkrouterInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t x_desc, \
infiniopTensorDescriptor_t correction_bias_desc); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
float *values, \
int *indices, \
void *x, \
float *correction_bias, \
float routed_scaling_factor, \
size_t topk, \
void *stream) const; \
}; \
}
#endif // _Topkrouter_H_
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
# NOTE: these gemm-style cases only drive the harness loop; the dequantize test
# below uses fixed AWQ shapes regardless of the per-case parameters.
_TEST_CASES = [
# alpha, beta, a_shape, b_shape, c_shape, a_stride, b_stride, c_stride
(1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), None, None, None),
(1.0, 0.0, (2, 4, 2048), (2, 2048, 2048), (2, 4, 2048), None, None, None),
(1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), (4096, 1), (4096, 1), (4096, 1)),
(1.0, 1.0, (6, 2048), (2048, 2560), (6, 2560), (2048, 1), (1, 2048), (2560, 1)),
(1.0 / 8.0, 0.0, (4, 8 * 6, 64), (4, 64, 6), (4, 8 * 6, 6), None, None, None),
]
# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 0, "rtol": 1e-2},
InfiniDtype.F32: {"atol": 0, "rtol": 1e-3},
InfiniDtype.BF16: {"atol": 0, "rtol": 5e-2},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
# PyTorch implementation for matrix multiplication
def gemm(d, _c, beta, _a, _b, alpha):
    try:
        if _c.ndim == 2:
            torch.addmm(_c, _a, _b, beta=beta, alpha=alpha, out=d)
        elif _c.ndim == 3:
            torch.baddbmm(_c, _a, _b, beta=beta, alpha=alpha, out=d)
        else:
            raise ValueError("unsupported ndim for addmm/baddbmm")
    except Exception:
        # Fallback: plain matmul, then apply alpha/beta manually
        torch.matmul(_a, _b, out=d)
        d.mul_(alpha).add_(_c, alpha=beta)
# The argument list should be (lib, handle, torch_device, <param list>, dtype)
# The <param list> should keep the same order as the one specified in _TEST_CASES
def test(
handle,
device,
alpha,
beta,
a_shape,
b_shape,
c_shape,
a_stride=None,
b_stride=None,
c_stride=None,
dtype=InfiniDtype.F16,
sync=None,
):
    print(
        f"Testing Dequantize on {InfiniDeviceNames[device]} with dtype:{InfiniDtypeNames[dtype]}"
        f" (gemm-style case parameters are accepted by the harness but unused here)"
    )
    # AWQ int4 layout: K = 8192, N = 2048, group_size = 8192 / 64 = 128
    qweight = TestTensor((8192, 256), None, InfiniDtype.I32, device, mode="randint")  # [K, N/8]: 8 nibbles per int32
    scales = TestTensor((64, 2048), None, InfiniDtype.F16, device)  # [K/group_size, N]
    zeros = TestTensor((64, 256), None, InfiniDtype.I32, device, mode="zeros")  # [K/group_size, N/8]
    out = TestTensor((8192, 2048), None, InfiniDtype.F16, device, mode="zeros")  # [K, N]
print(out.actual_tensor())
descriptor = infiniopOperatorDescriptor_t()
check_error(
LIBINFINIOP.infiniopCreateDequantizeDescriptor(
handle,
ctypes.byref(descriptor),
out.descriptor,
qweight.descriptor,
scales.descriptor,
zeros.descriptor,
)
)
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    # for tensor in [out, qweight, scales, zeros]:
    #     tensor.destroy_desc()
# Get workspace size and create workspace
workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetDequantizeWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = TestWorkspace(workspace_size.value, device)
    # Execute the infiniop dequantize operator
def lib_dequantize():
check_error(
LIBINFINIOP.infiniopDequantize(
descriptor,
workspace.data(),
workspace_size.value,
out.data(),
qweight.data(),
scales.data(),
zeros.data(),
0,
0,
0,
None,
)
)
lib_dequantize()
print(out.actual_tensor())
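    # For a sanity check, a minimal PyTorch sketch of AWQ int4 dequantization under
    # common assumptions (8 nibbles per int32, AWQ nibble order [0, 4, 1, 5, 2, 6, 3, 7],
    # group_size = K // number of scale rows); a reference to compare against,
    # not the library's kernel.
    def awq_dequantize_reference(qweight_t, scales_t, zeros_t):
        K = qweight_t.shape[0]
        group_size = K // scales_t.shape[0]
        shifts = torch.arange(0, 32, 4, device=qweight_t.device)
        order = torch.tensor([0, 4, 1, 5, 2, 6, 3, 7], device=qweight_t.device)
        # Unpack 8 nibbles per int32 and undo the interleaved packing order
        iw = (torch.bitwise_right_shift(qweight_t[:, :, None], shifts) & 0xF)[:, :, order]
        iz = (torch.bitwise_right_shift(zeros_t[:, :, None], shifts) & 0xF)[:, :, order]
        iw = iw.reshape(K, -1).float()
        iz = iz.reshape(zeros_t.shape[0], -1).float()
        # Broadcast per-group zeros and scales over group_size rows, then dequantize
        return ((iw - iz.repeat_interleave(group_size, dim=0))
                * scales_t.float().repeat_interleave(group_size, dim=0)).half()
    # e.g. compare awq_dequantize_reference(qweight.torch_tensor(),
    #      scales.torch_tensor(), zeros.torch_tensor()) against out.actual_tensor()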
    # # Validate results (e.g. against awq_dequantize_reference above)
    # atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    # ans = awq_dequantize_reference(qweight.torch_tensor(), scales.torch_tensor(), zeros.torch_tensor())
    # if DEBUG:
    #     debug(out.actual_tensor(), ans, atol=atol, rtol=rtol)
    # assert torch.allclose(out.actual_tensor(), ans, atol=atol, rtol=rtol)
    # # Profiling workflow
    # if PROFILE:
    #     profile_operation("   lib", lambda: lib_dequantize(), device, NUM_PRERUN, NUM_ITERATIONS)
    check_error(LIBINFINIOP.infiniopDestroyDequantizeDescriptor(descriptor))
# ==============================================================================
# Main Execution
# ==============================================================================
if __name__ == "__main__":
args = get_args()
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
# Execute tests
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
@@ -387,6 +387,42 @@ def rope_(lib):
]
@OpRegister.operator
def rope_v2_(lib):
lib.infiniopCreateRoPEv2Descriptor.restype = c_int32
    lib.infiniopCreateRoPEv2Descriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,  # y
        infiniopTensorDescriptor_t,  # x
        infiniopTensorDescriptor_t,  # pos_ids
        infiniopTensorDescriptor_t,  # sin_table
        infiniopTensorDescriptor_t,  # cos_table
    ]
lib.infiniopGetRoPEv2WorkspaceSize.restype = c_int32
lib.infiniopGetRoPEv2WorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
    lib.infiniopRoPEv2.restype = c_int32
    lib.infiniopRoPEv2.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,  # workspace
        c_size_t,  # workspace_size
        c_void_p,  # y
        c_void_p,  # x
        c_void_p,  # pos_ids
        c_void_p,  # sin_table
        c_void_p,  # cos_table
        c_void_p,  # stream
    ]
lib.infiniopDestroyRoPEv2Descriptor.restype = c_int32
lib.infiniopDestroyRoPEv2Descriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def sub_(lib):
lib.infiniopCreateSubDescriptor.restype = c_int32
@@ -489,3 +525,74 @@ def conv_(lib):
lib.infiniopDestroyConvDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def topkrouter_(lib):
lib.infiniopCreateTopkrouterDescriptor.restype = c_int32
    lib.infiniopCreateTopkrouterDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,  # x
        infiniopTensorDescriptor_t,  # correction_bias
    ]
lib.infiniopGetTopkrouterWorkspaceSize.restype = c_int32
lib.infiniopGetTopkrouterWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
lib.infiniopTopkrouter.restype = c_int32
lib.infiniopTopkrouter.argtypes = [
infiniopOperatorDescriptor_t,
c_void_p,
c_size_t,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_float,
c_size_t,
c_void_p,
]
lib.infiniopDestroyTopkrouterDescriptor.restype = c_int32
lib.infiniopDestroyTopkrouterDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
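# A hypothetical end-to-end call through the bindings registered above; the tensor
# arguments are assumed to expose .descriptor and .data() like the test utilities'
# TestTensor, and a zero return status is assumed to mean success. Illustrative only.
def _example_topkrouter_call(lib, handle, x, correction_bias, values, indices,
                             routed_scaling_factor=2.5, topk=8):
    import ctypes
    from ctypes import c_size_t
    desc = infiniopOperatorDescriptor_t()
    assert lib.infiniopCreateTopkrouterDescriptor(
        handle, ctypes.byref(desc), x.descriptor, correction_bias.descriptor) == 0
    size = c_size_t(0)
    assert lib.infiniopGetTopkrouterWorkspaceSize(desc, ctypes.byref(size)) == 0
    # A real caller would allocate size.value bytes of device workspace here;
    # the NVIDIA implementation above reports a zero-sized workspace.
    assert lib.infiniopTopkrouter(
        desc, None, size.value,
        values.data(), indices.data(), x.data(), correction_bias.data(),
        routed_scaling_factor, topk, None) == 0
    assert lib.infiniopDestroyTopkrouterDescriptor(desc) == 0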
@OpRegister.operator
def dequantize_(lib):
lib.infiniopCreateDequantizeDescriptor.restype = c_int32
lib.infiniopCreateDequantizeDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetDequantizeWorkspaceSize.restype = c_int32
lib.infiniopGetDequantizeWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
    lib.infiniopDequantize.restype = c_int32
    # Pointer/size layout matches the ten arguments passed in the test above
    lib.infiniopDequantize.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,  # workspace
        c_size_t,  # workspace_size
        c_void_p,  # out
        c_void_p,  # qweight
        c_void_p,  # scales
        c_void_p,  # zeros
        c_size_t,
        c_size_t,
        c_size_t,
        c_void_p,  # stream
    ]
lib.infiniopDestroyDequantizeDescriptor.restype = c_int32
lib.infiniopDestroyDequantizeDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@@ -78,6 +78,8 @@ class TestTensor(CTensor):
self._torch_tensor = torch.ones(
torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device]
)
elif mode == "randint":
self._torch_tensor = torch.randint(-2000000000,2000000000, torch_shape,dtype=to_torch_dtype(dt), device=torch_device_map[device])
elif mode == "manual":
assert set_tensor is not None
assert torch_shape == list(set_tensor.shape)
@@ -37,7 +37,7 @@ _TEST_CASES_ = [
# w (weight) types
# Note: 'None' means the same as input dtype
_WEIGHT_DTYPES = [None, InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16]
# x types used for testing
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16]