Commit ed530e11 authored by pengcheng888's avatar pengcheng888
Browse files

issue/427 - add the sigmoid, topksoftmax, and topkrouter ops

parent 3959c943
#ifndef __Topkrouter_CUDA_H__
#define __Topkrouter_CUDA_H__
#ifndef __TOPKROUTER_CUDA_H__
#define __TOPKROUTER_CUDA_H__
#include "../topkrouter.h"
......
......@@ -9,18 +9,13 @@
#include "nvidia/topkrouter_nvidia.cuh"
#endif
__C infiniStatus_t infiniopCreateTopkrouterDescriptor(
infiniopHandle_t handle,
infiniopTopkrouterDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t correction_bias_desc) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::topkrouter::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor **>(desc_ptr), \
x_desc, correction_bias_desc)
__C infiniStatus_t infiniopCreateTopkrouterDescriptor(infiniopHandle_t handle, infiniopTopkrouterDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t correction_bias_desc) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::topkrouter::NAMESPACE::Descriptor::create( \
handle, reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor **>(desc_ptr), x_desc, correction_bias_desc)
switch (handle->device) {
#ifdef ENABLE_CPU_API
......@@ -37,7 +32,6 @@ __C infiniStatus_t infiniopCreateTopkrouterDescriptor(
}
__C infiniStatus_t infiniopGetTopkrouterWorkspaceSize(infiniopTopkrouterDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
......@@ -58,12 +52,13 @@ __C infiniStatus_t infiniopGetTopkrouterWorkspaceSize(infiniopTopkrouterDescript
}
__C infiniStatus_t infiniopTopkrouter(infiniopTopkrouterDescriptor_t desc, void *workspace, size_t workspace_size,
void *values, void *indices, void *x, void *correction_bias, float routed_scaling_factor, size_t topk, void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor *>(desc)->calculate( \
workspace, workspace_size, (float *)values, (int *)indices, x, (float *)correction_bias, routed_scaling_factor, topk, stream)
void *values, void *indices, const void *x, const void *correction_bias,
const float routed_scaling_factor, const size_t topk, void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor *>(desc)->calculate( \
workspace, workspace_size, (float *)values, (int *)indices, x, (float *)correction_bias, routed_scaling_factor, \
topk, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
......@@ -80,7 +75,6 @@ __C infiniStatus_t infiniopTopkrouter(infiniopTopkrouterDescriptor_t desc, void
}
__C infiniStatus_t infiniopDestroyTopkrouterDescriptor(infiniopTopkrouterDescriptor_t desc) {
#define DESTROY(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<op::topkrouter::NAMESPACE::Descriptor *>(desc); \
......
#ifndef _Topkrouter_H_
#define _Topkrouter_H_
#ifndef _TOPKTOUTER_H_
#define _TOPKTOUTER_H_
#include "../../operator.h"
#include "info.h"
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::topkrouter::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
TopkrouterInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
Opaque *opaque, \
TopkrouterInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t x_desc, \
infiniopTensorDescriptor_t correction_bias_desc); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
float *values, \
int *indices, \
void *x, \
float *correction_bias, \
float routed_scaling_factor, \
size_t topk, \
void *stream) const; \
}; \
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::topkrouter::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
TopkrouterInfo _info; \
size_t _workspace_size; \
\
Descriptor(Opaque *opaque, \
TopkrouterInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size) { \
} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { \
return _workspace_size; \
} \
\
static infiniStatus_t create(infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t x_desc, \
infiniopTensorDescriptor_t correction_bias_desc); \
\
infiniStatus_t calculate(void *workspace, \
size_t workspace_size, \
float *values, \
int *indices, \
const void *x, \
const float *correction_bias, \
const float routed_scaling_factor, \
const size_t topk, \
void *stream) const; \
}; \
}
#endif // _Topkrouter_H_
#include "topksoftmax_cpu.h"
#include "../../../../utils.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../../reduce/cpu/reduce.h"
#include "topksoftmax_cpu.h"
#include <algorithm>
namespace op::topksoftmax::cpu {
// CPU descriptor holds no opaque backend state, so destruction is a no-op.
Descriptor::~Descriptor() {
}

// Validate x_desc and build a CPU descriptor.
// x must be a 2-D tensor [N, width] of f32/f16/bf16; the row-wise worker
// indexes rows as contiguous spans, so the innermost stride must be 1.
// No workspace is needed on CPU (workspace size 0).
infiniStatus_t Descriptor::create(infiniopHandle_t handle, Descriptor **desc_ptr, infiniopTensorDescriptor_t x_desc) {
    auto result = TopksoftmaxInfo::create(x_desc);
    CHECK_RESULT(result);
    auto info = result.take();

    if (info.x_strides[1] != 1) {
        return INFINI_STATUS_BAD_TENSOR_STRIDES;
    }

    *desc_ptr = new Descriptor(nullptr, std::move(info), 0, handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// Softmax + top-k selection for a single token (one row of logits).
//
// value_index_arr holds (logit, expert_index) pairs on entry; it is used as
// scratch and holds softmax probabilities (partially reordered) on exit.
//
// values_out  : output, the top-k softmax probabilities (descending).
// indices_out : output, the expert indices matching values_out.
// topk        : number of experts to keep; must be <= width.
// norm        : if true, renormalize the selected k probabilities to sum to 1.
// width       : number of experts (valid entries in value_index_arr).
void topksoftmax_cpu_one_token(float *values_out,
                               int *indices_out,
                               std::vector<std::pair<float, size_t>> &value_index_arr,
                               size_t topk,
                               bool norm,
                               size_t width) {
    // Step 1: max logit, for a numerically stable softmax.
    float value_max = value_index_arr[0].first;
    for (size_t i = 1; i < width; ++i) {
        value_max = std::max(value_max, value_index_arr[i].first);
    }

    // Step 2: exponentiate (shifted by the max) and accumulate the sum.
    float exp_sum = 0.0f;
    for (size_t i = 0; i < width; ++i) {
        float value = std::exp(value_index_arr[i].first - value_max);
        value_index_arr[i].first = value;
        exp_sum += value;
    }

    // Step 3: normalize to softmax probabilities.
    for (size_t i = 0; i < width; ++i) {
        value_index_arr[i].first /= exp_sum;
    }

    // Step 4: order only the k largest entries. A full sort is unnecessary
    // since only the first `topk` elements are read below.
    std::partial_sort(value_index_arr.begin(), value_index_arr.begin() + topk, value_index_arr.end(),
                      [](const std::pair<float, size_t> &a, const std::pair<float, size_t> &b) { return a.first > b.first; });

    // Step 5: emit the top-k values/indices and accumulate their probability mass.
    exp_sum = 0.0f;
    for (size_t i = 0; i < topk; ++i) {
        values_out[i] = value_index_arr[i].first;
        indices_out[i] = static_cast<int>(value_index_arr[i].second);
        exp_sum += values_out[i];
    }

    // Step 6: optionally renormalize the selected probabilities to sum to 1.
    if (norm) {
        for (size_t i = 0; i < topk; ++i) {
            values_out[i] /= exp_sum;
        }
    }
}
// Apply softmax + top-k row by row on the CPU.
//
// Memory layout (row-major, rows contiguous):
//   values/indices: [N, topk], x: [N, width]
//
//   O-----------> width (contiguous addresses)
//   |
//   N
template <typename T>
infiniStatus_t topksoftmax_cpu_func(float *values, int *indices,
                                    const T *x,
                                    size_t topk, bool norm, size_t N, size_t width) {
    for (size_t n = 0; n < N; ++n) {
        float *values_input = values + n * topk;
        int *indices_input = indices + n * topk;
        const T *x_input = x + n * width;

        std::vector<std::pair<float, size_t>> value_index_arr;
        value_index_arr.resize(width);

        // Step 0: widen this row to float (f16/bf16 via project conversion
        // helpers), pairing each value with its expert index.
        float temp;
        for (size_t i = 0; i < width; ++i) {
            if constexpr (std::is_same<T, fp16_t>::value) {
                temp = _f16_to_f32(x_input[i]);
            } else if constexpr (std::is_same<T, bf16_t>::value) {
                temp = _bf16_to_f32(x_input[i]);
            } else {
                temp = x_input[i];
            }
            value_index_arr[i] = {temp, i};
        }

        topksoftmax_cpu_one_token(values_input,
                                  indices_input,
                                  value_index_arr,
                                  topk,
                                  norm,
                                  width);
    }
    return INFINI_STATUS_SUCCESS;
}
// Run topksoftmax on the CPU. workspace/workspace_size/stream are unused on
// this backend (workspace size is always 0).
infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, float *values, int *indices, const void *x,
                                     const size_t topk, const bool norm, void *stream) const {
    size_t N = _info.N;
    size_t width = _info.width;

    // Dispatch on the input dtype and propagate the worker's status instead
    // of discarding it (previously the return values were ignored).
    switch (_info.xtype) {
    case INFINI_DTYPE_F32:
        return topksoftmax_cpu_func<float>(values, indices, (const float *)x, topk, norm, N, width);
    case INFINI_DTYPE_F16:
        return topksoftmax_cpu_func<fp16_t>(values, indices, (const fp16_t *)x, topk, norm, N, width);
    case INFINI_DTYPE_BF16:
        return topksoftmax_cpu_func<bf16_t>(values, indices, (const bf16_t *)x, topk, norm, N, width);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
} // namespace op::topksoftmax::cpu
#ifndef __TOPKSOFTMAX_CPU_H__
#define __TOPKSOFTMAX_CPU_H__
#include "../topksoftmax.h"
DESCRIPTOR(cpu)
#endif
#ifndef _TOPKSOFTMAX_KERNEL_CUH__
#define _TOPKSOFTMAX_KERNEL_CUH__
#include <cfloat>
#include <cub/block/block_load.cuh>
#include <cub/block/block_radix_sort.cuh>
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_store.cuh>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
// Convert x to float and return exp(x) via the fast device intrinsic.
// Supported element types: float, half, __nv_bfloat16 (checked at compile
// time — the original left `data` uninitialized for any other T, which is UB).
template <typename T>
inline __device__ float exp_func(T x) {
    static_assert(std::is_same_v<T, float> || std::is_same_v<T, half> || std::is_same_v<T, __nv_bfloat16>,
                  "exp_func supports only float, half and __nv_bfloat16");
    float data;
    if constexpr (std::is_same_v<T, float>) {
        data = x;
    } else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
        data = __bfloat162float(x);
    } else {
        data = __half2float(x);
    }
    // __expf trades a few ulps of accuracy for speed; acceptable for softmax.
    return __expf(data);
}
// Fused row-wise softmax + top-k selection.
//
// Launch contract (enforced by the host-side launcher):
//   - one block per row: gridDim.x == N, blockDim.x == BLOCK_SIZE
//   - width <= BLOCK_SIZE, so each thread owns at most one column
//   - topk <= 32 (checked at the C API level), so warp 0 alone holds all
//     surviving entries after the block-wide descending sort
template <typename T, int BLOCK_SIZE = 128>
__global__ void softmax_topk_row_kernel(float *values_topk, // output values, shape [N, topk]
                                        int *indices_topk,  // output indices, shape [N, topk]
                                        const T *input,     // input logits, shape [N, width]
                                        const size_t N,
                                        const size_t width,
                                        const size_t topk,
                                        bool norm
) {
    const int bid = blockIdx.x;
    if (bid >= N) {
        return;
    }
    const int tid = threadIdx.x;

    // Per-row base pointers.
    const T *data_input = input + bid * width;
    float *values_topk_output = values_topk + bid * topk;
    int *indices_topk_output = indices_topk + bid * topk;

    const int warp_id = tid / 32;

    __shared__ T shared_max;     // row maximum, for a numerically stable softmax
    __shared__ float shared_sum; // reused: exp-sum first, then top-k mass

    typedef cub::BlockReduce<T, BLOCK_SIZE> BlockReduce;

    // Step 1: block-wide maximum. Threads with tid >= width contribute
    // data_input[0], which cannot exceed the true maximum.
    T thread_max = data_input[0];
    if (tid < width) {
        thread_max = thread_max > data_input[tid] ? thread_max : data_input[tid];
    }
    {
        __shared__ typename BlockReduce::TempStorage temp_storage_max;
        T value_max = BlockReduce(temp_storage_max).Reduce(thread_max, cub::Max());
        if (tid == 0) {
            shared_max = value_max;
        }
    }
    __syncthreads();

    // Step 2: per-thread exp(x - max), then block-wide sum; out-of-range
    // threads contribute 0.
    // NOTE(review): BlockReduce is instantiated on T, so for half/bf16 the
    // exp-sum appears to accumulate in reduced precision — consider
    // cub::BlockReduce<float, BLOCK_SIZE>; verify accuracy requirements.
    float exp_val = 0.0f;
    if (tid < width) {
        T temp_val = data_input[tid] - shared_max;
        exp_val = exp_func<T>(temp_val);
    }
    {
        __shared__ typename BlockReduce::TempStorage temp_storage_sum;
        float value_sum = BlockReduce(temp_storage_sum).Sum(exp_val);
        if (tid == 0) {
            shared_sum = value_sum;
        }
    }
    __syncthreads();

    // Step 3: softmax probability for this thread's column.
    exp_val /= shared_sum;

    // Step 4: block-wide descending sort of (probability, column) pairs.
    // Threads without a valid column carry (-FLT_MAX, -1) sentinels, which
    // sort to the back; afterwards thread t holds the (t+1)-th largest value.
    float thread_values[1] = {-FLT_MAX};
    int thread_indices[1] = {-1};
    if ((tid < width) && (exp_val > thread_values[0])) {
        thread_values[0] = exp_val;
        thread_indices[0] = tid;
    }
    {
        typedef cub::BlockRadixSort<float, BLOCK_SIZE, 1, int> BlockRadixSort;
        __shared__ typename BlockRadixSort::TempStorage temp_storage;
        BlockRadixSort(temp_storage).SortDescending(thread_values, thread_indices);
    }
    __syncthreads();

    // Only warp 0 continues: with topk <= 32 it holds every selected entry.
    if (0 == warp_id) {
        int indice = -1;
        float value = 0.0f;
        if (tid < topk) {
            indice = thread_indices[0];
            value = thread_values[0];
        }
        // Step 5: warp-sum of the selected top-k mass (lanes >= topk add 0).
        {
            typedef cub::WarpReduce<float, 32> WarpReduce;
            __shared__ typename WarpReduce::TempStorage temp_storage;
            float warp_sum = WarpReduce(temp_storage).Sum(value);
            if (0 == tid) {
                // +1e-9f guards the division below against a zero sum.
                shared_sum = warp_sum + 1e-9f;
            }
        }
        __syncwarp();
        // Step 6: optional renormalization of the selected probabilities.
        if (norm && (tid < topk)) {
            value /= shared_sum;
        }
        // Step 7: write the final top-k values and indices.
        if (tid < topk) {
            values_topk_output[tid] = value;
            indices_topk_output[tid] = indice;
        }
    }
}
#endif // _TOPKSOFTMAX_KERNEL_CUH__
#ifndef __TOPKSOFTMAX_INFO_H__
#define __TOPKSOFTMAX_INFO_H__

#include "../../../utils.h"
#include "../../tensor.h"
#include <vector>

namespace op::topksoftmax {

// Validated, device-independent metadata for a topksoftmax operation:
// input dtype plus the 2-D shape/strides, where N is the number of tokens
// (rows) and width the number of experts (columns).
class TopksoftmaxInfo {
    TopksoftmaxInfo() = default; // built only through create()

public:
    infiniDtype_t xtype;              // input dtype: f32, f16 or bf16
    std::vector<size_t> shape;        // input shape, always rank 2
    std::vector<ptrdiff_t> x_strides; // input strides, in elements
    size_t N;                         // number of tokens (rows)
    size_t width;                     // number of experts (columns)

public:
    size_t ndim() const { return shape.size(); }
    size_t dim() const { return shape[ndim() - 1]; }

    // Validate dtype (f32/f16/bf16) and rank (must be exactly 2), then
    // capture the shape/stride metadata. Returns an error status otherwise.
    static utils::Result<TopksoftmaxInfo> create(infiniopTensorDescriptor_t x_desc) {
        auto xtype = x_desc->dtype();
        if ((xtype != infiniDtype_t::INFINI_DTYPE_F32) && (xtype != infiniDtype_t::INFINI_DTYPE_F16) && (xtype != infiniDtype_t::INFINI_DTYPE_BF16)) {
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }
        if (x_desc->ndim() != 2) {
            return INFINI_STATUS_BAD_TENSOR_SHAPE;
        }
        size_t N = x_desc->shape()[0];     // number of tokens
        size_t width = x_desc->shape()[1]; // number of experts
        return utils::Result<TopksoftmaxInfo>(TopksoftmaxInfo{xtype,
                                                              x_desc->shape(),
                                                              x_desc->strides(),
                                                              N,
                                                              width});
    }
};

} // namespace op::topksoftmax

#endif // __TOPKSOFTMAX_INFO_H__
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "topksoftmax_nvidia.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
#include "../cuda/kernel.cuh"
namespace op::topksoftmax::nvidia {
// Backend-private state: keeps the shared device-handle internals alive for
// the descriptor's lifetime.
struct Descriptor::Opaque {
    std::shared_ptr<device::nvidia::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}
// Validate x_desc and build an nvidia descriptor. x must be a 2-D tensor
// [N, width] of f32/f16/bf16; the kernel reads rows as contiguous spans, so
// the innermost stride must be 1. No device workspace is required (size 0).
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t x_desc) {
    auto result = TopksoftmaxInfo::create(x_desc);
    CHECK_RESULT(result);
    auto info = result.take();

    if (info.x_strides[1] != 1) {
        return INFINI_STATUS_BAD_TENSOR_STRIDES;
    }

    *desc_ptr = new Descriptor(
        new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
        std::move(info),
        0,
        handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
namespace {

// Launch the row-wise softmax+topk kernel: one block per row, BLOCK_SIZE
// threads per block. The kernel requires width <= BLOCK_SIZE; the caller
// selects BLOCK_SIZE accordingly.
template <int BLOCK_SIZE = 128>
infiniStatus_t launch_topksoftmax(float *d_values_out, int *d_indices_out, const void *d_input, const size_t N, const size_t width, const size_t topk, const bool norm, infiniDtype_t xtype, cudaStream_t stream) {
    const int block_threads = BLOCK_SIZE;
    dim3 blocks(static_cast<unsigned int>(N));
    dim3 threads(block_threads);

    if (xtype == INFINI_DTYPE_F32) {
        softmax_topk_row_kernel<float, BLOCK_SIZE><<<blocks, threads, 0, stream>>>(d_values_out, d_indices_out, static_cast<const float *>(d_input), N, width, topk, norm);
    } else if (xtype == INFINI_DTYPE_F16) {
        softmax_topk_row_kernel<half, BLOCK_SIZE><<<blocks, threads, 0, stream>>>(d_values_out, d_indices_out, static_cast<const half *>(d_input), N, width, topk, norm);
    } else if (xtype == INFINI_DTYPE_BF16) {
        softmax_topk_row_kernel<__nv_bfloat16, BLOCK_SIZE><<<blocks, threads, 0, stream>>>(d_values_out, d_indices_out, static_cast<const __nv_bfloat16 *>(d_input), N, width, topk, norm);
    } else {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

    // Kernel launches do not return errors directly; surface launch-config
    // failures (bad grid/block, invalid stream, ...) here instead of losing them.
    if (cudaGetLastError() != cudaSuccess) {
        return INFINI_STATUS_INTERNAL_ERROR;
    }
    return INFINI_STATUS_SUCCESS;
}

}; // namespace
// Run topksoftmax on the GPU. Picks the smallest supported block size that
// covers a whole row (the kernel requires width <= BLOCK_SIZE) and forwards
// the work to launch_topksoftmax on the given stream.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    float *values,
    int *indices,
    const void *x,
    const size_t topk,
    const bool norm,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    size_t N = _info.N;
    size_t width = _info.width;
    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);

    // Propagate the launcher's status; it was previously discarded, so a bad
    // dtype or failed launch was reported as success.
    if (width <= 128) {
        return launch_topksoftmax<128>(values, indices, x, N, width, topk, norm, _info.xtype, cuda_stream);
    } else if (width <= 256) {
        return launch_topksoftmax<256>(values, indices, x, N, width, topk, norm, _info.xtype, cuda_stream);
    } else if (width <= 512) {
        return launch_topksoftmax<512>(values, indices, x, N, width, topk, norm, _info.xtype, cuda_stream);
    }
    // Rows wider than 512 experts are not supported by this kernel.
    return INFINI_STATUS_INTERNAL_ERROR;
}
} // namespace op::topksoftmax::nvidia
#ifndef __TOPKSOFTMAX_CUDA_H__
#define __TOPKSOFTMAX_CUDA_H__
#include "../topksoftmax.h"
DESCRIPTOR(nvidia)
#endif
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/topksoftmax.h"
#ifdef ENABLE_CPU_API
#include "cpu/topksoftmax_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API)
#include "nvidia/topksoftmax_nvidia.cuh"
#endif
// C API: create a topksoftmax descriptor for the device owning `handle`.
__C infiniStatus_t infiniopCreateTopksoftmaxDescriptor(infiniopHandle_t handle,
                                                       infiniopTopksoftmaxDescriptor_t *desc_ptr,
                                                       infiniopTensorDescriptor_t x_desc) {
// Expands to one switch case forwarding to the backend's factory.
#define CREATE(CASE, NAMESPACE)                              \
    case CASE:                                               \
        return op::topksoftmax::NAMESPACE::Descriptor::create( \
            handle, reinterpret_cast<op::topksoftmax::NAMESPACE::Descriptor **>(desc_ptr), x_desc)

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
    }

#undef CREATE
    // Reached when no case for this device was compiled in.
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// C API: query the device workspace size required by infiniopTopksoftmax.
__C infiniStatus_t infiniopGetTopksoftmaxWorkspaceSize(infiniopTopksoftmaxDescriptor_t desc, size_t *size) {
// Expands to one switch case reading the backend descriptor's size.
#define GET(CASE, NAMESPACE)                                                                     \
    case CASE:                                                                                   \
        *size = reinterpret_cast<op::topksoftmax::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
    }

#undef GET
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// C API: run topksoftmax. `norm` is an int flag (0/1) for C compatibility and
// is converted to bool before dispatch.
__C infiniStatus_t infiniopTopksoftmax(infiniopTopksoftmaxDescriptor_t desc, void *workspace, size_t workspace_size,
                                       void *values, void *indices, const void *x, const size_t topk, const int norm,
                                       void *stream) {
    // The CUDA kernel emits the final top-k from a single warp, so topk is
    // capped at 32 for every backend.
    if (topk > 32) {
        return INFINI_STATUS_BAD_PARAM;
    }

// Expands to one switch case forwarding to the backend's calculate().
#define CALCULATE(CASE, NAMESPACE)                                                      \
    case CASE:                                                                          \
        return reinterpret_cast<op::topksoftmax::NAMESPACE::Descriptor *>(desc)->calculate( \
            workspace, workspace_size, (float *)values, (int *)indices, x, topk, static_cast<bool>(norm), stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
    }

#undef CALCULATE
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// C API: destroy a topksoftmax descriptor created by the matching Create call.
__C infiniStatus_t infiniopDestroyTopksoftmaxDescriptor(infiniopTopksoftmaxDescriptor_t desc) {
// Expands to one switch case deleting the backend descriptor.
#define DESTROY(CASE, NAMESPACE)                                              \
    case CASE:                                                                \
        delete reinterpret_cast<op::topksoftmax::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DESTROY(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DESTROY(INFINI_DEVICE_NVIDIA, nvidia);
#endif
    }

#undef DESTROY
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#ifndef _TOPKSOFTMAX_H_
#define _TOPKSOFTMAX_H_

#include "../../operator.h"
#include "info.h"

/*
 * DESCRIPTOR(NAMESPACE) declares the topksoftmax Descriptor class inside
 * op::topksoftmax::NAMESPACE. Each backend (cpu, nvidia, ...) instantiates
 * this macro once and then defines ~Descriptor, create() and calculate().
 */
#define DESCRIPTOR(NAMESPACE)                                                                                         \
                                                                                                                      \
    namespace op::topksoftmax::NAMESPACE {                                                                            \
    class Descriptor final : public InfiniopDescriptor {                                                              \
        struct Opaque; /* backend-private state, defined by the backend */                                            \
        Opaque *_opaque;                                                                                              \
        TopksoftmaxInfo _info; /* validated shape/dtype metadata */                                                   \
        size_t _workspace_size; /* workspace bytes calculate() requires */                                            \
                                                                                                                      \
        /* Private: instances are built via the static create() factory. */                                           \
        Descriptor(Opaque *opaque, TopksoftmaxInfo info, size_t workspace_size, infiniDevice_t device_type, int device_id) \
            : InfiniopDescriptor{device_type, device_id}, _opaque(opaque), _info(info), _workspace_size(workspace_size) { \
        }                                                                                                             \
                                                                                                                      \
    public:                                                                                                           \
        ~Descriptor();                                                                                                \
                                                                                                                      \
        size_t workspaceSize() const {                                                                                \
            return _workspace_size;                                                                                   \
        }                                                                                                             \
                                                                                                                      \
        /* Validate x_desc and construct a backend descriptor. */                                                     \
        static infiniStatus_t create(infiniopHandle_t handle, Descriptor **desc_ptr, infiniopTensorDescriptor_t x_desc); \
                                                                                                                      \
        /* Write top-k softmax values/indices for each row of x. */                                                   \
        infiniStatus_t calculate(void *workspace, size_t workspace_size, float *values, int *indices, const void *x,  \
                                 const size_t topk, const bool norm, void *stream) const;                             \
    };                                                                                                                \
    }

#endif // _TOPKSOFTMAX_H_
import numpy as np
from numpy.lib.stride_tricks import as_strided
import gguf
from typing import List
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor
def sigmoid(
    x: np.ndarray,
):
    """Elementwise logistic function: 1 / (1 + exp(-x))."""
    neg_exp = np.exp(-x)
    return 1 / (1 + neg_exp)
def random_tensor(shape, dtype):
    """Centered uniform noise: values in [-5e-4, 5e-4)."""
    scale = 1e-3
    offset = 0.5 * scale
    samples = np.random.rand(*shape).astype(dtype)
    return scale * samples - offset
def process_tensors(a, b, stride_a=None, stride_b=None):
    """Collapse broadcast (stride-0) axes of `a` and `b` to length 1.

    An axis whose declared stride is 0 stores one repeated element, so only
    the first slice is kept; tensors with no stride info pass through as-is.
    """
    def collapse(tensor, stride):
        if not stride:
            return tensor
        index = tuple(slice(0, 1) if s == 0 else slice(None) for s in stride)
        return tensor[index]

    return collapse(a, stride_a), collapse(b, stride_b)
def process_tensor(a, stride_a=None):
    """Collapse broadcast (stride-0) axes of `a` to length 1.

    An axis whose declared stride is 0 stores one repeated element, so only
    the first slice is kept; with no stride info the tensor passes through.
    """
    if not stride_a:
        return a
    index = tuple(slice(0, 1) if s == 0 else slice(None) for s in stride_a)
    return a[index]
class SigmoidTestCase(InfiniopTestCase):
    """GGUF test-case writer for the sigmoid op.

    Holds the input x, a placeholder output y, their shapes/strides, and
    serializes them together with a float64 reference answer.
    """

    def __init__(
        self,
        x: np.ndarray,
        shape_x: List[int] | None,
        stride_x: List[int] | None,
        y: np.ndarray,
        shape_y: List[int] | None,
        stride_y: List[int] | None,
    ):
        super().__init__("sigmoid")
        self.x = x
        self.shape_x = shape_x
        self.stride_x = stride_x
        self.y = y
        self.shape_y = shape_y
        self.stride_y = stride_y

    def write_test(self, test_writer: "InfiniopTestWriter"):
        """Serialize shapes, strides, tensors and the reference answer."""
        super().write_test(test_writer)
        if self.shape_x is not None:
            test_writer.add_array(test_writer.gguf_key("x.shape"), self.shape_x)
        if self.shape_y is not None:
            test_writer.add_array(test_writer.gguf_key("y.shape"), self.shape_y)
        if self.stride_x is not None:
            test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x))
        # y falls back to contiguous strides when none were given.
        test_writer.add_array(
            test_writer.gguf_key("y.strides"),
            gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y))
        )
        test_writer.add_tensor(
            test_writer.gguf_key("x"), self.x, raw_dtype=np_dtype_to_ggml(self.x.dtype)
        )
        test_writer.add_tensor(
            test_writer.gguf_key("y"), self.y, raw_dtype=np_dtype_to_ggml(self.y.dtype)
        )
        # Reference answer in float64. For zero-stride (broadcast) inputs,
        # rebuild the broadcast view so the answer has the full logical shape.
        input_x = self.x.astype(np.float64)
        if (self.stride_x is not None) and (0 in self.stride_x):
            typesize = np.dtype(input_x.dtype).itemsize
            # as_strided expects byte strides; stride_x is in elements.
            new_strides_bytes = tuple(x * typesize for x in self.stride_x)
            input_x = as_strided(x=input_x, shape=self.shape_x, strides=new_strides_bytes)
        ans = sigmoid(input_x)
        test_writer.add_tensor(
            test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64
        )
if __name__ == '__main__':
    test_writer = InfiniopTestWriter("sigmoid.gguf")
    test_cases = []
    # Each entry: (shape, x_stride, y_stride); strides are in elements,
    # None means contiguous, a 0 stride marks a broadcast axis.
    _TEST_CASES_ = [
        # shape, x_stride, y_stride
        ((13, 4), None, None),
        ((13, 4), (10, 1), (10, 1)),
        ((13, 4), (0, 1), None),
        ((13, 4, 4), None, None),
        ((13, 4, 4), (20, 4, 1), (20, 4, 1)),
        ((13, 4, 4), (4, 0, 1), None),
        ((16, 5632), None, None),
        ((16, 5632), (13312, 1), (13312, 1)),
        ((4, 4, 5632), None, None),
        ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
    ]
    _TENSOR_DTYPES_ = [np.float16, np.float32]
    for dtype in _TENSOR_DTYPES_:
        for shape, stride_x, stride_y in _TEST_CASES_:
            x = np.random.rand(*shape).astype(dtype)
            # y is written as a zero-sized placeholder; the test runner
            # allocates the real output buffer from y.shape / y.strides.
            y = np.empty(tuple(0 for _ in shape), dtype=dtype)
            # Drop repeated elements along zero-stride axes before storing x.
            x = process_zero_stride_tensor(x, stride_x)
            test_case = SigmoidTestCase(x=x,
                                        shape_x=shape,
                                        stride_x=stride_x,
                                        y=y,
                                        shape_y=shape,
                                        stride_y=stride_y)
            test_cases.append(test_case)
    test_writer.add_tests(test_cases)
    test_writer.save()
import numpy as np
from typing import List
import torch
import torch.nn as nn
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
def random_tensor(shape: tuple, dtype: np.dtype) -> np.ndarray:
    """Uniform noise scaled into roughly [-1e-3, 1e-3)."""
    noise = np.random.uniform(-1.0, 1.0, shape).astype(dtype)
    return noise * 0.001
class DeepseekV3TopkRouter(nn.Module):
    """Reference (PyTorch) implementation of the DeepseekV3 expert router.

    Pipeline: sigmoid scores -> bias-corrected grouped selection (score each
    group by the sum of its top-2 experts, keep `topk_group` groups) ->
    global top-k within the surviving groups -> gather the *uncorrected*
    sigmoid weights, optionally normalize them, then scale by
    `routed_scaling_factor`.
    """

    def __init__(self, correction_bias,
                 routed_scaling_factor: float,
                 topk: int, config=None):
        super().__init__()
        self.config = config
        # Fixed DeepseekV3 hyper-parameters (mirroring the reference config).
        # The original also hard-coded top_k / routed_scaling_factor here and
        # immediately overwrote them with the ctor args; those dead stores
        # are removed.
        self.n_routed_experts = 256  # config.n_routed_experts
        self.n_group = 8             # config.n_group
        self.topk_group = 4          # config.topk_group
        self.norm_topk_prob = True   # config.norm_topk_prob
        # Caller-provided routing parameters.
        self.routed_scaling_factor = routed_scaling_factor
        self.top_k = topk
        # Bias added to the scores only for expert *selection* (not weighting).
        self.e_score_correction_bias = torch.zeros(256, )
        self.e_score_correction_bias[:] = correction_bias[:]

    @torch.no_grad()
    def get_topk_indices(self, scores):
        """Pick self.top_k expert indices using grouped, bias-corrected scores."""
        scores_for_choice = scores.view(-1, self.n_routed_experts) + self.e_score_correction_bias.unsqueeze(0)
        # Score each group by the sum of its two best experts.
        group_scores = (
            scores_for_choice.view(-1, self.n_group, self.n_routed_experts // self.n_group)
            .topk(2, dim=-1)[0]
            .sum(dim=-1)
        )
        group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=True)[1]
        group_mask = torch.zeros_like(group_scores)
        group_mask.scatter_(1, group_idx, 1)
        # Zero out experts in unselected groups, then take the global top-k.
        score_mask = (
            group_mask.unsqueeze(-1)
            .expand(-1, self.n_group, self.n_routed_experts // self.n_group)
            .reshape(-1, self.n_routed_experts)
        )
        scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0)
        topk_indices = torch.topk(scores_for_choice, k=self.top_k, dim=-1, sorted=True)[1]
        return topk_indices

    def forward(self, router_logits):
        """Return (topk_indices, topk_weights) for router logits of shape (N, 256)."""
        scores = router_logits.sigmoid()
        scores = scores.to(torch.float32)
        topk_indices = self.get_topk_indices(scores)
        # Weights come from the uncorrected sigmoid scores.
        topk_weights = scores.gather(1, topk_indices)
        if self.norm_topk_prob:
            denominator = topk_weights.sum(dim=-1, keepdim=True) + 1e-20
            topk_weights /= denominator
        topk_weights = topk_weights * self.routed_scaling_factor
        return topk_indices, topk_weights
def python_topkrouter(x: np.ndarray,
                      correction_bias: np.ndarray,
                      routed_scaling_factor: float,
                      topk: int):
    """Run the reference router on numpy inputs.

    Returns (values, indices) as numpy arrays, with indices cast to int32.
    """
    logits = torch.from_numpy(x)
    bias = torch.from_numpy(correction_bias)
    router = DeepseekV3TopkRouter(bias, routed_scaling_factor=routed_scaling_factor, topk=topk)
    lable_indices, lable_values = router(logits)
    lable_indices = lable_indices.to(torch.int32)
    return lable_values.numpy(), lable_indices.numpy()
class TopkrouterTestCase(InfiniopTestCase):
    """GGUF test-case writer for the topkrouter op.

    Stores input logits and correction bias, placeholder outputs, routing
    scalars, and the reference answer computed by python_topkrouter.
    """

    def __init__(self,
                 values: np.ndarray,  # output placeholder
                 indices: np.ndarray,  # output placeholder
                 x: np.ndarray,  # input logits
                 correction_bias: np.ndarray,  # input selection bias
                 routed_scaling_factor: float,
                 topk: int,
                 values_shape: List[int] | None,
                 values_strides: List[int] | None,
                 indices_shape: List[int] | None,
                 indices_strides: List[int] | None,
                 x_shape: List[int] | None,
                 x_strides: List[int] | None,
                 correction_bias_shape: List[int] | None,
                 correction_bias_strides: List[int] | None,
                 ):
        super().__init__("topkrouter")
        self.values = values
        self.indices = indices
        self.x = x
        self.correction_bias = correction_bias
        self.routed_scaling_factor = routed_scaling_factor
        self.topk = topk
        self.values_shape = values_shape
        self.values_strides = values_strides
        self.indices_shape = indices_shape
        self.indices_strides = indices_strides
        self.x_shape = x_shape
        self.x_strides = x_strides
        self.correction_bias_shape = correction_bias_shape
        self.correction_bias_strides = correction_bias_strides

    def write_test(self, test_writer: "InfiniopTestWriter"):
        """Serialize shapes, strides, tensors, scalars and the reference answer."""
        super().write_test(test_writer)
        if self.values_shape is not None:
            print("self.values_shape: ", self.values_shape)
            test_writer.add_array(test_writer.gguf_key("values.shape"), self.values_shape)
        if self.indices_shape is not None:
            test_writer.add_array(test_writer.gguf_key("indices.shape"), self.indices_shape)
        if self.x_shape is not None:
            test_writer.add_array(test_writer.gguf_key("x.shape"), self.x_shape)
        if self.correction_bias_shape is not None:
            test_writer.add_array(test_writer.gguf_key("correction_bias.shape"), self.correction_bias_shape)
        if self.x_strides is not None:
            test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.x_strides))
        if self.correction_bias_strides is not None:
            # NOTE(review): key "correction_bias_strides.strides" looks like it
            # should be "correction_bias.strides" — verify against the reader.
            test_writer.add_array(test_writer.gguf_key("correction_bias_strides.strides"), gguf_strides(*self.correction_bias_strides))
        # Outputs fall back to contiguous strides when none were given.
        test_writer.add_array(
            test_writer.gguf_key("values.strides"),
            gguf_strides(*self.values_strides if self.values_strides is not None else contiguous_gguf_strides(self.values_shape))
        )
        test_writer.add_array(
            test_writer.gguf_key("indices.strides"),
            gguf_strides(*self.indices_strides if self.indices_strides is not None else contiguous_gguf_strides(self.indices_shape))
        )
        test_writer.add_tensor(test_writer.gguf_key("values"),
                               self.values,
                               raw_dtype=np_dtype_to_ggml(self.values.dtype))
        test_writer.add_tensor(test_writer.gguf_key("indices"),
                               self.indices,
                               raw_dtype=np_dtype_to_ggml(self.indices.dtype))
        test_writer.add_tensor(test_writer.gguf_key("x"),
                               self.x,
                               raw_dtype=np_dtype_to_ggml(self.x.dtype))
        test_writer.add_tensor(test_writer.gguf_key("correction_bias"),
                               self.correction_bias,
                               raw_dtype=np_dtype_to_ggml(self.correction_bias.dtype))
        test_writer.add_float32(test_writer.gguf_key("routed_scaling_factor"), self.routed_scaling_factor)
        test_writer.add_int32(test_writer.gguf_key("topk"), self.topk)
        # Reference answer from the PyTorch router (copies avoid mutating inputs).
        lable_values, lable_indices = python_topkrouter(self.x.copy(), self.correction_bias.copy(), self.routed_scaling_factor, self.topk)
        test_writer.add_tensor(
            test_writer.gguf_key("lable_values"),
            lable_values,
            raw_dtype=np_dtype_to_ggml(lable_values.dtype),
        )
        test_writer.add_tensor(
            test_writer.gguf_key("lable_indices"),
            lable_indices,
            raw_dtype=np_dtype_to_ggml(lable_indices.dtype)
        )
if __name__ == "__main__":
test_writer = InfiniopTestWriter("topkrouter.gguf")
test_cases = []
_TEST_CASES_ = [
# x_shape, x_strides, correction_bias_shape, correction_bias_stride, routed_scaling_factor, topk
((1, 256), None, (256,), None, 2.5, 8),
((2, 256), None, (256,), None, 1.5, 8),
]
_TENSOR_DTYPES_ = [np.float32, np.float16]
for dtype in _TENSOR_DTYPES_:
for x_shape, x_strides, correction_bias_shape, b_stride, routed_scaling_factor, topk in _TEST_CASES_:
ntoken = x_shape[0]
values_indices_shape = (ntoken, topk)
values = np.empty(tuple(0 for _ in values_indices_shape), dtype=np.float32)
indices = np.empty(tuple(0 for _ in values_indices_shape), dtype=np.int32)
x = np.random.rand(*x_shape).astype(dtype)
correction_bias = np.random.rand(*correction_bias_shape).astype(np.float32)
test_case = TopkrouterTestCase(
values=values,
indices=indices,
x=x,
correction_bias=correction_bias,
routed_scaling_factor=routed_scaling_factor,
topk=topk,
values_shape=list(values_indices_shape),
values_strides=None,
indices_shape=list(values_indices_shape),
indices_strides=None,
x_shape=list(x_shape),
x_strides=None,
correction_bias_shape=list(correction_bias_shape),
correction_bias_strides=None,
)
test_cases.append(test_case)
test_writer.add_tests(test_cases)
test_writer.save()
import numpy as np
from typing import List
import torch
import torch.nn as nn
import torch.nn.functional as F
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
def random_tensor(shape: tuple, dtype: np.dtype) -> np.ndarray:
    """Small uniform noise in roughly [-1e-3, 1e-3)."""
    return 0.001 * np.random.uniform(-1.0, 1.0, shape).astype(dtype)
def torch_Topksoftmax(router_logits, top_k: int, norm_topk_prob: bool):
    """Reference top-k softmax routing.

    Computes a row-wise softmax in fp32, keeps the top_k probabilities per
    row, and (optionally) renormalizes them to sum to 1.

    Returns:
        (weights, expert_indices) — both from torch.topk; weights are fp32.
    """
    probs = F.softmax(router_logits, dim=1, dtype=torch.float)
    top_probs, expert_ids = torch.topk(probs, top_k, dim=-1)
    if norm_topk_prob:
        # Renormalize kept probabilities (the only difference from the
        # Mixtral sparse-MoE block).
        top_probs = top_probs / top_probs.sum(dim=-1, keepdim=True)
    return top_probs.to(torch.float), expert_ids
def python_Topksoftmax(router_logits, top_k: int, norm_topk_prob: bool):
    """NumPy-in / NumPy-out wrapper around the torch reference implementation."""
    logits_tensor = torch.from_numpy(router_logits)
    values, indices = torch_Topksoftmax(logits_tensor, top_k, norm_topk_prob)
    return values.numpy(), indices.numpy()
class TopksoftmaxTestCase(InfiniopTestCase):
    """One topksoftmax test case serialized to GGUF.

    Stores the input logits, zero-sized output placeholders, the topk/norm
    parameters, and reference labels computed with the PyTorch implementation.
    """

    def __init__(
        self,
        values: np.ndarray,  # output placeholder (written by the op under test)
        indices: np.ndarray,  # output placeholder (written by the op under test)
        x: np.ndarray,  # input logits
        topk: int,
        norm: bool,
        values_shape: List[int] | None,
        values_strides: List[int] | None,
        indices_shape: List[int] | None,
        indices_strides: List[int] | None,
        x_shape: List[int] | None,
        x_strides: List[int] | None,
    ):
        super().__init__("topksoftmax")
        self.values = values
        self.indices = indices
        self.x = x
        self.topk = topk
        self.norm = norm
        self.values_shape = values_shape
        self.values_strides = values_strides
        self.indices_shape = indices_shape
        self.indices_strides = indices_strides
        self.x_shape = x_shape
        self.x_strides = x_strides

    def write_test(self, test_writer: "InfiniopTestWriter"):
        """Serialize shapes, strides, tensors, scalars, and reference labels."""
        super().write_test(test_writer)
        # Shapes are optional; output strides are always written (falling back
        # to contiguous strides), while x.strides is only written when given.
        if self.values_shape is not None:
            test_writer.add_array(test_writer.gguf_key("values.shape"), self.values_shape)
        if self.indices_shape is not None:
            test_writer.add_array(test_writer.gguf_key("indices.shape"), self.indices_shape)
        if self.x_shape is not None:
            test_writer.add_array(test_writer.gguf_key("x.shape"), self.x_shape)
        if self.x_strides is not None:
            test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.x_strides))
        test_writer.add_array(
            test_writer.gguf_key("values.strides"),
            gguf_strides(*self.values_strides if self.values_strides is not None else contiguous_gguf_strides(self.values_shape))
        )
        test_writer.add_array(
            test_writer.gguf_key("indices.strides"),
            gguf_strides(*self.indices_strides if self.indices_strides is not None else contiguous_gguf_strides(self.indices_shape))
        )
        test_writer.add_tensor(test_writer.gguf_key("values"),
                               self.values,
                               raw_dtype=np_dtype_to_ggml(self.values.dtype))
        test_writer.add_tensor(test_writer.gguf_key("indices"),
                               self.indices,
                               raw_dtype=np_dtype_to_ggml(self.indices.dtype))
        test_writer.add_tensor(test_writer.gguf_key("x"),
                               self.x,
                               raw_dtype=np_dtype_to_ggml(self.x.dtype))
        test_writer.add_int32(test_writer.gguf_key("topk"), self.topk)
        test_writer.add_bool(test_writer.gguf_key("norm"), self.norm)
        # Reference labels from the torch implementation; x is copied so the
        # stored input tensor is not perturbed.
        # NOTE: the "lable_*" spelling is kept — readers depend on these keys.
        lable_values, lable_indices = python_Topksoftmax(self.x.copy(), self.topk, self.norm)
        test_writer.add_tensor(
            test_writer.gguf_key("lable_values"),
            lable_values,
            raw_dtype=np_dtype_to_ggml(lable_values.dtype),
        )
        test_writer.add_tensor(
            test_writer.gguf_key("lable_indices"),
            lable_indices,
            raw_dtype=np_dtype_to_ggml(lable_indices.dtype)
        )
if __name__ == "__main__":
    # Build topksoftmax.gguf: every (dtype x test-case) combination becomes
    # one serialized TopksoftmaxTestCase.
    test_writer = InfiniopTestWriter("topksoftmax.gguf")
    _TEST_CASES_ = [
        # x_shape, x_strides, topk, norm
        ((1, 32), None, 4, True),
        ((8, 20), None, 8, False),
        ((2, 128), None, 10, True)
    ]
    _TENSOR_DTYPES_ = [np.float32, np.float16]
    test_cases = []
    for dtype in _TENSOR_DTYPES_:
        for x_shape, x_strides, topk, norm in _TEST_CASES_:
            out_shape = (x_shape[0], topk)
            # Outputs are produced by the op under test, so the stored tensors
            # are zero-sized placeholders of matching rank.
            placeholder_values = np.empty((0,) * len(out_shape), dtype=np.float32)
            placeholder_indices = np.empty((0,) * len(out_shape), dtype=np.int32)
            test_cases.append(
                TopksoftmaxTestCase(
                    values=placeholder_values,
                    indices=placeholder_indices,
                    x=np.random.rand(*x_shape).astype(dtype),
                    topk=topk,
                    norm=norm,
                    values_shape=list(out_shape),
                    values_strides=None,
                    indices_shape=list(out_shape),
                    indices_strides=None,
                    x_shape=list(x_shape),
                    x_strides=None
                )
            )
    test_writer.add_tests(test_cases)
    test_writer.save()
......@@ -495,6 +495,66 @@ def conv_(lib):
]
@OpRegister.operator
def sigmoid_(lib):
    # Declare ctypes signatures for the four Sigmoid C entry points so they
    # can be called safely through the shared library handle.
    create_fn = lib.infiniopCreateSigmoidDescriptor
    create_fn.restype = c_int32
    create_fn.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,  # output descriptor
        infiniopTensorDescriptor_t,  # input descriptor
    ]

    workspace_fn = lib.infiniopGetSigmoidWorkspaceSize
    workspace_fn.restype = c_int32
    workspace_fn.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]

    run_fn = lib.infiniopSigmoid
    run_fn.restype = c_int32
    run_fn.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,  # workspace
        c_size_t,  # workspace size
        c_void_p,  # output data
        c_void_p,  # input data
        c_void_p,  # stream
    ]

    destroy_fn = lib.infiniopDestroySigmoidDescriptor
    destroy_fn.restype = c_int32
    destroy_fn.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def topksoftmax_(lib):
    # Declare ctypes signatures for the four Topksoftmax C entry points.
    create_fn = lib.infiniopCreateTopksoftmaxDescriptor
    create_fn.restype = c_int32
    create_fn.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,  # input descriptor
    ]

    workspace_fn = lib.infiniopGetTopksoftmaxWorkspaceSize
    workspace_fn.restype = c_int32
    workspace_fn.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]

    run_fn = lib.infiniopTopksoftmax
    run_fn.restype = c_int32
    run_fn.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,  # workspace
        c_size_t,  # workspace size
        c_void_p,  # values output
        c_void_p,  # indices output
        c_void_p,  # input data
        c_size_t,  # topk
        c_int32,   # norm flag
        c_void_p,  # stream
    ]

    destroy_fn = lib.infiniopDestroyTopksoftmaxDescriptor
    destroy_fn.restype = c_int32
    destroy_fn.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def topkrouter_(lib):
lib.infiniopCreateTopkrouterDescriptor.restype = c_int32
......@@ -502,7 +562,7 @@ def topkrouter_(lib):
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t
]
lib.infiniopGetTopkrouterWorkspaceSize.restype = c_int32
......@@ -510,7 +570,6 @@ def topkrouter_(lib):
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
lib.infiniopTopkrouter.restype = c_int32
lib.infiniopTopkrouter.argtypes = [
infiniopOperatorDescriptor_t,
......@@ -524,7 +583,6 @@ def topkrouter_(lib):
c_size_t,
c_void_p,
]
lib.infiniopDestroyTopkrouterDescriptor.restype = c_int32
lib.infiniopDestroyTopkrouterDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
......
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
# Base test cases: (shape, x_stride, y_stride). None means contiguous layout;
# a 0 stride exercises broadcast layouts.
_TEST_CASES_ = [
    # shape, x_stride, y_stride
    ((13, 4), None, None),
    ((13, 4), (10, 1), (10, 1)),
    ((13, 4), (0, 1), (0, 1)),
    ((13, 4, 4), None, None),
    ((13, 4, 4), (20, 4, 1), (20, 4, 1)),
    ((13, 4, 4), (4, 0, 1), (4, 0, 1)),
    ((16, 5632), None, None),
    ((16, 5632), (13312, 1), (13312, 1)),
    ((4, 4, 5632), None, None),
    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
    ((4, 4, 56320), None, None),
]


class Inplace(Enum):
    # Output y is a separate tensor from input x.
    OUT_OF_PLACE = auto()
    # Output aliases the input (in-place sigmoid).
    INPLACE_X = auto()


# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
    Inplace.OUT_OF_PLACE,
    Inplace.INPLACE_X,
]

# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
_TEST_CASES = [
    test_case + (inplace_item,)
    for test_case in _TEST_CASES_
    for inplace_item in _INPLACE
]

# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]

# Tolerance map for different data types
_TOLERANCE_MAP = {
    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
    InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
}

# Runtime options; overwritten from CLI arguments in __main__.
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def torch_sigmoid(y, x):
    """Reference implementation: fill y with the logistic sigmoid of x."""
    y.copy_(torch.sigmoid(x))
def test(
    handle,
    device,
    shape,
    x_stride=None,
    y_stride=None,
    inplace=Inplace.OUT_OF_PLACE,
    dtype=torch.float16,
    sync=None,
):
    """Run one Sigmoid case: compare the library kernel against torch.sigmoid.

    Builds x (and y, or aliases it for in-place), computes the torch reference
    into y's torch tensor, runs the library op into y's actual buffer, and
    asserts they match within the dtype-specific tolerance.
    """
    x = TestTensor(shape, x_stride, dtype, device)
    if inplace == Inplace.INPLACE_X:
        # In-place only makes sense when both operands share a layout.
        if x_stride != y_stride:
            return
        y = x
    else:
        y = TestTensor(shape, y_stride, dtype, device, mode="ones")
    # Broadcast (0-stride) outputs are not writable targets; skip them.
    if y.is_broadcast():
        return
    print(
        f"Testing Sigmoid on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} "
        f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
    )
    # Compute the torch reference first (before the library writes y's buffer).
    torch_sigmoid(y.torch_tensor(), x.torch_tensor())
    if sync is not None:
        sync()
    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateSigmoidDescriptor(
            handle,
            ctypes.byref(descriptor),
            y.descriptor,
            x.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    for tensor in [x, y]:
        tensor.destroy_desc()
    workspace_size = c_uint64(0)
    check_error(
        LIBINFINIOP.infiniopGetSigmoidWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = TestWorkspace(workspace_size.value, y.device)

    def lib_sigmoid():
        # Launch the library kernel: y <- sigmoid(x) on the default stream.
        check_error(
            LIBINFINIOP.infiniopSigmoid(
                descriptor,
                workspace.data(),
                workspace.size(),
                y.data(),
                x.data(),
                None,
            )
        )

    lib_sigmoid()
    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
    assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: torch_sigmoid(y.torch_tensor(), x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation(" lib", lambda: lib_sigmoid(), device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on
    check_error(LIBINFINIOP.infiniopDestroySigmoidDescriptor(descriptor))
if __name__ == "__main__":
    args = get_args()
    # Pull runtime options from the CLI into the module-level flags.
    DEBUG, PROFILE = args.debug, args.profile
    NUM_PRERUN, NUM_ITERATIONS = args.num_prerun, args.num_iterations
    for device in get_test_devices(args):
        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
    print("\033[92m Test passed! \033[0m")
......@@ -27,14 +27,13 @@ from libinfiniop import (
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
# x_shape, x_stride, select_experts
((1, 256), None, 8),
((3, 256), None, 8),
# x_shape, x_stride, topk, routed_scaling_factor
((1, 256), None, 8, 2.5),
]
# w (weight) types
# Note: 'None' means the same as input dtype
_X_DTYPES = [InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.F16] #
_X_DTYPES = [] # [InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.F16]
# x types used for testing
_VALUE_DTYPES = [InfiniDtype.F32]
......@@ -46,6 +45,8 @@ _TEST_CASES = [
# Tolerance map for different data types
_TOLERANCE_MAP = {
InfiniDtype.F32: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
}
DEBUG = False
......@@ -59,12 +60,13 @@ def tensorInfo(data):
class DeepseekV3TopkRouter(nn.Module):
def __init__(self, correction_bias, config=None):
def __init__(self, correction_bias, routed_scaling_factor: float = 2.5, topk: int = 8, config=None):
super().__init__()
self.config = config
self.top_k = 8 # config.num_experts_per_tok
self.top_k = topk # config.num_experts_per_tok 8
assert topk == 8
self.n_routed_experts = 256 # config.n_routed_experts
self.routed_scaling_factor = 2.5 # config.routed_scaling_factor
self.routed_scaling_factor = routed_scaling_factor # config.routed_scaling_factor 2.5
self.n_group = 8 # config.n_group
self.topk_group = 4 # config.topk_group
self.norm_topk_prob = True # config.norm_topk_prob
......@@ -73,7 +75,7 @@ class DeepseekV3TopkRouter(nn.Module):
# self.weight = torch.rand(256, 7168) * 2 - 1
# self.register_buffer("e_score_correction_bias", torch.zeros(self.n_routed_experts))
self.e_score_correction_bias = torch.zeros(256, device="cuda")
self.e_score_correction_bias = torch.zeros(256, device=correction_bias.device)
self.e_score_correction_bias[:] = correction_bias[:]
@torch.no_grad()
......@@ -114,11 +116,12 @@ class DeepseekV3TopkRouter(nn.Module):
denominator = topk_weights.sum(dim=-1, keepdim=True) + 1e-20
topk_weights /= denominator
topk_weights = topk_weights * self.routed_scaling_factor
return topk_indices, topk_weights
def torch_topkrouter(router_logits, correction_bias):
lable_indices, lable_values = DeepseekV3TopkRouter(correction_bias)(router_logits)
def torch_topkrouter(router_logits, correction_bias, routed_scaling_factor, topk):
lable_indices, lable_values = DeepseekV3TopkRouter(correction_bias, routed_scaling_factor, topk)(router_logits)
lable_indices = lable_indices.to(torch.int32)
return lable_values, lable_indices
......@@ -129,12 +132,13 @@ def test(
x_shape,
x_stride,
topk,
routed_scaling_factor,
x_dtype=InfiniDtype.F32,
dtype=InfiniDtype.F16,
sync=None,
):
print(
f"Testing topkrouter on {InfiniDeviceNames[device]} with x_shape:{x_shape}"
f"Testing topkrouter on {InfiniDeviceNames[device]} with x_shape:{x_shape} "
f"x_stride:{x_stride} w_dtype:{InfiniDtypeNames[x_dtype]} dtype:{InfiniDtypeNames[dtype]}"
)
......@@ -182,27 +186,27 @@ def test(
indices.data_ptr(),
x.data(),
correction_bias.data(),
2.5,
routed_scaling_factor,
topk,
None,
)
)
lib_topkrouter()
lable_values, lable_indices = torch_topkrouter(x.actual_tensor(), correction_bias.actual_tensor())
lable_values, lable_indices = torch_topkrouter(x.actual_tensor(), correction_bias.actual_tensor(), routed_scaling_factor, topk)
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(lable_values, values, atol=atol, rtol=rtol)
debug(lable_indices, indices, atol=atol, rtol=rtol)
assert torch.allclose(lable_values, values, atol=atol, rtol=rtol)
assert torch.allclose(lable_indices, lable_indices, atol=atol, rtol=rtol)
assert torch.allclose(lable_indices, indices, atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: torch_topkrouter(x.actual_tensor().clone(), tokp), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: torch_topkrouter(x.actual_tensor(), correction_bias.actual_tensor(), routed_scaling_factor, topk), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_topkrouter(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(LIBINFINIOP.infiniopDestroyTopkrouterDescriptor(descriptor))
......
import torch
import ctypes
from ctypes import c_uint64
import torch.nn.functional as F
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
torch_device_map
)
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
# Base test cases: (x_shape, x_stride, topk, norm).
_TEST_CASES_ = [
    # x_shape, x_stride, topk, norm
    ((1, 10), None, 7, True),
    ((2, 20), None, 4, True),
    ((1, 128), None, 10, True),
]

# w (weight) types
# Note: 'None' means the same as input dtype
_X_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16]  #

# x types used for testing
_VALUE_DTYPES = [InfiniDtype.F32]

# Form the test cases by appending each element of _X_DTYPES to each tuple in _TEST_CASES_
_TEST_CASES = [
    test_case + (x_dtype,) for test_case in _TEST_CASES_ for x_dtype in _X_DTYPES
]

# Tolerance map for different data types
_TOLERANCE_MAP = {
    InfiniDtype.F32: {"atol": 1e-3, "rtol": 1e-3},
    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
    InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3},
}

# Runtime options; overwritten from CLI arguments in __main__.
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def tensorInfo(data):
    """Debug helper: print layout/identity facts about a torch tensor."""
    ptr = data.data_ptr()
    print("data: ", data.is_contiguous(), data.device, data.dtype, data.shape, data.stride(), ptr, hex(ptr))
def torch_topksoftmax(router_logits, top_k, norm_topk_prob=False):
    """Reference top-k softmax: row-wise fp32 softmax, then keep the top_k.

    When norm_topk_prob is True the kept probabilities are renormalized to
    sum to 1 per row (the only difference from the Mixtral sparse-MoE block).
    Returns (weights, expert_indices).
    """
    probs = F.softmax(router_logits, dim=1, dtype=torch.float)
    top_probs, expert_ids = torch.topk(probs, top_k, dim=-1)
    if norm_topk_prob:
        top_probs = top_probs / top_probs.sum(dim=-1, keepdim=True)
    return top_probs, expert_ids
def test(
    handle,
    device,
    x_shape,
    x_stride,
    topk,
    norm_topk_prob,
    x_dtype=InfiniDtype.F32,
    dtype=InfiniDtype.F16,
    sync=None,
):
    """Run one Topksoftmax case: compare the library kernel against torch.

    Builds a deterministic input (arange reshaped to x_shape), runs the
    library op into freshly-zeroed values/indices buffers, and asserts they
    match the torch reference within tolerance.
    """
    print(
        f"Testing topksoftmax on {InfiniDeviceNames[device]} with x_shape:{x_shape}"
        f"x_stride:{x_stride} w_dtype:{InfiniDtypeNames[x_dtype]} dtype:{InfiniDtypeNames[dtype]}"
    )

    # Deterministic input: 0..N*width-1 reshaped; TestTensor copies it in
    # "manual" mode (scale presumably applied by TestTensor — TODO confirm).
    data = torch.arange(0, x_shape[0] * x_shape[1]).reshape(x_shape)
    N, width = x_shape
    x = TestTensor(x_shape, data.stride(), x_dtype, device, scale=0.5, mode="manual", set_tensor=data)
    # print(x.torch_tensor())

    if sync is not None:
        sync()
    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateTopksoftmaxDescriptor(
            handle,
            ctypes.byref(descriptor),
            x.descriptor
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    for tensor in [x]:
        tensor.destroy_desc()
    workspace_size = c_uint64(0)
    check_error(
        LIBINFINIOP.infiniopGetTopksoftmaxWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = TestWorkspace(workspace_size.value, x.device)
    # Output buffers on the same device as x; written by the library kernel.
    values = torch.zeros((N, topk), dtype=torch.float32, device=torch_device_map[x.device])
    indices = torch.zeros((N, topk), dtype=torch.int32, device=torch_device_map[x.device])

    def lib_topksoftmax():
        # Launch the library kernel: (values, indices) <- topksoftmax(x).
        check_error(
            LIBINFINIOP.infiniopTopksoftmax(
                descriptor,
                workspace.data(),
                workspace_size.value,
                values.data_ptr(),
                indices.data_ptr(),
                x.data(),
                topk,
                norm_topk_prob,
                None,
            )
        )

    # Reference computed on a clone so the kernel input is untouched.
    lable_values, lable_indices = torch_topksoftmax(x.torch_tensor().clone(), topk, norm_topk_prob=norm_topk_prob)
    lable_indices = lable_indices.to(dtype=torch.int32)
    lib_topksoftmax()
    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(lable_values, values, atol=atol, rtol=rtol)
        debug(lable_indices, indices, atol=atol, rtol=rtol)
    assert torch.allclose(lable_values, values, atol=atol, rtol=rtol)
    assert torch.allclose(lable_indices, indices, atol=atol, rtol=rtol)
    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: torch_topksoftmax(x.actual_tensor().clone(), topk), device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation(" lib", lambda: lib_topksoftmax(), device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on
    check_error(LIBINFINIOP.infiniopDestroyTopksoftmaxDescriptor(descriptor))
if __name__ == "__main__":
    args = get_args()
    # Pull runtime options from the CLI into the module-level flags.
    DEBUG, PROFILE = args.debug, args.profile
    NUM_PRERUN, NUM_ITERATIONS = args.num_prerun, args.num_iterations
    # Execute tests
    for device in get_test_devices(args):
        test_operator(device, test, _TEST_CASES, _VALUE_DTYPES)
    print("\033[92mTest passed!\033[0m")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment