Commit 81fe2ba3 authored by PanZezhong

init

---
BasedOnStyle: LLVM
IndentWidth: 4 # Indent width; the LLVM default is 2, changed to 4
AccessModifierOffset: -4 # Offset of public/protected/private relative to members, paired with IndentWidth; the LLVM default is -2
AlignOperands: AlignAfterOperator # Alignment of binary-operator operands across lines; the LLVM default is Align, changed so the operator wraps together with its operand
BreakBeforeBinaryOperators: All # Break before binary operators; the LLVM default is None, changed so a wrapped binary operator, including assignment (=), always starts the new line
ColumnLimit: 0 # Column limit; the LLVM default is 80, changed to unlimited
AllowShortBlocksOnASingleLine: Always # Whether short blocks (single-statement blocks) may stay on one line; the LLVM default is Never, changed to allow
AllowShortLoopsOnASingleLine: true # Whether short loops may stay on one line; the LLVM default is false, changed to allow
InsertBraces: true # Whether to insert braces around if/for/while/switch bodies; the LLVM default is false, changed to enable
BreakBeforeBraces: Custom # Brace wrapping style; the LLVM default is LLVM, changed to Custom so that BraceWrapping takes effect
BraceWrapping:
AfterCaseLabel: false
AfterClass: false
AfterControlStatement: Never
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
BeforeLambdaBody: false
BeforeWhile: false
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
# Xmake cache
.xmake/
build/
# MacOS Cache
.DS_Store
# Vscode
.vscode/
# Python
__pycache__/
# Log
*.log
# Cache
cache/
# JSON
*.json
# GGUF
*.gguf
#ifndef INFINICORE_INFER_H
#define INFINICORE_INFER_H
#include "infinicore_infer/models/jiuge.h"
#endif /* INFINICORE_INFER_H */
#ifndef MODEL_JIUGE_H
#define MODEL_JIUGE_H
#include <infiniccl.h>
#include <infiniop.h>
#include <infinirt.h>
#include <stdint.h>
struct JiugeModel;
typedef struct
{
infiniDtype_t dt_logits, dt_norm, dt_mat;
size_t nlayer, d, nh, nkvh, dh, di, dctx, dvoc;
float epsilon, theta;
uint32_t end_token;
} JiugeMeta;
typedef struct
{
size_t nlayer;
// [dvoc, d]
const void *input_embd;
// [d]
const void *output_norm;
// [dvoc, d]
const void *output_embd;
// nlayer * [d]
const void *const *attn_norm;
// nlayer * [ndev, (nh + 2 * nkvh) / ndev * dh, d]
const void *const *attn_qkv;
// nlayer * [ndev, (nh + 2 * nkvh) / ndev * dh]
const void *const *attn_qkv_b;
// nlayer * [ndev, d, nh / ndev * dh]
const void *const *attn_o;
// nlayer * [d]
const void *const *ffn_norm;
// nlayer * [ndev, 2 * di / ndev, d]
const void *const *ffn_gate_up;
// nlayer * [ndev, d, di / ndev]
const void *const *ffn_down;
} JiugeWeights;
//////////////////// APIs ///////////////////////
/// @brief Create a model instance
/// @param device Type of accelerator (co-processor)
/// @param ndev Number of accelerators
/// @param dev_ids Accelerator IDs, an array of length ndev
__C __export struct JiugeModel *
createJiugeModel(const JiugeMeta *,
const JiugeWeights *,
infiniDevice_t device,
int ndev,
const int *dev_ids);
/// @brief Destroy a model instance
__C __export void
destroyJiugeModel(struct JiugeModel *);
/// @brief Create a KV cache
__C __export struct KVCache *
createKVCache(const struct JiugeModel *);
/// @brief Duplicate the first seq_len positions of a KV cache
__C __export struct KVCache *
duplicateKVCache(const struct JiugeModel *,
const struct KVCache *, uint32_t seq_len);
/// @brief Destroy a KV cache
__C __export void
dropKVCache(const struct JiugeModel *,
struct KVCache *);
/// @brief Text generation
/// @param tokens Input tokens
/// @param ntok Number of input tokens
/// @param req_pos Starting position of the request
/// @param output Output token buffer
/// @param max_step Maximum number of output tokens
/// @param temperature Sampling temperature (0. means greedy sampling)
/// @param topk Sampling top-k (1 means greedy sampling)
/// @param topp Sampling top-p
__C __export void
generate(struct JiugeModel *,
struct KVCache *,
const uint32_t *tokens, uint32_t ntok, uint32_t req_pos,
uint32_t *output, uint32_t max_step,
float temperature, uint32_t topk, float topp);
/// @brief Run one round of batched inference
/// @param tokens Input token buffer
/// @param ntok Number of input tokens
/// @param nreq Number of requests
/// @param req_lens Number of tokens in each request
/// @param req_pos Starting position of each request
/// @param kv_caches KV cache of each request
/// @param output Output token array, one token per request, length at least nreq
/// @param temperature Sampling temperature (0. means greedy sampling)
/// @param topk Sampling top-k (1 means greedy sampling)
/// @param topp Sampling top-p
__C __export void
inferBatch(struct JiugeModel *,
const uint32_t *tokens, uint32_t ntok,
const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
struct KVCache **kv_caches,
uint32_t *output,
float temperature, uint32_t topk, float topp);
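// Usage sketch (illustrative only; meta and weights must be populated from a real
// checkpoint first, and the single CPU device below is just an example):
//   int dev_ids[] = {0};
//   struct JiugeModel *model = createJiugeModel(&meta, &weights, INFINI_DEVICE_CPU, 1, dev_ids);
//   struct KVCache *cache = createKVCache(model);
//   uint32_t tokens[] = {1, 2, 3}, req_lens[] = {3}, req_pos[] = {0}, out[1];
//   inferBatch(model, tokens, 3, req_lens, 1, req_pos, &cache, out, 1.0f, 1, 1.0f);
//   dropKVCache(model, cache);
//   destroyJiugeModel(model);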
#endif
import ctypes
from ctypes import c_size_t, c_uint, c_int, c_float, c_void_p, POINTER
import os
class DataType(ctypes.c_int):
INFINI_DTYPE_INVALID = 0
INFINI_DTYPE_BYTE = 1
INFINI_DTYPE_BOOL = 2
INFINI_DTYPE_I8 = 3
INFINI_DTYPE_I16 = 4
INFINI_DTYPE_I32 = 5
INFINI_DTYPE_I64 = 6
INFINI_DTYPE_U8 = 7
INFINI_DTYPE_U16 = 8
INFINI_DTYPE_U32 = 9
INFINI_DTYPE_U64 = 10
INFINI_DTYPE_F8 = 11
INFINI_DTYPE_F16 = 12
INFINI_DTYPE_F32 = 13
INFINI_DTYPE_F64 = 14
INFINI_DTYPE_C16 = 15
INFINI_DTYPE_C32 = 16
INFINI_DTYPE_C64 = 17
INFINI_DTYPE_C128 = 18
INFINI_DTYPE_BF16 = 19
class DeviceType(ctypes.c_int):
DEVICE_TYPE_CPU = 0
DEVICE_TYPE_CUDA = 1
DEVICE_TYPE_CAMBRICON = 2
DEVICE_TYPE_ASCEND = 3
DEVICE_TYPE_METAX = 4
DEVICE_TYPE_MOORE = 5
class JiugeMeta(ctypes.Structure):
_fields_ = [
("dt_logits", DataType),
("dt_norm", DataType),
("dt_mat", DataType),
("nlayer", c_uint),
("d", c_uint),
("nh", c_uint),
("nkvh", c_uint),
("dh", c_uint),
("di", c_uint),
("dctx", c_uint),
("dvoc", c_uint),
("epsilon", c_float),
("theta", c_float),
("end_token", c_uint),
]
# Define the JiugeWeights struct
class JiugeWeights(ctypes.Structure):
_fields_ = [
("nlayer", c_uint),
("input_embd", c_void_p),
("output_norm", c_void_p),
("output_embd", c_void_p),
("attn_norm", POINTER(c_void_p)),
("attn_qkv", POINTER(c_void_p)),
("attn_qkv_b", POINTER(c_void_p)),
("attn_o", POINTER(c_void_p)),
("ffn_norm", POINTER(c_void_p)),
("ffn_gate_up", POINTER(c_void_p)),
("ffn_down", POINTER(c_void_p)),
]
class JiugeModel(ctypes.Structure):
pass
class KVCache(ctypes.Structure):
pass
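# Load libinfinicore_infer.so from $INFINI_ROOT/lib and declare ctypes signatures
# for the exported C API so the structs above can be passed by pointer.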
def open_library():
lib_path = os.path.join(
os.environ.get("INFINI_ROOT"), "lib", "libinfinicore_infer.so"
)
lib = ctypes.CDLL(lib_path)
lib.createJiugeModel.restype = POINTER(JiugeModel)
lib.createJiugeModel.argtypes = [
POINTER(JiugeMeta), # JiugeMeta const *
POINTER(JiugeWeights), # JiugeWeights const *
DeviceType, # DeviceType
c_int, # int ndev
POINTER(c_int), # int const *dev_ids
]
lib.createKVCache.restype = POINTER(KVCache)
lib.createKVCache.argtypes = [ctypes.POINTER(JiugeModel)]
lib.dropKVCache.argtypes = [ctypes.POINTER(JiugeModel), POINTER(KVCache)]
lib.inferBatch.restype = None
lib.inferBatch.argtypes = [
ctypes.POINTER(JiugeModel), # struct JiugeModel const *
POINTER(c_uint), # unsigned int const *tokens
c_uint, # unsigned int ntok
POINTER(c_uint), # unsigned int const *req_lens
c_uint, # unsigned int nreq
POINTER(c_uint), # unsigned int const *req_pos
POINTER(POINTER(KVCache)), # struct KVCache **kv_caches
POINTER(c_uint), # unsigned int *output
c_float, # float temperature
c_uint, # unsigned int topk
c_float, # float topp
]
return lib
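# Illustrative call sequence (hypothetical values; a real caller must first fill
# JiugeMeta and JiugeWeights from an actual checkpoint):
#   lib = open_library()
#   model = lib.createJiugeModel(ctypes.byref(meta), ctypes.byref(weights),
#                                DeviceType.DEVICE_TYPE_CPU, 1, (c_int * 1)(0))
#   kv_cache = lib.createKVCache(model)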
#include "jiuge_impl.hpp"
#include "jiuge_weight.hpp"
#include "../../tensor.hpp"
#include "../../utils.hpp"
#include "infinicore_infer.h"
#include <random>
#include <thread>
#include <vector>
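// Create per-device resources: an infiniop handle, a compute stream, and this
// device's shard of every layer's weights (loaded via the getters in jiuge_weight.hpp).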
void createDeviceResource(DeviceResource *rsrc, const JiugeMeta *meta,
const JiugeWeights *weights,
infiniDevice_t device, int idev,
int ndev, int dev_id,
infinicclComm_t comm) {
RUN_INFINI(infinirtSetDevice(device, dev_id));
infiniopHandle_t handle;
infiniopCreateHandle(&handle);
infinirtStream_t stream;
infinirtStreamCreate(&stream);
std::vector<std::shared_ptr<Tensor>> w_attn_norm, w_attn_qkv, b_attn_qkv, w_attn_out,
w_ffn_norm, w_ffn_gate_up, w_ffn_down;
for (size_t layer = 0; layer < meta->nlayer; layer++) {
w_attn_norm.push_back(
get_attn_norm(meta, weights, layer));
w_attn_qkv.push_back(
get_attn_qkv(meta, weights, layer, idev, ndev));
if (weights->attn_qkv_b != nullptr) {
b_attn_qkv.push_back(
get_attn_qkv_bias(meta, weights, layer, idev, ndev));
}
w_attn_out.push_back(
get_attn_o(meta, weights, layer, idev, ndev));
w_ffn_norm.push_back(
get_ffn_norm(meta, weights, layer));
w_ffn_gate_up.push_back(
get_ffn_gate_up(meta, weights, layer, idev, ndev));
w_ffn_down.push_back(
get_ffn_down(meta, weights, layer, idev, ndev));
}
*rsrc = DeviceResource{device,
dev_id,
handle,
get_in_embd(meta, weights),
get_out_norm(meta, weights),
get_out_embd(meta, weights),
get_sin_table(meta),
get_cos_table(meta),
w_attn_norm,
w_attn_qkv,
b_attn_qkv,
w_attn_out,
w_ffn_norm,
w_ffn_gate_up,
w_ffn_down,
stream,
comm};
}
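// Run one batched forward pass on this device's shard: gather prompt embeddings,
// apply nlayer attention + FFN blocks (all-reducing partial results across devices
// when distributed), then on device 0 sample one output token per request into ans.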
void inferDeviceBatch(const JiugeMeta &meta, const DeviceResource &rsrc,
uint32_t idev, uint32_t ndev,
const uint32_t *tokens, uint32_t ntok,
const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
struct KVCache **kv_caches,
uint32_t *ans,
float temperature, uint32_t topk, float topp) {
auto nlayer = meta.nlayer;
auto nkvh = meta.nkvh / ndev;
auto nh = meta.nh / ndev;
// auto dctx = meta.dctx;
auto dh = meta.dh;
auto d = meta.d;
auto dt_logits = meta.dt_logits;
auto di = meta.di / ndev;
auto dvoc = meta.dvoc;
auto stream = rsrc.stream;
// Allocate buffers
auto logits_in = Tensor::buffer(dt_logits, {ntok, d}, stream);
auto logits_out = Tensor::buffer(dt_logits, {ntok, d}, stream);
auto qkv_buf = Tensor::buffer(dt_logits, {ntok, (nh + nkvh * 2) * dh}, stream);
auto gate_up_buf = Tensor::buffer(dt_logits, {ntok, 2 * di}, stream);
auto o_buf = Tensor::buffer(dt_logits, {ntok, nh * dh}, stream);
auto prob_buf = Tensor::buffer(dt_logits, {nreq, dvoc}, stream);
auto result_buf = Tensor::buffer(INFINI_DTYPE_U32, {nreq}, stream);
auto result_cpu = std::vector<uint32_t>(nreq);
// Prepare inputs
auto batch_pos_ids = std::vector<uint32_t>(ntok);
size_t req_start = 0;
for (uint32_t req = 0; req < nreq; req++) {
for (uint32_t i = 0; i < req_lens[req]; i++) {
batch_pos_ids[req_start + i] = req_pos[req] + i;
}
req_start += req_lens[req];
}
std::shared_ptr<Tensor> pos_ids_buf;
if (rsrc.device == INFINI_DEVICE_CPU) {
pos_ids_buf = Tensor::weight(batch_pos_ids.data(), INFINI_DTYPE_U32, {ntok});
} else {
pos_ids_buf = Tensor::buffer(INFINI_DTYPE_U32, {ntok}, stream);
RUN_INFINI(infinirtMemcpyAsync(pos_ids_buf->data(), batch_pos_ids.data(), sizeof(uint32_t) * ntok,
INFINIRT_MEMCPY_H2D, stream));
}
for (uint32_t i = 0; i < ntok; i++) {
RUN_INFINI(infinirtMemcpyAsync(logits_in->data(i * d),
rsrc.w_in_embd->data(tokens[i] * d),
dsize(dt_logits) * d, INFINIRT_MEMCPY_D2D, stream));
}
// Prepare operators and workspace
void *workspace;
size_t workspace_size = 0, temp_size = 0;
// attn & mlp rmsnorm
infiniopRMSNormDescriptor_t desc_norm;
RUN_INFINI(infiniopCreateRMSNormDescriptor(
rsrc.handle, &desc_norm, logits_in->desc()->get(),
logits_out->desc()->get(), rsrc.w_attn_norm[0]->desc()->get(),
meta.epsilon));
RUN_INFINI(infiniopGetRMSNormWorkspaceSize(desc_norm, &workspace_size));
workspace_size = std::max(workspace_size, temp_size);
// Attention
infiniopGemmDescriptor_t desc_attn_qkv, desc_attn_o;
RUN_INFINI(infiniopCreateGemmDescriptor(
rsrc.handle, &desc_attn_qkv, qkv_buf->desc()->get(),
logits_in->desc()->get(), rsrc.w_attn_qkv[0]->desc()->get()));
RUN_INFINI(infiniopCreateGemmDescriptor(
rsrc.handle, &desc_attn_o, logits_in->desc()->get(),
o_buf->desc()->get(), rsrc.w_attn_out[0]->desc()->get()));
RUN_INFINI(infiniopGetGemmWorkspaceSize(desc_attn_qkv, &temp_size));
workspace_size = std::max(workspace_size, temp_size);
RUN_INFINI(infiniopGetGemmWorkspaceSize(desc_attn_o, &temp_size));
workspace_size = std::max(workspace_size, temp_size);
infiniopRoPEDescriptor_t desc_rope_q, desc_rope_k;
qkv_buf->dim_split(1, {nh + nkvh * 2, dh}); // (ntok, nh + 2 * nkvh, dh)
auto qkv_buf_q = qkv_buf->slice(1, 0, nh);
auto qkv_buf_k = qkv_buf->slice(1, nh, nkvh);
RUN_INFINI(infiniopCreateRoPEDescriptor(
rsrc.handle, &desc_rope_q, qkv_buf_q->desc()->get(), qkv_buf_q->desc()->get(),
pos_ids_buf->desc()->get(), rsrc.sin_table->desc()->get(),
rsrc.cos_table->desc()->get()));
RUN_INFINI(infiniopGetRoPEWorkspaceSize(desc_rope_q, &temp_size));
workspace_size = std::max(workspace_size, temp_size);
RUN_INFINI(infiniopCreateRoPEDescriptor(
rsrc.handle, &desc_rope_k, qkv_buf_k->desc()->get(), qkv_buf_k->desc()->get(),
pos_ids_buf->desc()->get(), rsrc.sin_table->desc()->get(),
rsrc.cos_table->desc()->get()));
RUN_INFINI(infiniopGetRoPEWorkspaceSize(desc_rope_k, &temp_size));
workspace_size = std::max(workspace_size, temp_size);
// attention inner
auto desc_attns = std::vector<infiniopAttentionDescriptor_t>(nreq);
size_t token_offset = 0;
o_buf->dim_split(1, {nh, dh});
for (uint32_t req = 0; req < nreq; req++) {
auto past_len = req_pos[req];
auto seq_len = req_lens[req];
auto o = o_buf->slice({{0, token_offset, seq_len}});
auto q = qkv_buf->slice({{0, token_offset, seq_len}, {1, 0, nh}})
->permute({1, 0, 2});
auto k = qkv_buf->slice({{0, token_offset, seq_len}, {1, nh, nkvh}})
->permute({1, 0, 2});
auto v = qkv_buf->slice({{0, token_offset, seq_len}, {1, nh + nkvh, nkvh}})
->permute({1, 0, 2});
auto k_cache = kv_caches[req]->k[idev][0];
auto v_cache = kv_caches[req]->v[idev][0];
RUN_INFINI(infiniopCreateAttentionDescriptor(
rsrc.handle, &desc_attns[req], o->desc()->get(), q->desc()->get(),
k->desc()->get(), v->desc()->get(), k_cache->desc()->get(),
v_cache->desc()->get(), past_len));
RUN_INFINI(
infiniopGetAttentionWorkspaceSize(desc_attns[req], &temp_size));
workspace_size = std::max(workspace_size, temp_size);
token_offset += seq_len;
}
// MLP descriptors
infiniopGemmDescriptor_t desc_ffn_gate_up, desc_ffn_down;
infiniopSwiGLUDescriptor_t desc_swiglu;
RUN_INFINI(infiniopCreateGemmDescriptor(
rsrc.handle, &desc_ffn_gate_up, gate_up_buf->desc()->get(),
logits_out->desc()->get(), rsrc.w_ffn_gate_up[0]->desc()->get()));
RUN_INFINI(infiniopGetGemmWorkspaceSize(desc_ffn_gate_up, &temp_size));
workspace_size = std::max(workspace_size, temp_size);
auto gate_buf = gate_up_buf->slice(1, 0, di);
auto up_buf = gate_up_buf->slice(1, di, di);
// SwiGLU writes its [ntok, di] result into gate_buf in place so it can feed ffn_down
RUN_INFINI(infiniopCreateSwiGLUDescriptor(
rsrc.handle, &desc_swiglu, gate_buf->desc()->get(), up_buf->desc()->get(), gate_buf->desc()->get()));
RUN_INFINI(infiniopGetSwiGLUWorkspaceSize(desc_swiglu, &temp_size));
workspace_size = std::max(workspace_size, temp_size);
RUN_INFINI(infiniopCreateGemmDescriptor(
rsrc.handle, &desc_ffn_down, logits_in->desc()->get(),
gate_buf->desc()->get(), rsrc.w_ffn_down[0]->desc()->get()));
RUN_INFINI(infiniopGetGemmWorkspaceSize(desc_ffn_down, &temp_size));
workspace_size = std::max(workspace_size, temp_size);
// Output and sample
infiniopRMSNormDescriptor_t desc_norm_out;
RUN_INFINI(infiniopCreateRMSNormDescriptor(
rsrc.handle, &desc_norm_out, logits_out->slice(0, 0, 1)->desc()->get(),
logits_out->slice(0, 0, 1)->desc()->get(),
rsrc.w_out_norm->desc()->get(), meta.epsilon));
RUN_INFINI(infiniopGetRMSNormWorkspaceSize(desc_norm_out, &temp_size));
workspace_size = std::max(workspace_size, temp_size);
infiniopGemmDescriptor_t desc_out_embd;
RUN_INFINI(infiniopCreateGemmDescriptor(
rsrc.handle, &desc_out_embd, prob_buf->desc()->get(),
logits_out->slice(0, 0, nreq)->desc()->get(),
rsrc.w_out_embd->desc()->get()));
RUN_INFINI(infiniopGetGemmWorkspaceSize(desc_out_embd, &temp_size));
workspace_size = std::max(workspace_size, temp_size);
infiniopRandomSampleDescriptor_t desc_sample;
RUN_INFINI(infiniopCreateRandomSampleDescriptor(
rsrc.handle, &desc_sample,
TensorDesc::create(INFINI_DTYPE_U64, {1}, {1})->get(),
TensorDesc::create(dt_logits, {dvoc}, {1})->get()));
RUN_INFINI(infiniopGetRandomSampleWorkspaceSize(desc_sample, &temp_size));
workspace_size = std::max(workspace_size, temp_size);
// Allocate workspace
RUN_INFINI(infinirtMallocAsync(&workspace, workspace_size, stream));
for (uint32_t layer = 0; layer < nlayer; layer++) {
// 1. Attention
// rms norm
RUN_INFINI(infiniopRMSNorm(
desc_norm, workspace, workspace_size,
logits_out->data(), logits_in->data(),
rsrc.w_attn_norm[layer]->data(), stream));
// qkv_proj
RUN_INFINI(infiniopGemm(
desc_attn_qkv, workspace, workspace_size,
qkv_buf->data(), logits_out->data(),
rsrc.w_attn_qkv[layer]->data(), 1.0, 0.0, stream));
// rope
RUN_INFINI(infiniopRoPE(
desc_rope_q, workspace, workspace_size,
qkv_buf->data(), qkv_buf->data(),
pos_ids_buf->data(),
rsrc.sin_table->data(),
rsrc.cos_table->data(), stream));
RUN_INFINI(infiniopRoPE(
desc_rope_k, workspace, workspace_size,
qkv_buf->data(nh * dh), qkv_buf->data(nh * dh),
pos_ids_buf->data(),
rsrc.sin_table->data(),
rsrc.cos_table->data(),
stream));
size_t token_offset = 0;
for (uint32_t req = 0; req < nreq; req++) {
auto seq_len = req_lens[req];
// self attention
RUN_INFINI(infiniopAttention(
desc_attns[req], workspace, workspace_size,
o_buf->data(token_offset * nh * dh),
qkv_buf->data(token_offset * (nh + nkvh * 2) * dh),
qkv_buf->data(token_offset * (nh + nkvh * 2) * dh + nh * dh),
qkv_buf->data(token_offset * (nh + nkvh * 2) * dh + (nh + nkvh) * dh),
kv_caches[req]->k[idev][layer]->data(),
kv_caches[req]->v[idev][layer]->data(),
stream));
token_offset += seq_len;
}
// o_proj
RUN_INFINI(infiniopGemm(
desc_attn_o, workspace, workspace_size,
logits_in->data(), o_buf->data(),
rsrc.w_attn_out[layer]->data(), 1.0, idev == 0 ? 1.0 : 0.0, stream)); // only rank 0 adds residual
// All_reduce if distributed
if (rsrc.comm != nullptr) {
RUN_INFINI(infinicclAllReduce(
logits_in->data(), logits_in->data(), ntok * d, dt_logits,
INFINICCL_SUM, rsrc.comm, stream));
}
// 2. FFN
// rms_norm
RUN_INFINI(infiniopRMSNorm(
desc_norm, workspace, workspace_size,
logits_out->data(), logits_in->data(),
rsrc.w_ffn_norm[layer]->data(), stream));
// mlp
RUN_INFINI(infiniopGemm(
desc_ffn_gate_up, workspace, workspace_size,
gate_up_buf->data(), logits_out->data(), rsrc.w_ffn_gate_up[layer]->data(),
1.0, 0.0, stream));
RUN_INFINI(infiniopSwiGLU(
desc_swiglu, workspace, workspace_size,
gate_buf->data(), up_buf->data(), gate_buf->data(), stream));
RUN_INFINI(infiniopGemm(
desc_ffn_down, workspace, workspace_size,
logits_in->data(), gate_buf->data(),
rsrc.w_ffn_down[layer]->data(), 1.0, idev == 0 ? 1.0 : 0.0, stream)); // only rank 0 adds residual
// All_reduce if distributed
if (rsrc.comm != nullptr) {
RUN_INFINI(infinicclAllReduce(
logits_in->data(), logits_in->data(), ntok * d, dt_logits,
INFINICCL_SUM, rsrc.comm, stream));
}
}
// Sample and Output
if (idev == 0) {
size_t token_offset = 0;
for (uint32_t req = 0; req < nreq; req++) {
auto seq_len = req_lens[req];
token_offset += seq_len;
RUN_INFINI(infiniopRMSNorm(
desc_norm_out, workspace, workspace_size,
logits_out->data(req * d),
logits_in->data((token_offset - 1) * d),
rsrc.w_out_norm->data(), stream));
}
RUN_INFINI(infiniopGemm(
desc_out_embd, workspace, workspace_size,
prob_buf->data(), logits_out->data(),
rsrc.w_out_embd->data(), 1.0, 0.0, stream));
std::random_device _rd;
std::mt19937 gen(_rd());
token_offset = 0;
for (uint32_t req = 0; req < nreq; req++) {
auto seq_len = req_lens[req];
float random_val = std::uniform_real_distribution<float>(0, 1)(gen);
RUN_INFINI(infiniopRandomSample(
desc_sample, workspace, workspace_size,
result_buf->data(req),
prob_buf->data(req * dvoc), random_val, topp,
topk, temperature, stream));
token_offset += seq_len;
}
RUN_INFINI(infinirtStreamSynchronize(stream));
RUN_INFINI(infinirtMemcpy(result_cpu.data(), result_buf->data(),
sizeof(uint32_t) * nreq, INFINIRT_MEMCPY_D2H));
for (uint32_t req = 0; req < nreq; req++) {
ans[req] = (uint32_t)result_cpu[req];
}
}
// Clean up
infiniopDestroyRMSNormDescriptor(desc_norm);
infiniopDestroyGemmDescriptor(desc_attn_qkv);
infiniopDestroyGemmDescriptor(desc_attn_o);
infiniopDestroyRoPEDescriptor(desc_rope_q);
infiniopDestroyRoPEDescriptor(desc_rope_k);
for (uint32_t req = 0; req < nreq; req++) {
infiniopDestroyAttentionDescriptor(desc_attns[req]);
}
infiniopDestroyRMSNormDescriptor(desc_norm_out);
infiniopDestroyGemmDescriptor(desc_out_embd);
infiniopDestroyRandomSampleDescriptor(desc_sample);
infinirtFree(workspace);
}
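// C entry point for one batched inference step: record the request in the model,
// then wake every per-device worker thread (see launchDevice) to run inferDeviceBatch.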
__C void
inferBatch(struct JiugeModel *model,
const uint32_t *tokens, uint32_t ntok,
const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
struct KVCache **kv_caches,
uint32_t *ans,
float temperature, uint32_t topk, float topp) {
model->req.tokens = tokens;
model->req.ntok = ntok;
model->req.req_lens = req_lens;
model->req.nreq = nreq;
model->req.req_pos = req_pos;
model->req.kv_caches = kv_caches;
model->req.ans = ans;
model->req.temperature = temperature;
model->req.topk = topk;
model->req.topp = topp;
for (size_t idev = 0; idev < model->dev_ids.size(); idev++) {
std::unique_lock<std::mutex> lock(model->states[idev].mtx);
model->states[idev].proceed = true;
lock.unlock();
model->states[idev].cv.notify_one();
}
}
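// Per-device worker loop: build this device's resources once, then wait on the
// condition variable and run one batch each time inferBatch signals proceed,
// until destroyJiugeModel sets exit_flag.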
void launchDevice(const JiugeMeta &meta, const JiugeWeights *weights, DeviceResource *rsrc, InferState &state, InferRequest &req,
infiniDevice_t device, int idev, int ndev, int dev_id, infinicclComm_t comm) {
createDeviceResource(rsrc, &meta, weights, device, idev, ndev, dev_id, comm);
while (true) {
std::unique_lock<std::mutex> lock(state.mtx);
state.cv.wait(lock, [&] { return state.proceed || state.exit_flag; });
if (state.exit_flag) {
break;
}
inferDeviceBatch(meta, *rsrc, idev, ndev, req.tokens, req.ntok, req.req_lens, req.nreq, req.req_pos, req.kv_caches, req.ans, req.temperature, req.topk, req.topp);
state.proceed = false;
lock.unlock();
}
infiniopDestroyHandle(rsrc->handle);
infinirtStreamDestroy(rsrc->stream);
infinicclCommDestroy(rsrc->comm);
}
JiugeModel::JiugeModel(const JiugeMeta *_meta, const JiugeWeights *weights, infiniDevice_t device, std::vector<int> device_ids) : meta(*_meta), device(device) {
int ndev = int(device_ids.size());
dev_ids = device_ids;
dev_resources = std::vector<DeviceResource>(ndev);
states = std::vector<InferState>(ndev);
threads.resize(ndev);
RUN_INFINI(infinirtInit());
auto comms = std::vector<infinicclComm_t>(ndev, nullptr);
if (ndev > 1) {
RUN_INFINI(infinicclCommInitAll(device, comms.data(), ndev, dev_ids.data()));
}
for (int i = 0; i < ndev; i++) {
threads[i] = std::thread(launchDevice, std::cref(meta), weights, &dev_resources[i], std::ref(states[i]), std::ref(req), device, i, ndev, dev_ids[i], comms[i]);
}
}
__C struct JiugeModel *
createJiugeModel(const JiugeMeta *meta,
const JiugeWeights *weights,
infiniDevice_t device,
int ndev,
const int *dev_ids) {
std::vector<int> device_ids(ndev);
std::copy(dev_ids, dev_ids + ndev, device_ids.begin());
JiugeModel *model = new JiugeModel(meta, weights, device, device_ids);
return model;
}
__C void destroyJiugeModel(struct JiugeModel *model) {
auto ndev = model->dev_resources.size();
for (size_t idev = 0; idev < ndev; idev++) {
std::unique_lock<std::mutex> lock(model->states[idev].mtx);
model->states[idev].exit_flag = true;
lock.unlock();
model->states[idev].cv.notify_one();
}
for (size_t idev = 0; idev < ndev; idev++) {
model->threads[idev].join();
}
delete model;
}
#ifndef JIUGE_IMPL_H
#define JIUGE_IMPL_H
#include "infinicore_infer.h"
#include "../../tensor.hpp"
#include <condition_variable>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>
struct DeviceResource {
// Device
infiniDevice_t device;
int device_id;
infiniopHandle_t handle;
// Weights
std::shared_ptr<Tensor> w_in_embd, w_out_norm, w_out_embd, sin_table,
cos_table;
std::vector<std::shared_ptr<Tensor>> w_attn_norm, w_attn_qkv, b_attn_qkv, w_attn_out,
w_ffn_norm, w_ffn_gate_up, w_ffn_down;
// Streams
infinirtStream_t stream;
infinicclComm_t comm;
};
struct InferState {
std::mutex mtx;
std::condition_variable cv;
bool proceed = false;
bool exit_flag = false;
};
struct InferRequest {
const uint32_t *tokens;
uint32_t ntok;
const uint32_t *req_lens;
uint32_t nreq;
const uint32_t *req_pos;
struct KVCache **kv_caches;
uint32_t *ans;
float temperature;
uint32_t topk;
float topp;
};
struct JiugeModel {
JiugeMeta meta;
infiniDevice_t device;
std::vector<int> dev_ids;
std::vector<DeviceResource> dev_resources;
std::vector<InferState> states;
std::vector<std::thread> threads;
InferRequest req;
JiugeModel(const JiugeMeta *, const JiugeWeights *, infiniDevice_t device, std::vector<int> device_ids);
};
struct KVCache {
std::vector<std::vector<std::shared_ptr<Tensor>>> k, v;
};
#endif
#include "jiuge_impl.hpp"
__C struct KVCache *createKVCache(const JiugeModel *model) {
KVCache *cache = new KVCache();
auto ndev = model->dev_resources.size();
auto nkvh = model->meta.nkvh / ndev;
auto max_len = model->meta.dctx;
auto dh = model->meta.dh;
auto shape = std::vector<size_t>{nkvh, max_len, dh};
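// Each device holds nlayer K and nlayer V tensors of shape [nkvh / ndev, dctx, dh].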
for (unsigned int idev = 0; idev < ndev; idev++) {
RUN_INFINI(infinirtSetDevice(model->device, model->dev_ids[idev]));
auto kcache = std::vector<std::shared_ptr<Tensor>>();
auto vcache = std::vector<std::shared_ptr<Tensor>>();
for (unsigned int layer = 0; layer < model->meta.nlayer; layer++) {
kcache.push_back(std::move(Tensor::buffer(model->meta.dt_mat, shape)));
vcache.push_back(std::move(Tensor::buffer(model->meta.dt_mat, shape)));
}
cache->k.push_back(kcache);
cache->v.push_back(vcache);
}
return cache;
}
__C struct KVCache *duplicateKVCache(const JiugeModel *model,
const KVCache *kv_cache,
unsigned int seq_len) {
auto new_kv_cache = createKVCache(model);
auto ndev = model->dev_resources.size();
for (unsigned int idev = 0; idev < ndev; idev++) {
RUN_INFINI(infinirtSetDevice(model->device, model->dev_ids[idev]));
for (unsigned int layer = 0; layer < model->meta.nlayer; layer++) {
new_kv_cache->k[idev][layer]
->slice(1, 0, seq_len)
->copy_from(kv_cache->k[idev][layer]->slice(1, 0, seq_len),
model->dev_resources[idev].handle);
new_kv_cache->v[idev][layer]
->slice(1, 0, seq_len)
->copy_from(kv_cache->v[idev][layer]->slice(1, 0, seq_len),
model->dev_resources[idev].handle);
}
}
return new_kv_cache;
}
__C void dropKVCache(JiugeModel const *model, KVCache *kv_cache) {
auto ndev = model->dev_resources.size();
for (unsigned int idev = 0; idev < ndev; idev++) {
RUN_INFINI(infinirtSetDevice(model->device, model->dev_ids[idev]));
for (unsigned int layer = 0; layer < model->meta.nlayer; layer++) {
kv_cache->k[idev][layer].reset();
kv_cache->v[idev][layer].reset();
}
}
delete kv_cache;
}
#ifndef JIUGE_WEIGHT_HPP
#define JIUGE_WEIGHT_HPP
#include "jiuge_impl.hpp"
#include <cmath>
inline std::shared_ptr<Tensor> get_in_embd(
JiugeMeta const *meta,
JiugeWeights const *w) {
auto shape = std::vector<size_t>({meta->dvoc, meta->d});
return Tensor::weight((char *)w->input_embd, meta->dt_logits, shape);
}
inline std::shared_ptr<Tensor> get_out_norm(
JiugeMeta const *meta,
JiugeWeights const *w) {
auto shape = std::vector<size_t>({meta->d});
return Tensor::weight((char *)w->output_norm, meta->dt_norm, shape);
}
inline std::shared_ptr<Tensor> get_out_embd(
JiugeMeta const *meta,
JiugeWeights const *w) {
auto shape = std::vector<size_t>({meta->dvoc, meta->d});
return Tensor::weight((char *)w->output_embd, meta->dt_logits, shape)
->permute({1, 0});
}
inline std::shared_ptr<Tensor> get_attn_norm(
JiugeMeta const *meta,
JiugeWeights const *w,
size_t layer) {
auto shape = std::vector<size_t>({meta->d});
return Tensor::weight((char *)(w->attn_norm[layer]), meta->dt_norm, shape);
}
inline std::shared_ptr<Tensor> get_attn_qkv(
JiugeMeta const *meta,
JiugeWeights const *w,
size_t layer, size_t idev, size_t ndev) {
auto nkvh = meta->nkvh;
auto nh = meta->nh;
auto dh = meta->dh;
auto d = meta->d;
size_t offset = idev * ((nkvh * 2 + nh) / ndev * dh) * d * dsize(meta->dt_mat);
auto shape = std::vector<size_t>({(nh + 2 * nkvh) / ndev * dh, d});
return Tensor::weight((char *)(w->attn_qkv[layer]) + offset, meta->dt_mat, shape)
->permute({1, 0});
}
inline std::shared_ptr<Tensor> get_attn_qkv_bias(
JiugeMeta const *meta,
JiugeWeights const *w,
size_t layer, size_t idev, size_t ndev) {
auto nkvh = meta->nkvh;
auto nh = meta->nh;
auto dh = meta->dh;
size_t offset = idev * ((nkvh * 2 + nh) / ndev * dh) * dsize(meta->dt_mat);
auto shape = std::vector<size_t>({1, (nh + 2 * nkvh) / ndev * dh});
return Tensor::weight((char *)(w->attn_qkv_b[layer]) + offset, meta->dt_mat, shape);
}
inline std::shared_ptr<Tensor> get_attn_o(JiugeMeta const *meta,
JiugeWeights const *w, size_t layer,
size_t idev, size_t ndev) {
auto nh = meta->nh;
auto dh = meta->dh;
auto d = meta->d;
size_t offset = idev * d * (nh / ndev * dh) * dsize(meta->dt_mat);
auto shape = std::vector<size_t>({d, nh / ndev * dh});
return Tensor::weight((char *)(w->attn_o[layer]) + offset, meta->dt_mat, shape)
->permute({1, 0});
}
inline std::shared_ptr<Tensor> get_ffn_norm(
JiugeMeta const *meta,
JiugeWeights const *w,
size_t layer) {
auto shape = std::vector<size_t>({meta->d});
return Tensor::weight((char *)(w->ffn_norm[layer]), meta->dt_norm, shape);
}
inline std::shared_ptr<Tensor> get_ffn_gate_up(
JiugeMeta const *meta,
JiugeWeights const *w,
size_t layer, size_t idev, size_t ndev) {
auto di = meta->di;
auto d = meta->d;
size_t offset = idev * (2 * di / ndev) * d * dsize(meta->dt_mat);
auto shape = std::vector<size_t>({2 * di / ndev, d});
return Tensor::weight((char *)(w->ffn_gate_up[layer]) + offset,
meta->dt_mat, shape)
->permute({1, 0});
}
inline std::shared_ptr<Tensor> get_ffn_down(
JiugeMeta const *meta,
JiugeWeights const *w,
size_t layer, size_t idev, size_t ndev) {
auto di = meta->di;
auto d = meta->d;
size_t offset = idev * d * (di / ndev) * dsize(meta->dt_mat);
auto shape = std::vector<size_t>({d, di / ndev});
return Tensor::weight((char *)(w->ffn_down[layer]) + offset, meta->dt_mat, shape)
->permute({1, 0});
}
inline std::shared_ptr<Tensor> get_sin_table(JiugeMeta const *meta) {
float *table = (float *)std::malloc(meta->dctx * meta->dh * sizeof(float));
auto half_dh = meta->dh / 2;
for (size_t i = 0; i < meta->dctx; i++) {
for (size_t j = 0; j < half_dh; j++) {
float _sin = std::sin(
static_cast<float>(i) / std::pow(meta->theta, static_cast<float>(j) / half_dh));
table[i * meta->dh + 2 * j] = _sin;
table[i * meta->dh + 2 * j + 1] = _sin;
}
}
auto shape = std::vector<size_t>({meta->dctx, meta->dh});
auto tensor = Tensor::weight(table, meta->dt_logits, shape);
std::free(table);
return tensor;
}
inline std::shared_ptr<Tensor> get_cos_table(JiugeMeta const *meta) {
float *table = (float *)std::malloc(meta->dctx * meta->dh * sizeof(float));
auto half_dh = meta->dh / 2;
for (size_t i = 0; i < meta->dctx; i++) {
for (size_t j = 0; j < half_dh; j++) {
float _cos = std::cos(
static_cast<float>(i) / std::pow(meta->theta, static_cast<float>(j) / half_dh));
table[i * meta->dh + 2 * j] = _cos;
table[i * meta->dh + 2 * j + 1] = _cos;
}
}
auto shape = std::vector<size_t>({meta->dctx, meta->dh});
auto tensor = Tensor::weight(table, meta->dt_logits, shape);
std::free(table);
return tensor;
}
#endif
#ifndef INFER_TENSOR_H
#define INFER_TENSOR_H
#include "infinicore_infer.h"
#include "utils.hpp"
#include <memory>
#include <string>
#include <vector>
struct Storage {
void *memory;
size_t size;
infiniDevice_t device_type;
int device_id;
static std::shared_ptr<Storage> create(size_t size);
static std::shared_ptr<Storage> createAsync(size_t size, infinirtStream_t stream = nullptr);
static std::shared_ptr<Storage> createHost(size_t size);
~Storage();
};
struct SliceParams {
size_t dim;
size_t start;
size_t len;
};
class TensorDesc {
private:
infiniopTensorDescriptor_t _desc;
public:
static std::shared_ptr<TensorDesc>
create(infiniDtype_t dtype, const std::vector<size_t> &shape,
const std::vector<ptrdiff_t> &strides);
infiniopTensorDescriptor_t get() const { return _desc; };
~TensorDesc();
};
class Tensor : public std::enable_shared_from_this<Tensor> {
private:
infiniDtype_t _dtype;
std::vector<size_t> _shape;
std::vector<ptrdiff_t> _strides;
void *_data;
ptrdiff_t _offset;
size_t _size;
std::shared_ptr<Storage> storage;
infiniopTensorDescriptor_t _desc;
void *data_impl(ptrdiff_t offset) const;
std::shared_ptr<Tensor>
slice_impl(const std::vector<SliceParams> &slices) const;
public:
static std::shared_ptr<Tensor> buffer(infiniDtype_t dtype,
const std::vector<size_t> &shape,
infinirtStream_t stream = nullptr);
static std::shared_ptr<Tensor> weight(void *host_data,
infiniDtype_t dtype,
const std::vector<size_t> &shape);
std::shared_ptr<Tensor> slice(size_t dim, size_t start, size_t len);
std::shared_ptr<Tensor const> slice(size_t dim, size_t start,
size_t len) const;
std::shared_ptr<Tensor> slice(const std::vector<SliceParams> &slices);
std::shared_ptr<Tensor const>
slice(const std::vector<SliceParams> &slices) const;
std::shared_ptr<Tensor> dim_merge(size_t dim_start, size_t dim_end);
std::shared_ptr<Tensor> dim_split(size_t dim,
const std::vector<size_t> &dims);
std::shared_ptr<Tensor> permute(const std::vector<size_t> &order);
void *data(ptrdiff_t offset = 0);
void const *data(ptrdiff_t offset = 0) const;
void copy_from(std::shared_ptr<Tensor const> src, infiniopHandle_t handle,
infinirtStream_t stream = nullptr);
const std::vector<size_t> &shape() const;
const std::vector<ptrdiff_t> &strides() const;
size_t ndim() const;
infiniDtype_t dtype() const;
std::shared_ptr<TensorDesc> desc() const;
size_t byte_size() const;
ptrdiff_t data_offset() const;
infiniDevice_t device_type() const;
int device_id() const;
bool is_contigous() const;
void debug(const std::string &filename) const;
void debug() const;
~Tensor();
};
inline size_t dsize(infiniDtype_t dtype) {
switch (dtype) {
case INFINI_DTYPE_INVALID:
return 0;
case INFINI_DTYPE_BYTE:
return 1;
case INFINI_DTYPE_BOOL:
return 1;
case INFINI_DTYPE_I8:
return 1;
case INFINI_DTYPE_I16:
return 2;
case INFINI_DTYPE_I32:
return 4;
case INFINI_DTYPE_I64:
return 8;
case INFINI_DTYPE_U8:
return 1;
case INFINI_DTYPE_U16:
return 2;
case INFINI_DTYPE_U32:
return 4;
case INFINI_DTYPE_U64:
return 8;
case INFINI_DTYPE_F8:
return 1;
case INFINI_DTYPE_F16:
return 2;
case INFINI_DTYPE_F32:
return 4;
case INFINI_DTYPE_F64:
return 8;
case INFINI_DTYPE_C16:
return 2;
case INFINI_DTYPE_C32:
return 4;
case INFINI_DTYPE_C64:
return 8;
case INFINI_DTYPE_C128:
return 16;
case INFINI_DTYPE_BF16:
return 2;
default:
return 0;
}
}
#endif
#include "../tensor.hpp"
std::shared_ptr<Storage> Storage::create(size_t size) {
auto storage = std::make_shared<Storage>();
RUN_INFINI(infinirtMalloc(&storage->memory, size));
storage->size = size;
RUN_INFINI(infinirtGetDevice(&storage->device_type, &storage->device_id));
return storage;
}
std::shared_ptr<Storage> Storage::createAsync(size_t size, infinirtStream_t stream) {
auto storage = std::make_shared<Storage>();
RUN_INFINI(infinirtMallocAsync(&storage->memory, size, stream));
storage->size = size;
RUN_INFINI(infinirtGetDevice(&storage->device_type, &storage->device_id));
return storage;
}
std::shared_ptr<Storage> Storage::createHost(size_t size) {
auto storage = std::make_shared<Storage>();
RUN_INFINI(infinirtMallocHost(&storage->memory, size));
storage->size = size;
storage->device_type = INFINI_DEVICE_CPU;
storage->device_id = 0;
return storage;
}
Storage::~Storage() {
if (device_type == INFINI_DEVICE_CPU) {
RUN_INFINI(infinirtFreeHost(memory));
} else {
RUN_INFINI(infinirtFree(memory));
}
}
#include "../tensor.hpp"
#include "../utils.hpp"
#include <fstream>
#include <iostream>
#include <numeric>
std::shared_ptr<TensorDesc>
TensorDesc::create(infiniDtype_t dtype, const std::vector<size_t> &shape,
const std::vector<ptrdiff_t> &strides) {
auto desc = std::make_shared<TensorDesc>();
infiniopCreateTensorDescriptor(&desc->_desc, shape.size(), shape.data(),
strides.data(), dtype);
return desc;
}
TensorDesc::~TensorDesc() {
infiniopDestroyTensorDescriptor(this->_desc);
}
const std::vector<size_t> &Tensor::shape() const { return this->_shape; }
const std::vector<ptrdiff_t> &Tensor::strides() const { return this->_strides; }
size_t Tensor::ndim() const { return this->_shape.size(); }
infiniDtype_t Tensor::dtype() const { return this->_dtype; }
size_t Tensor::byte_size() const { return this->_size; }
infiniDevice_t Tensor::device_type() const { return this->storage->device_type; }
int Tensor::device_id() const { return this->storage->device_id; }
Tensor::~Tensor() {}
ptrdiff_t Tensor::data_offset() const {
return (char *)(this->_data) - (char *)(this->storage->memory);
}
std::shared_ptr<TensorDesc> Tensor::desc() const { return TensorDesc::create(this->_dtype, this->_shape, this->_strides); }
std::shared_ptr<Tensor> Tensor::buffer(infiniDtype_t dtype,
const std::vector<size_t> &shape,
infinirtStream_t stream) {
std::shared_ptr<Tensor> tensor = std::make_shared<Tensor>();
tensor->_dtype = dtype;
auto ndim = shape.size();
if (shape.empty()) {
tensor->_shape = std::vector<size_t>{1};
ndim = 1;
} else {
tensor->_shape = std::vector<size_t>(shape);
}
size_t size = std::accumulate(shape.begin(), shape.end(), dsize(dtype), std::multiplies<size_t>());
auto strides = std::vector<ptrdiff_t>(ndim);
strides[ndim - 1] = 1;
for (int i = ndim - 2; i >= 0; i--) {
strides[i] = strides[i + 1] * shape[i + 1];
}
tensor->_strides = strides;
tensor->storage = Storage::createAsync(size, stream);
tensor->_size = size;
tensor->_data = tensor->storage->memory;
infiniopCreateTensorDescriptor(&tensor->_desc, ndim, tensor->_shape.data(),
strides.data(), dtype);
tensor->_offset = 0;
return tensor;
}
std::shared_ptr<Tensor> Tensor::weight(void *data, infiniDtype_t dtype,
const std::vector<size_t> &shape) {
std::shared_ptr<Tensor> tensor = std::make_shared<Tensor>();
tensor->_dtype = dtype;
auto ndim = shape.size();
if (shape.empty()) {
tensor->_shape = std::vector<size_t>{1};
ndim = 1;
} else {
tensor->_shape = std::vector<size_t>(shape);
}
size_t size = std::accumulate(shape.begin(), shape.end(), dsize(dtype), std::multiplies<size_t>());
auto strides = std::vector<ptrdiff_t>(ndim);
strides[ndim - 1] = 1;
for (int i = ndim - 2; i >= 0; i--) {
strides[i] = strides[i + 1] * shape[i + 1];
}
tensor->_strides = strides;
tensor->storage = Storage::create(size);
RUN_INFINI(infinirtMemcpy(tensor->storage->memory,
data, size, INFINIRT_MEMCPY_H2D));
tensor->_data = tensor->storage->memory;
tensor->_size = size;
infiniopCreateTensorDescriptor(&tensor->_desc, ndim, tensor->_shape.data(),
strides.data(), dtype);
tensor->_offset = 0;
return tensor;
}
void *Tensor::data_impl(ptrdiff_t offset) const {
ASSERT(offset * dsize(this->dtype()) < this->_size);
return (char *)(this->_data) + offset * dsize(this->dtype());
}
void *Tensor::data(ptrdiff_t offset) {
return this->data_impl(offset);
}
const void *Tensor::data(ptrdiff_t offset) const {
return this->data_impl(offset);
}
void Tensor::copy_from(std::shared_ptr<Tensor const> src,
infiniopHandle_t handle, infinirtStream_t stream) {
ASSERT_EQ(this->shape(), src->shape());
ASSERT_EQ(this->dtype(), src->dtype());
infiniopRearrangeDescriptor_t desc;
RUN_INFINI(infiniopCreateRearrangeDescriptor(
handle, &desc, this->desc()->get(), src->desc()->get()));
RUN_INFINI(infiniopRearrange(desc, this->data(), src->data(),
stream));
RUN_INFINI(infiniopDestroyRearrangeDescriptor(desc));
}
bool Tensor::is_contigous() const {
auto ndim = this->ndim();
auto shape = this->shape();
auto strides = std::vector<ptrdiff_t>(ndim);
strides[ndim - 1] = 1;
for (int i = ndim - 2; i >= 0; i--) {
strides[i] = strides[i + 1] * shape[i + 1];
}
ASSERT_EQ(strides.size(), this->_strides.size());
return std::equal(strides.begin(), strides.end(), this->_strides.begin());
}
template <typename T>
void print_data(T *data, const std::vector<size_t> &shape,
const std::vector<ptrdiff_t> &strides, size_t dim) {
if (dim == shape.size() - 1) {
for (size_t i = 0; i < shape[dim]; i++) {
std::cout << data[i * strides[dim]] << " ";
}
std::cout << std::endl;
} else if (dim < shape.size() - 1) {
for (size_t i = 0; i < shape[dim]; i++) {
print_data(data + i * strides[dim], shape, strides, dim + 1);
std::cout << std::endl;
}
}
}
template <>
void print_data(uint16_t const *data, const std::vector<size_t> &shape,
const std::vector<ptrdiff_t> &strides, size_t dim) {
if (dim == shape.size() - 1) {
for (size_t i = 0; i < shape[dim]; i++) {
std::cout << f16_to_f32(data[i * strides[dim]]) << " ";
}
} else if (dim < shape.size() - 1) {
for (size_t i = 0; i < shape[dim]; i++) {
print_data(data + i * strides[dim], shape, strides, dim + 1);
std::cout << std::endl;
}
}
}
void Tensor::debug(const std::string &filename) const {
RUN_INFINI(
infinirtDeviceSynchronize());
std::cout << "Tensor: "
<< "shape[ ";
for (auto s : this->shape()) {
std::cout << s << " ";
}
std::cout << "] strides[ ";
for (auto s : this->strides()) {
std::cout << s << " ";
}
std::cout << "] dtype=" << this->dtype()
<< " device=" << this->device_type()
<< " device_id=" << this->device_id() << std::endl;
auto dtype = this->dtype();
void const *cpu_data;
if (this->device_type() != INFINI_DEVICE_CPU) {
void *cpu_memory = std::malloc(this->storage->size);
RUN_INFINI(infinirtMemcpy(cpu_memory, this->storage->memory,
this->storage->size, INFINIRT_MEMCPY_D2H));
cpu_data = cpu_memory;
} else {
cpu_data = this->data();
}
if (!filename.empty()) {
std::ofstream outFile(filename, std::ios::binary);
if (!outFile) {
std::cerr << "Error opening file for writing: " << filename << "\n";
return;
}
outFile.write(reinterpret_cast<const char *>(cpu_data), this->storage->size);
outFile.close();
std::cout << "Data written to file: " << filename << "\n";
return;
}
switch (dtype) {
case INFINI_DTYPE_F16:
print_data((uint16_t const *)((char const *)cpu_data + data_offset()),
this->shape(), this->strides(), 0);
break;
case INFINI_DTYPE_F32:
print_data((float const *)((char const *)cpu_data + data_offset()),
this->shape(), this->strides(), 0);
break;
case INFINI_DTYPE_U64:
print_data((uint64_t const *)((char const *)cpu_data + data_offset()),
this->shape(), this->strides(), 0);
break;
case INFINI_DTYPE_I64:
print_data((int64_t const *)((char const *)cpu_data + data_offset()),
this->shape(), this->strides(), 0);
break;
case INFINI_DTYPE_U32:
print_data((uint32_t const *)((char const *)cpu_data + data_offset()),
this->shape(), this->strides(), 0);
break;
case INFINI_DTYPE_I32:
print_data((int32_t const *)((char const *)cpu_data + data_offset()),
this->shape(), this->strides(), 0);
break;
default:
PANIC("Unsupported data type");
}
}
void Tensor::debug() const { this->debug(""); }
#include "../tensor.hpp"
#include "../utils.hpp"
#include <algorithm>
#include <numeric>
#include <vector>
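// Slicing returns a view: the result shares this tensor's storage and strides;
// only the shape and the byte offset into the storage change.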
std::shared_ptr<Tensor> Tensor::slice_impl(const std::vector<SliceParams> &slices) const {
std::shared_ptr<Tensor> tensor = std::make_shared<Tensor>();
auto new_shape = std::vector<size_t>(this->_shape);
ptrdiff_t offset = 0;
for (const auto &slice : slices) {
ASSERT(slice.len > 0);
ASSERT(this->_shape[slice.dim] >= slice.start + slice.len);
new_shape[slice.dim] = slice.len;
offset += slice.start * this->_strides[slice.dim];
}
tensor->_dtype = this->_dtype;
tensor->_shape = new_shape;
tensor->_strides = std::vector<ptrdiff_t>(this->_strides);
tensor->_offset = offset * dsize(this->_dtype);
tensor->_data = static_cast<char *>(this->_data) + tensor->_offset;
tensor->_size = std::accumulate(new_shape.begin(), new_shape.end(),
dsize(this->_dtype), std::multiplies<size_t>());
tensor->storage = this->storage;
infiniopCreateTensorDescriptor(&tensor->_desc, tensor->_shape.size(), tensor->_shape.data(),
tensor->_strides.data(), tensor->_dtype);
return tensor;
}
std::shared_ptr<Tensor> Tensor::slice(size_t dim, size_t start, size_t len) {
return this->slice_impl({{dim, start, len}});
}
std::shared_ptr<Tensor const> Tensor::slice(size_t dim, size_t start, size_t len) const {
return this->slice_impl({{dim, start, len}});
}
std::shared_ptr<Tensor> Tensor::slice(const std::vector<SliceParams> &slices) {
return this->slice_impl(slices);
}
std::shared_ptr<Tensor const> Tensor::slice(const std::vector<SliceParams> &slices) const {
return this->slice_impl(slices);
}
std::shared_ptr<Tensor> Tensor::dim_merge(size_t dim_start, size_t dim_end) {
ASSERT(dim_start <= dim_end && dim_end < this->_shape.size());
if (dim_start == dim_end) {
return shared_from_this();
}
auto new_shape = std::vector<size_t>();
auto new_strides = std::vector<ptrdiff_t>();
for (size_t i = 0; i < dim_start; i++) {
new_shape.push_back(this->_shape[i]);
new_strides.push_back(this->_strides[i]);
}
for (size_t i = dim_start + 1; i <= dim_end; i++) {
ASSERT_EQ(this->_strides[i - 1], ptrdiff_t(this->_shape[i]) * this->_strides[i]);
}
new_shape.push_back(std::accumulate(this->_shape.begin() + dim_start, this->_shape.begin() + dim_end + 1, 1, std::multiplies<size_t>()));
new_strides.push_back(this->_strides[dim_end]);
for (size_t i = dim_end + 1; i < this->_shape.size(); i++) {
new_shape.push_back(this->_shape[i]);
new_strides.push_back(this->_strides[i]);
}
this->_shape = new_shape;
this->_strides = new_strides;
infiniopDestroyTensorDescriptor(this->_desc);
infiniopCreateTensorDescriptor(&this->_desc, this->_shape.size(), this->_shape.data(),
this->_strides.data(), this->_dtype);
return shared_from_this();
}
std::shared_ptr<Tensor> Tensor::dim_split(size_t dim, const std::vector<size_t> &dims) {
ASSERT_EQ(this->_shape[dim], std::accumulate(dims.begin(), dims.end(), size_t(1), std::multiplies<size_t>()));
auto new_shape = std::vector<size_t>();
auto new_strides = std::vector<ptrdiff_t>();
for (size_t i = 0; i < dim; i++) {
new_shape.push_back(this->_shape[i]);
new_strides.push_back(this->_strides[i]);
}
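// Row-major split of the chosen dimension: each new dimension's stride is the
// original stride times the product of the sub-dimensions to its right.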
for (size_t i = 0; i < dims.size(); i++) {
new_shape.push_back(dims[i]);
new_strides.push_back(this->_strides[dim] * this->_shape[dim] / std::accumulate(dims.begin(), dims.begin() + i + 1, 1, std::multiplies<size_t>()));
}
for (size_t i = dim + 1; i < this->_shape.size(); i++) {
new_shape.push_back(this->_shape[i]);
new_strides.push_back(this->_strides[i]);
}
this->_shape = new_shape;
this->_strides = new_strides;
infiniopDestroyTensorDescriptor(this->_desc);
infiniopCreateTensorDescriptor(&this->_desc, this->_shape.size(), this->_shape.data(),
this->_strides.data(), this->_dtype);
return shared_from_this();
}
std::shared_ptr<Tensor> Tensor::permute(const std::vector<size_t> &order) {
ASSERT_EQ(this->_shape.size(), order.size());
auto new_shape = std::vector<size_t>(order.size());
auto new_strides = std::vector<ptrdiff_t>(order.size());
for (size_t i = 0; i < order.size(); i++) {
ASSERT(std::find(order.begin(), order.end(), i) != order.end());
new_shape[i] = this->_shape[order[i]];
new_strides[i] = this->_strides[order[i]];
}
this->_shape = new_shape;
this->_strides = new_strides;
infiniopDestroyTensorDescriptor(this->_desc);
infiniopCreateTensorDescriptor(&this->_desc, this->_shape.size(), this->_shape.data(),
this->_strides.data(), this->_dtype);
return shared_from_this();
}
#ifndef INFINICORE_INFER_UTILS_H
#define INFINICORE_INFER_UTILS_H
#include <infinicore.h>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
inline void assert_true(int expr, const char *msg, const char *file, int line) {
if (!expr) {
fprintf(stderr, "\033[31mAssertion failed:\033[0m %s at file %s, line %d\n", msg, file, line);
exit(EXIT_FAILURE);
}
}
#define ASSERT(expr) assert_true((expr), #expr " is false", __FILE__, __LINE__)
#define ASSERT_EQ(a, b) assert_true((a) == (b), #a " != " #b, __FILE__, __LINE__)
#define ASSERT_VALID_PTR(a) assert_true((a) != nullptr, #a " is nullptr", __FILE__, __LINE__)
#define PANIC(EXPR) \
printf("Error at %s:%d - %s\n", __FILE__, __LINE__, #EXPR); \
exit(EXIT_FAILURE)
#define RUN_INFINI(API) \
do { \
auto api_result_ = (API); \
if (api_result_ != INFINI_STATUS_SUCCESS) { \
std::cerr << "Error Code " << api_result_ << " in `" << #API << "`" \
<< " from " << __func__ \
<< " at " << __FILE__ << ":" << __LINE__ << std::endl; \
exit(EXIT_FAILURE); \
} \
} while (0)
inline float f16_to_f32(uint16_t h) {
uint32_t sign = (h & 0x8000) << 16; // Extract the sign bit
int32_t exponent = (h >> 10) & 0x1F; // Extract the exponent
uint32_t mantissa = h & 0x3FF; // Extract the mantissa (fraction part)
if (exponent == 31) { // Special case for Inf and NaN
if (mantissa != 0) {
// NaN: Set float32 NaN
uint32_t f32 = sign | 0x7F800000 | (mantissa << 13);
return *(float *)&f32;
} else {
// Infinity
uint32_t f32 = sign | 0x7F800000;
return *(float *)&f32;
}
} else if (exponent == 0) { // Subnormal float16 or zero
if (mantissa == 0) {
// Zero (positive or negative)
uint32_t f32 = sign; // Just return signed zero
return *(float *)&f32;
} else {
// Subnormal: Convert to normalized float32
exponent = -14; // Set exponent for subnormal numbers
while ((mantissa & 0x400) == 0) { // Normalize mantissa
mantissa <<= 1;
exponent--;
}
mantissa &= 0x3FF; // Clear the leading 1 bit
uint32_t f32 = sign | ((exponent + 127) << 23) | (mantissa << 13);
return *(float *)&f32;
}
} else {
// Normalized float16
uint32_t f32 = sign | ((exponent + 127 - 15) << 23) | (mantissa << 13);
return *(float *)&f32;
}
}
#endif
local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")
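-- Build the shared library with `xmake`; `xmake install` copies it and the public
-- headers into INFINI_ROOT (defaulting to ~/.infini when the environment variable is unset).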
target("infinicore_infer")
set_kind("shared")
add_includedirs(INFINI_ROOT.."/include")
add_linkdirs(INFINI_ROOT.."/lib")
add_links("infiniop", "infinirt", "infiniccl")
set_languages("cxx17")
set_warnings("all", "error")
add_files("src/models/*/*.cpp")
add_files("src/tensor/*.cpp")
add_includedirs("include")
set_installdir(INFINI_ROOT)
add_installfiles("include/infinicore_infer.h", {prefixdir = "include"})
add_installfiles("include/infinicore_infer/*.h", {prefixdir = "include/infinicore_infer"})
target_end()