Commit 81fe2ba3 authored by PanZezhong

init

---
BasedOnStyle: LLVM
IndentWidth: 4 # Indent width; the LLVM default is 2, changed to 4
AccessModifierOffset: -4 # Offset of public/protected/private relative to members, paired with IndentWidth; the LLVM default is -2
AlignOperands: AlignAfterOperator # Alignment of binary-operator operands across lines; the LLVM default is Align, changed so the operator wraps together with its operand
BreakBeforeBinaryOperators: All # Break before binary operators; the LLVM default is None, changed so a wrapped binary operator, including assignment (=), always starts the new line
ColumnLimit: 0 # Column limit; the LLVM default is 80, changed to unlimited
AllowShortBlocksOnASingleLine: Always # Whether short blocks (single-statement blocks) may stay on one line; the LLVM default is Never, changed to allow
AllowShortLoopsOnASingleLine: true # Whether short loops may stay on one line; the LLVM default is false, changed to allow
InsertBraces: true # Whether to insert braces around if/for/while/switch bodies; the LLVM default is false, changed to enable
BreakBeforeBraces: Custom # Brace wrapping style; the LLVM default is LLVM, changed to Custom so that BraceWrapping takes effect
BraceWrapping:
AfterCaseLabel: false
AfterClass: false
AfterControlStatement: Never
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
BeforeLambdaBody: false
BeforeWhile: false
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
# Xmake cache
.xmake/
build/
# MacOS Cache
.DS_Store
# Vscode
.vscode/
# Python
__pycache__/
# Log
*.log
# Cache
cache/
# JSON
*.json
# GGUF
*.gguf
#ifndef INFINICORE_INFER_H
#define INFINICORE_INFER_H
#include "infinicore_infer/models/jiuge.h"
#endif /* INFINICORE_INFER_H */
#ifndef MODEL_JIUGE_H
#define MODEL_JIUGE_H
#include <infiniccl.h>
#include <infiniop.h>
#include <infinirt.h>
#include <stdint.h>
struct JiugeModel;
typedef struct
{
infiniDtype_t dt_logits, dt_norm, dt_mat;
size_t nlayer, d, nh, nkvh, dh, di, dctx, dvoc;
float epsilon, theta;
uint32_t end_token;
} JiugeMeta;
typedef struct
{
size_t nlayer;
// [dvoc, d]
const void *input_embd;
// [d]
const void *output_norm;
// [dvoc, d]
const void *output_embd;
// nlayer * [d]
const void *const *attn_norm;
// nlayer * [ndev, (nh + 2 * nkvh) / ndev * dh, d]
const void *const *attn_qkv;
// nlayer * [ndev, (nh + 2 * nkvh) / ndev * dh]
const void *const *attn_qkv_b;
// nlayer * [ndev, d, nh / ndev * dh]
const void *const *attn_o;
// nlayer * [d]
const void *const *ffn_norm;
// nlayer * [ndev, 2 * di / ndev, d]
const void *const *ffn_gate_up;
// nlayer * [ndev, d, di / ndev]
const void *const *ffn_down;
} JiugeWeights;
//////////////////// APIs ///////////////////////
/// @brief Create a model instance
/// @param device Type of accelerator (co-processor)
/// @param ndev Number of accelerators
/// @param dev_ids Accelerator IDs, an array of length ndev
__C __export struct JiugeModel *
createJiugeModel(const JiugeMeta *,
const JiugeWeights *,
infiniDevice_t device,
int ndev,
const int *dev_ids);
/// @brief Destroy a model instance
__C __export void
destroyJiugeModel(struct JiugeModel *);
/// @brief Create a KV cache
__C __export struct KVCache *
createKVCache(const struct JiugeModel *);
/// @brief Duplicate the first seq_len positions of a KV cache
__C __export struct KVCache *
duplicateKVCache(const struct JiugeModel *,
const struct KVCache *, uint32_t seq_len);
/// @brief Destroy a KV cache
__C __export void
dropKVCache(const struct JiugeModel *,
struct KVCache *);
/// @brief Text generation
/// @param tokens Input tokens
/// @param ntok Number of input tokens
/// @param req_pos Starting position of the request
/// @param output Output token buffer
/// @param max_step Maximum number of output tokens
/// @param temperature Sampling temperature (0. means greedy sampling)
/// @param topk Sampling top-k (1 means greedy sampling)
/// @param topp Sampling top-p
__C __export void
generate(struct JiugeModel *,
struct KVCache *,
const uint32_t *tokens, uint32_t ntok, uint32_t req_pos,
uint32_t *output, uint32_t max_step,
float temperature, uint32_t topk, float topp);
/// @brief Run one round of batched inference
/// @param tokens Input token buffer
/// @param ntok Number of input tokens
/// @param nreq Number of requests
/// @param req_lens Number of tokens in each request
/// @param req_pos Starting position of each request
/// @param kv_caches KV cache of each request
/// @param output Output token array, one token per request, length at least nreq
/// @param temperature Sampling temperature (0. means greedy sampling)
/// @param topk Sampling top-k (1 means greedy sampling)
/// @param topp Sampling top-p
__C __export void
inferBatch(struct JiugeModel *,
const uint32_t *tokens, uint32_t ntok,
const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
struct KVCache **kv_caches,
uint32_t *output,
float temperature, uint32_t topk, float topp);
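// Usage sketch (illustrative only; meta and weights must be populated from a real
// checkpoint first, and the single CPU device below is just an example):
//   int dev_ids[] = {0};
//   struct JiugeModel *model = createJiugeModel(&meta, &weights, INFINI_DEVICE_CPU, 1, dev_ids);
//   struct KVCache *cache = createKVCache(model);
//   uint32_t tokens[] = {1, 2, 3}, req_lens[] = {3}, req_pos[] = {0}, out[1];
//   inferBatch(model, tokens, 3, req_lens, 1, req_pos, &cache, out, 1.0f, 1, 1.0f);
//   dropKVCache(model, cache);
//   destroyJiugeModel(model);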
#endif
import ctypes
from ctypes import c_size_t, c_uint, c_int, c_float, c_void_p, POINTER
import os
class DataType(ctypes.c_int):
INFINI_DTYPE_INVALID = 0
INFINI_DTYPE_BYTE = 1
INFINI_DTYPE_BOOL = 2
INFINI_DTYPE_I8 = 3
INFINI_DTYPE_I16 = 4
INFINI_DTYPE_I32 = 5
INFINI_DTYPE_I64 = 6
INFINI_DTYPE_U8 = 7
INFINI_DTYPE_U16 = 8
INFINI_DTYPE_U32 = 9
INFINI_DTYPE_U64 = 10
INFINI_DTYPE_F8 = 11
INFINI_DTYPE_F16 = 12
INFINI_DTYPE_F32 = 13
INFINI_DTYPE_F64 = 14
INFINI_DTYPE_C16 = 15
INFINI_DTYPE_C32 = 16
INFINI_DTYPE_C64 = 17
INFINI_DTYPE_C128 = 18
INFINI_DTYPE_BF16 = 19
class DeviceType(ctypes.c_int):
DEVICE_TYPE_CPU = 0
DEVICE_TYPE_CUDA = 1
DEVICE_TYPE_CAMBRICON = 2
DEVICE_TYPE_ASCEND = 3
DEVICE_TYPE_METAX = 4
DEVICE_TYPE_MOORE = 5
class JiugeMeta(ctypes.Structure):
_fields_ = [
("dt_logits", DataType),
("dt_norm", DataType),
("dt_mat", DataType),
("nlayer", c_uint),
("d", c_uint),
("nh", c_uint),
("nkvh", c_uint),
("dh", c_uint),
("di", c_uint),
("dctx", c_uint),
("dvoc", c_uint),
("epsilon", c_float),
("theta", c_float),
("end_token", c_uint),
]
# Define the JiugeWeights struct
class JiugeWeights(ctypes.Structure):
_fields_ = [
("nlayer", c_uint),
("input_embd", c_void_p),
("output_norm", c_void_p),
("output_embd", c_void_p),
("attn_norm", POINTER(c_void_p)),
("attn_qkv", POINTER(c_void_p)),
("attn_qkv_b", POINTER(c_void_p)),
("attn_o", POINTER(c_void_p)),
("ffn_norm", POINTER(c_void_p)),
("ffn_gate_up", POINTER(c_void_p)),
("ffn_down", POINTER(c_void_p)),
]
class JiugeModel(ctypes.Structure):
pass
class KVCache(ctypes.Structure):
pass
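# Load libinfinicore_infer.so from $INFINI_ROOT/lib and declare ctypes signatures
# for the exported C API so the structs above can be passed by pointer.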
def open_library():
lib_path = os.path.join(
os.environ.get("INFINI_ROOT"), "lib", "libinfinicore_infer.so"
)
lib = ctypes.CDLL(lib_path)
lib.createJiugeModel.restype = POINTER(JiugeModel)
lib.createJiugeModel.argtypes = [
POINTER(JiugeMeta), # JiugeMeta const *
POINTER(JiugeWeights), # JiugeWeights const *
DeviceType, # DeviceType
c_int, # int ndev
POINTER(c_int), # int const *dev_ids
]
lib.createKVCache.restype = POINTER(KVCache)
lib.createKVCache.argtypes = [ctypes.POINTER(JiugeModel)]
lib.dropKVCache.argtypes = [ctypes.POINTER(JiugeModel), POINTER(KVCache)]
lib.inferBatch.restype = None
lib.inferBatch.argtypes = [
ctypes.POINTER(JiugeModel), # struct JiugeModel const *
POINTER(c_uint), # unsigned int const *tokens
c_uint, # unsigned int ntok
POINTER(c_uint), # unsigned int const *req_lens
c_uint, # unsigned int nreq
POINTER(c_uint), # unsigned int const *req_pos
POINTER(POINTER(KVCache)), # struct KVCache **kv_caches
POINTER(c_uint), # unsigned int *output
c_float, # float temperature
c_uint, # unsigned int topk
c_float, # float topp
]
return lib
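# Illustrative call sequence (hypothetical values; a real caller must first fill
# JiugeMeta and JiugeWeights from an actual checkpoint):
#   lib = open_library()
#   model = lib.createJiugeModel(ctypes.byref(meta), ctypes.byref(weights),
#                                DeviceType.DEVICE_TYPE_CPU, 1, (c_int * 1)(0))
#   kv_cache = lib.createKVCache(model)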
#include "jiuge_impl.hpp"
#include "jiuge_weight.hpp"
#include "../../tensor.hpp"
#include "../../utils.hpp"
#include "infinicore_infer.h"
#include <random>
#include <thread>
#include <vector>
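// Create per-device resources: an infiniop handle, a compute stream, and this
// device's shard of every layer's weights (loaded via the getters in jiuge_weight.hpp).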
void createDeviceResource(DeviceResource *rsrc, const JiugeMeta *meta,
const JiugeWeights *weights,
infiniDevice_t device, int idev,
int ndev, int dev_id,
infinicclComm_t comm) {
RUN_INFINI(infinirtSetDevice(device, dev_id));
infiniopHandle_t handle;
infiniopCreateHandle(&handle);
infinirtStream_t stream;
infinirtStreamCreate(&stream);
std::vector<std::shared_ptr<Tensor>> w_attn_norm, w_attn_qkv, b_attn_qkv, w_attn_out,
w_ffn_norm, w_ffn_gate_up, w_ffn_down;
for (size_t layer = 0; layer < meta->nlayer; layer++) {
w_attn_norm.push_back(
get_attn_norm(meta, weights, layer));
w_attn_qkv.push_back(
get_attn_qkv(meta, weights, layer, idev, ndev));
if (weights->attn_qkv_b != nullptr) {
b_attn_qkv.push_back(
get_attn_qkv_bias(meta, weights, layer, idev, ndev));
}
w_attn_out.push_back(
get_attn_o(meta, weights, layer, idev, ndev));
w_ffn_norm.push_back(
get_ffn_norm(meta, weights, layer));
w_ffn_gate_up.push_back(
get_ffn_gate_up(meta, weights, layer, idev, ndev));
w_ffn_down.push_back(
get_ffn_down(meta, weights, layer, idev, ndev));
}
*rsrc = DeviceResource{device,
dev_id,
handle,
get_in_embd(meta, weights),
get_out_norm(meta, weights),
get_out_embd(meta, weights),
get_sin_table(meta),
get_cos_table(meta),
w_attn_norm,
w_attn_qkv,
b_attn_qkv,
w_attn_out,
w_ffn_norm,
w_ffn_gate_up,
w_ffn_down,
stream,
comm};
}
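// Run one batched forward pass on this device's shard: gather prompt embeddings,
// apply nlayer attention + FFN blocks (all-reducing partial results across devices
// when distributed), then on device 0 sample one output token per request into ans.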
void inferDeviceBatch(const JiugeMeta &meta, const DeviceResource &rsrc,
uint32_t idev, uint32_t ndev,
const uint32_t *tokens, uint32_t ntok,
const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
struct KVCache **kv_caches,
uint32_t *ans,
float temperature, uint32_t topk, float topp) {
auto nlayer = meta.nlayer;
auto nkvh = meta.nkvh / ndev;
auto nh = meta.nh / ndev;
// auto dctx = meta.dctx;
auto dh = meta.dh;
auto d = meta.d;
auto dt_logits = meta.dt_logits;
auto di = meta.di / ndev;
auto dvoc = meta.dvoc;
auto stream = rsrc.stream;
// Allocate buffers
auto logits_in = Tensor::buffer(dt_logits, {ntok, d}, stream);
auto logits_out = Tensor::buffer(dt_logits, {ntok, d}, stream);
auto qkv_buf = Tensor::buffer(dt_logits, {ntok, (nh + nkvh * 2) * dh}, stream);
auto gate_up_buf = Tensor::buffer(dt_logits, {ntok, 2 * di}, stream);
auto o_buf = Tensor::buffer(dt_logits, {ntok, nh * dh}, stream);
auto prob_buf = Tensor::buffer(dt_logits, {nreq, dvoc}, stream);
auto result_buf = Tensor::buffer(INFINI_DTYPE_U32, {nreq}, stream);
auto result_cpu = std::vector<uint32_t>(nreq);
// Prepare inputs
auto batch_pos_ids = std::vector<uint32_t>(ntok);
size_t req_start = 0;
for (uint32_t req = 0; req < nreq; req++) {
for (uint32_t i = 0; i < req_lens[req]; i++) {
batch_pos_ids[req_start + i] = req_pos[req] + i;
}
req_start += req_lens[req];
}
std::shared_ptr<Tensor> pos_ids_buf;
if (rsrc.device == INFINI_DEVICE_CPU) {
pos_ids_buf = Tensor::weight(batch_pos_ids.data(), INFINI_DTYPE_U32, {ntok});
} else {
pos_ids_buf = Tensor::buffer(INFINI_DTYPE_U32, {ntok}, stream);
RUN_INFINI(infinirtMemcpyAsync(pos_ids_buf->data(), batch_pos_ids.data(), sizeof(uint32_t) * ntok,
INFINIRT_MEMCPY_H2D, stream));
}
for (uint32_t i = 0; i < ntok; i++) {
RUN_INFINI(infinirtMemcpyAsync(logits_in->data(i * d),
rsrc.w_in_embd->data(tokens[i] * d),
dsize(dt_logits) * d, INFINIRT_MEMCPY_D2D, stream));
}
// Prepare operators and workspace
void *workspace;
size_t workspace_size = 0, temp_size = 0;
// attn & mlp rmsnorm
infiniopRMSNormDescriptor_t desc_norm;
RUN_INFINI(infiniopCreateRMSNormDescriptor(
rsrc.handle, &desc_norm, logits_in->desc()->get(),
logits_out->desc()->get(), rsrc.w_attn_norm[0]->desc()->get(),
meta.epsilon));
RUN_INFINI(infiniopGetRMSNormWorkspaceSize(desc_norm, &workspace_size));
workspace_size = std::max(workspace_size, temp_size);
// Attention
infiniopGemmDescriptor_t desc_attn_qkv, desc_attn_o;
RUN_INFINI(infiniopCreateGemmDescriptor(
rsrc.handle, &desc_attn_qkv, qkv_buf->desc()->get(),
logits_in->desc()->get(), rsrc.w_attn_qkv[0]->desc()->get()));
RUN_INFINI(infiniopCreateGemmDescriptor(
rsrc.handle, &desc_attn_o, logits_in->desc()->get(),
o_buf->desc()->get(), rsrc.w_attn_out[0]->desc()->get()));
RUN_INFINI(infiniopGetGemmWorkspaceSize(desc_attn_qkv, &temp_size));
workspace_size = std::max(workspace_size, temp_size);
RUN_INFINI(infiniopGetGemmWorkspaceSize(desc_attn_o, &temp_size));
workspace_size = std::max(workspace_size, temp_size);
infiniopRoPEDescriptor_t desc_rope_q, desc_rope_k;
qkv_buf->dim_split(1, {nh + nkvh * 2, dh}); // (ntok, nh + 2 * nkvh, dh)
auto qkv_buf_q = qkv_buf->slice(1, 0, nh);
auto qkv_buf_k = qkv_buf->slice(1, nh, nkvh);
RUN_INFINI(infiniopCreateRoPEDescriptor(
rsrc.handle, &desc_rope_q, qkv_buf_q->desc()->get(), qkv_buf_q->desc()->get(),
pos_ids_buf->desc()->get(), rsrc.sin_table->desc()->get(),
rsrc.cos_table->desc()->get()));
RUN_INFINI(infiniopGetRoPEWorkspaceSize(desc_rope_q, &temp_size));
workspace_size = std::max(workspace_size, temp_size);
RUN_INFINI(infiniopCreateRoPEDescriptor(
rsrc.handle, &desc_rope_k, qkv_buf_k->desc()->get(), qkv_buf_k->desc()->get(),
pos_ids_buf->desc()->get(), rsrc.sin_table->desc()->get(),
rsrc.cos_table->desc()->get()));
RUN_INFINI(infiniopGetRoPEWorkspaceSize(desc_rope_k, &temp_size));
workspace_size = std::max(workspace_size, temp_size);
// attention inner
auto desc_attns = std::vector<infiniopAttentionDescriptor_t>(nreq);
size_t token_offset = 0;
o_buf->dim_split(1, {nh, dh});
for (uint32_t req = 0; req < nreq; req++) {
auto past_len = req_pos[req];
auto seq_len = req_lens[req];
auto o = o_buf->slice({{0, token_offset, seq_len}});
auto q = qkv_buf->slice({{0, token_offset, seq_len}, {1, 0, nh}})
->permute({1, 0, 2});
auto k = qkv_buf->slice({{0, token_offset, seq_len}, {1, nh, nkvh}})
->permute({1, 0, 2});
auto v = qkv_buf->slice({{0, token_offset, seq_len}, {1, nh + nkvh, nkvh}})
->permute({1, 0, 2});
auto k_cache = kv_caches[req]->k[idev][0];
auto v_cache = kv_caches[req]->v[idev][0];
RUN_INFINI(infiniopCreateAttentionDescriptor(
rsrc.handle, &desc_attns[req], o->desc()->get(), q->desc()->get(),
k->desc()->get(), v->desc()->get(), k_cache->desc()->get(),
v_cache->desc()->get(), past_len));
RUN_INFINI(
infiniopGetAttentionWorkspaceSize(desc_attns[req], &temp_size));
workspace_size = std::max(workspace_size, temp_size);
token_offset += seq_len;
}
// MLP descriptors
infiniopGemmDescriptor_t desc_ffn_gate_up, desc_ffn_down;
infiniopSwiGLUDescriptor_t desc_swiglu;
RUN_INFINI(infiniopCreateGemmDescriptor(
rsrc.handle, &desc_ffn_gate_up, gate_up_buf->desc()->get(),
logits_out->desc()->get(), rsrc.w_ffn_gate_up[0]->desc()->get()));
RUN_INFINI(infiniopGetGemmWorkspaceSize(desc_ffn_gate_up, &temp_size));
workspace_size = std::max(workspace_size, temp_size);
auto gate_buf = gate_up_buf->slice(1, 0, di);
auto up_buf = gate_up_buf->slice(1, di, di);
// SwiGLU writes its [ntok, di] result into gate_buf in place so it can feed ffn_down
RUN_INFINI(infiniopCreateSwiGLUDescriptor(
rsrc.handle, &desc_swiglu, gate_buf->desc()->get(), up_buf->desc()->get(), gate_buf->desc()->get()));
RUN_INFINI(infiniopGetSwiGLUWorkspaceSize(desc_swiglu, &temp_size));
workspace_size = std::max(workspace_size, temp_size);
RUN_INFINI(infiniopCreateGemmDescriptor(
rsrc.handle, &desc_ffn_down, logits_in->desc()->get(),
gate_buf->desc()->get(), rsrc.w_ffn_down[0]->desc()->get()));
RUN_INFINI(infiniopGetGemmWorkspaceSize(desc_ffn_down, &temp_size));
workspace_size = std::max(workspace_size, temp_size);
// Output and sample
infiniopRMSNormDescriptor_t desc_norm_out;
RUN_INFINI(infiniopCreateRMSNormDescriptor(
rsrc.handle, &desc_norm_out, logits_out->slice(0, 0, 1)->desc()->get(),
logits_out->slice(0, 0, 1)->desc()->get(),
rsrc.w_out_norm->desc()->get(), meta.epsilon));
RUN_INFINI(infiniopGetRMSNormWorkspaceSize(desc_norm_out, &temp_size));
workspace_size = std::max(workspace_size, temp_size);
infiniopGemmDescriptor_t desc_out_embd;
RUN_INFINI(infiniopCreateGemmDescriptor(
rsrc.handle, &desc_out_embd, prob_buf->desc()->get(),
logits_out->slice(0, 0, nreq)->desc()->get(),
rsrc.w_out_embd->desc()->get()));
RUN_INFINI(infiniopGetGemmWorkspaceSize(desc_out_embd, &temp_size));
workspace_size = std::max(workspace_size, temp_size);
infiniopRandomSampleDescriptor_t desc_sample;
RUN_INFINI(infiniopCreateRandomSampleDescriptor(
rsrc.handle, &desc_sample,
TensorDesc::create(INFINI_DTYPE_U64, {1}, {1})->get(),
TensorDesc::create(dt_logits, {dvoc}, {1})->get()));
RUN_INFINI(infiniopGetRandomSampleWorkspaceSize(desc_sample, &temp_size));
workspace_size = std::max(workspace_size, temp_size);
// Allocate workspace
RUN_INFINI(infinirtMallocAsync(&workspace, workspace_size, stream));
for (uint32_t layer = 0; layer < nlayer; layer++) {
// 1. Attention
// rms norm
RUN_INFINI(infiniopRMSNorm(
desc_norm, workspace, workspace_size,
logits_out->data(), logits_in->data(),
rsrc.w_attn_norm[layer]->data(), stream));
// qkv_proj
RUN_INFINI(infiniopGemm(
desc_attn_qkv, workspace, workspace_size,
qkv_buf->data(), logits_out->data(),
rsrc.w_attn_qkv[layer]->data(), 1.0, 0.0, stream));
// rope
RUN_INFINI(infiniopRoPE(
desc_rope_q, workspace, workspace_size,
qkv_buf->data(), qkv_buf->data(),
pos_ids_buf->data(),
rsrc.sin_table->data(),
rsrc.cos_table->data(), stream));
RUN_INFINI(infiniopRoPE(
desc_rope_k, workspace, workspace_size,
qkv_buf->data(nh * dh), qkv_buf->data(nh * dh),
pos_ids_buf->data(),
rsrc.sin_table->data(),
rsrc.cos_table->data(),
stream));
size_t token_offset = 0;
for (uint32_t req = 0; req < nreq; req++) {
auto seq_len = req_lens[req];
// self attention
RUN_INFINI(infiniopAttention(
desc_attns[req], workspace, workspace_size,
o_buf->data(token_offset * nh * dh),
qkv_buf->data(token_offset * (nh + nkvh * 2) * dh),
qkv_buf->data(token_offset * (nh + nkvh * 2) * dh + nh * dh),
qkv_buf->data(token_offset * (nh + nkvh * 2) * dh + (nh + nkvh) * dh),
kv_caches[req]->k[idev][layer]->data(),
kv_caches[req]->v[idev][layer]->data(),
stream));
token_offset += seq_len;
}
// o_proj
RUN_INFINI(infiniopGemm(
desc_attn_o, workspace, workspace_size,
logits_in->data(), o_buf->data(),
rsrc.w_attn_out[layer]->data(), 1.0, idev == 0 ? 1.0 : 0.0, stream)); // only rank 0 adds residual
// All_reduce if distributed
if (rsrc.comm != nullptr) {
RUN_INFINI(infinicclAllReduce(
logits_in->data(), logits_in->data(), ntok * d, dt_logits,
INFINICCL_SUM, rsrc.comm, stream));
}
// 2. FFN
// rms_norm
RUN_INFINI(infiniopRMSNorm(
desc_norm, workspace, workspace_size,
logits_out->data(), logits_in->data(),
rsrc.w_ffn_norm[layer]->data(), stream));
// mlp
RUN_INFINI(infiniopGemm(
desc_ffn_gate_up, workspace, workspace_size,
gate_up_buf->data(), logits_out->data(), rsrc.w_ffn_gate_up[layer]->data(),
1.0, 0.0, stream));
RUN_INFINI(infiniopSwiGLU(
desc_swiglu, workspace, workspace_size,
gate_buf->data(), up_buf->data(), gate_buf->data(), stream));
RUN_INFINI(infiniopGemm(
desc_ffn_down, workspace, workspace_size,
logits_in->data(), gate_buf->data(),
rsrc.w_ffn_down[layer]->data(), 1.0, idev == 0 ? 1.0 : 0.0, stream)); // only rank 0 adds residual
// All_reduce if distributed
if (rsrc.comm != nullptr) {
RUN_INFINI(infinicclAllReduce(
logits_in->data(), logits_in->data(), ntok * d, dt_logits,
INFINICCL_SUM, rsrc.comm, stream));
}
}
// Sample and Output
if (idev == 0) {
size_t token_offset = 0;
for (uint32_t req = 0; req < nreq; req++) {
auto seq_len = req_lens[req];
token_offset += seq_len;
RUN_INFINI(infiniopRMSNorm(
desc_norm_out, workspace, workspace_size,
logits_out->data(req * d),
logits_in->data((token_offset - 1) * d),
rsrc.w_out_norm->data(), stream));
}
RUN_INFINI(infiniopGemm(
desc_out_embd, workspace, workspace_size,
prob_buf->data(), logits_out->data(),
rsrc.w_out_embd->data(), 1.0, 0.0, stream));
std::random_device _rd;
std::mt19937 gen(_rd());
token_offset = 0;
for (uint32_t req = 0; req < nreq; req++) {
auto seq_len = req_lens[req];
float random_val = std::uniform_real_distribution<float>(0, 1)(gen);
RUN_INFINI(infiniopRandomSample(
desc_sample, workspace, workspace_size,
result_buf->data(req),
prob_buf->data(req * dvoc), random_val, topp,
topk, temperature, stream));
token_offset += seq_len;
}
RUN_INFINI(infinirtStreamSynchronize(stream));
RUN_INFINI(infinirtMemcpy(result_cpu.data(), result_buf->data(),
sizeof(uint32_t) * nreq, INFINIRT_MEMCPY_D2H));
for (uint32_t req = 0; req < nreq; req++) {
ans[req] = (uint32_t)result_cpu[req];
}
}
// Clean up
infiniopDestroyRMSNormDescriptor(desc_norm);
infiniopDestroyGemmDescriptor(desc_attn_qkv);
infiniopDestroyGemmDescriptor(desc_attn_o);
infiniopDestroyRoPEDescriptor(desc_rope_q);
infiniopDestroyRoPEDescriptor(desc_rope_k);
for (uint32_t req = 0; req < nreq; req++) {
infiniopDestroyAttentionDescriptor(desc_attns[req]);
}
infiniopDestroyRMSNormDescriptor(desc_norm_out);
infiniopDestroyGemmDescriptor(desc_out_embd);
infiniopDestroyRandomSampleDescriptor(desc_sample);
infinirtFree(workspace);
}
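// C entry point for one batched inference step: record the request in the model,
// then wake every per-device worker thread (see launchDevice) to run inferDeviceBatch.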
__C void
inferBatch(struct JiugeModel *model,
const uint32_t *tokens, uint32_t ntok,
const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
struct KVCache **kv_caches,
uint32_t *ans,
float temperature, uint32_t topk, float topp) {
model->req.tokens = tokens;
model->req.ntok = ntok;
model->req.req_lens = req_lens;
model->req.nreq = nreq;
model->req.req_pos = req_pos;
model->req.kv_caches = kv_caches;
model->req.ans = ans;
model->req.temperature = temperature;
model->req.topk = topk;
model->req.topp = topp;
for (size_t idev = 0; idev < model->dev_ids.size(); idev++) {
std::unique_lock<std::mutex> lock(model->states[idev].mtx);
model->states[idev].proceed = true;
lock.unlock();
model->states[idev].cv.notify_one();
}
}
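// Per-device worker loop: build this device's resources once, then wait on the
// condition variable and run one batch each time inferBatch signals proceed,
// until destroyJiugeModel sets exit_flag.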
void launchDevice(const JiugeMeta &meta, const JiugeWeights *weights, DeviceResource *rsrc, InferState &state, InferRequest &req,
infiniDevice_t device, int idev, int ndev, int dev_id, infinicclComm_t comm) {
createDeviceResource(rsrc, &meta, weights, device, idev, ndev, dev_id, comm);
while (true) {
std::unique_lock<std::mutex> lock(state.mtx);
state.cv.wait(lock, [&] { return state.proceed || state.exit_flag; });
if (state.exit_flag) {
break;
}
inferDeviceBatch(meta, *rsrc, idev, ndev, req.tokens, req.ntok, req.req_lens, req.nreq, req.req_pos, req.kv_caches, req.ans, req.temperature, req.topk, req.topp);
state.proceed = false;
lock.unlock();
}
infiniopDestroyHandle(rsrc->handle);
infinirtStreamDestroy(rsrc->stream);
infinicclCommDestroy(rsrc->comm);
}
JiugeModel::JiugeModel(const JiugeMeta *_meta, const JiugeWeights *weights, infiniDevice_t device, std::vector<int> device_ids) : meta(*_meta), device(device) {
int ndev = int(device_ids.size());
dev_ids = device_ids;
dev_resources = std::vector<DeviceResource>(ndev);
states = std::vector<InferState>(ndev);
threads.resize(ndev);
RUN_INFINI(infinirtInit());
auto comms = std::vector<infinicclComm_t>(ndev, nullptr);
if (ndev > 1) {
RUN_INFINI(infinicclCommInitAll(device, comms.data(), ndev, dev_ids.data()));
}
for (int i = 0; i < ndev; i++) {
threads[i] = std::thread(launchDevice, std::cref(meta), weights, &dev_resources[i], std::ref(states[i]), std::ref(req), device, i, ndev, dev_ids[i], comms[i]);
}
}
__C struct JiugeModel *
createJiugeModel(const JiugeMeta *meta,
const JiugeWeights *weights,
infiniDevice_t device,
int ndev,
const int *dev_ids) {
std::vector<int> device_ids(ndev);
std::copy(dev_ids, dev_ids + ndev, device_ids.begin());
JiugeModel *model = new JiugeModel(meta, weights, device, device_ids);
return model;
}
__C void destroyJiugeModel(struct JiugeModel *model) {
auto ndev = model->dev_resources.size();
for (size_t idev = 0; idev < ndev; idev++) {
std::unique_lock<std::mutex> lock(model->states[idev].mtx);
model->states[idev].exit_flag = true;
lock.unlock();
model->states[idev].cv.notify_one();
}
for (size_t idev = 0; idev < ndev; idev++) {
model->threads[idev].join();
}
delete model;
}
#ifndef JIUGE_IMPL_H
#define JIUGE_IMPL_H
#include "infinicore_infer.h"
#include "../../tensor.hpp"
#include <condition_variable>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>
struct DeviceResource {
// Device
infiniDevice_t device;
int device_id;
infiniopHandle_t handle;
// Weights
std::shared_ptr<Tensor> w_in_embd, w_out_norm, w_out_embd, sin_table,
cos_table;
std::vector<std::shared_ptr<Tensor>> w_attn_norm, w_attn_qkv, b_attn_qkv, w_attn_out,
w_ffn_norm, w_ffn_gate_up, w_ffn_down;
// Streams
infinirtStream_t stream;
infinicclComm_t comm;
};
struct InferState {
std::mutex mtx;
std::condition_variable cv;
bool proceed = false;
bool exit_flag = false;
};
struct InferRequest {
const uint32_t *tokens;
uint32_t ntok;
const uint32_t *req_lens;
uint32_t nreq;
const uint32_t *req_pos;
struct KVCache **kv_caches;
uint32_t *ans;
float temperature;
uint32_t topk;
float topp;
};
struct JiugeModel {
JiugeMeta meta;
infiniDevice_t device;
std::vector<int> dev_ids;
std::vector<DeviceResource> dev_resources;
std::vector<InferState> states;
std::vector<std::thread> threads;
InferRequest req;
JiugeModel(const JiugeMeta *, const JiugeWeights *, infiniDevice_t device, std::vector<int> device_ids);
};
struct KVCache {
std::vector<std::vector<std::shared_ptr<Tensor>>> k, v;
};
#endif
#include "jiuge_impl.hpp"
__C struct KVCache *createKVCache(const JiugeModel *model) {
KVCache *cache = new KVCache();
auto ndev = model->dev_resources.size();
auto nkvh = model->meta.nkvh / ndev;
auto max_len = model->meta.dctx;
auto dh = model->meta.dh;
auto shape = std::vector<size_t>{nkvh, max_len, dh};
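// Each device holds nlayer K and nlayer V tensors of shape [nkvh / ndev, dctx, dh].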
for (unsigned int idev = 0; idev < ndev; idev++) {
RUN_INFINI(infinirtSetDevice(model->device, model->dev_ids[idev]));
auto kcache = std::vector<std::shared_ptr<Tensor>>();
auto vcache = std::vector<std::shared_ptr<Tensor>>();
for (unsigned int layer = 0; layer < model->meta.nlayer; layer++) {
kcache.push_back(std::move(Tensor::buffer(model->meta.dt_mat, shape)));
vcache.push_back(std::move(Tensor::buffer(model->meta.dt_mat, shape)));
}
cache->k.push_back(kcache);
cache->v.push_back(vcache);
}
return cache;
}
__C struct KVCache *duplicateKVCache(const JiugeModel *model,
const KVCache *kv_cache,
unsigned int seq_len) {
auto new_kv_cache = createKVCache(model);
auto ndev = model->dev_resources.size();
for (unsigned int idev = 0; idev < ndev; idev++) {
RUN_INFINI(infinirtSetDevice(model->device, model->dev_ids[idev]));
for (unsigned int layer = 0; layer < model->meta.nlayer; layer++) {
new_kv_cache->k[idev][layer]
->slice(1, 0, seq_len)
->copy_from(kv_cache->k[idev][layer]->slice(1, 0, seq_len),
model->dev_resources[idev].handle);
new_kv_cache->v[idev][layer]
->slice(1, 0, seq_len)
->copy_from(kv_cache->v[idev][layer]->slice(1, 0, seq_len),
model->dev_resources[idev].handle);
}
}
return new_kv_cache;
}
__C void dropKVCache(JiugeModel const *model, KVCache *kv_cache) {
auto ndev = model->dev_resources.size();
for (unsigned int idev = 0; idev < ndev; idev++) {
RUN_INFINI(infinirtSetDevice(model->device, model->dev_ids[idev]));
for (unsigned int layer = 0; layer < model->meta.nlayer; layer++) {
kv_cache->k[idev][layer].reset();
kv_cache->v[idev][layer].reset();
}
}
delete kv_cache;
}
#ifndef JIUGE_WEIGHT_HPP
#define JIUGE_WEIGHT_HPP
#include "jiuge_impl.hpp"
#include <cmath>
inline std::shared_ptr<Tensor> get_in_embd(
JiugeMeta const *meta,
JiugeWeights const *w) {
auto shape = std::vector<size_t>({meta->dvoc, meta->d});
return Tensor::weight((char *)w->input_embd, meta->dt_logits, shape);
}
inline std::shared_ptr<Tensor> get_out_norm(
JiugeMeta const *meta,
JiugeWeights const *w) {
auto shape = std::vector<size_t>({meta->d});
return Tensor::weight((char *)w->output_norm, meta->dt_norm, shape);
}
inline std::shared_ptr<Tensor> get_out_embd(
JiugeMeta const *meta,
JiugeWeights const *w) {
auto shape = std::vector<size_t>({meta->dvoc, meta->d});
return Tensor::weight((char *)w->output_embd, meta->dt_logits, shape)
->permute({1, 0});
}
inline std::shared_ptr<Tensor> get_attn_norm(
JiugeMeta const *meta,
JiugeWeights const *w,
size_t layer) {
auto shape = std::vector<size_t>({meta->d});
return Tensor::weight((char *)(w->attn_norm[layer]), meta->dt_norm, shape);
}
inline std::shared_ptr<Tensor> get_attn_qkv(
JiugeMeta const *meta,
JiugeWeights const *w,
size_t layer, size_t idev, size_t ndev) {
auto nkvh = meta->nkvh;
auto nh = meta->nh;
auto dh = meta->dh;
auto d = meta->d;
size_t offset = idev * ((nkvh * 2 + nh) / ndev * dh) * d * dsize(meta->dt_mat);
auto shape = std::vector<size_t>({(nh + 2 * nkvh) / ndev * dh, d});
return Tensor::weight((char *)(w->attn_qkv[layer]) + offset, meta->dt_mat, shape)
->permute({1, 0});
}
inline std::shared_ptr<Tensor> get_attn_qkv_bias(
JiugeMeta const *meta,
JiugeWeights const *w,
size_t layer, size_t idev, size_t ndev) {
auto nkvh = meta->nkvh;
auto nh = meta->nh;
auto dh = meta->dh;
size_t offset = idev * ((nkvh * 2 + nh) / ndev * dh) * dsize(meta->dt_mat);
auto shape = std::vector<size_t>({1, (nh + 2 * nkvh) / ndev * dh});
return Tensor::weight((char *)(w->attn_qkv_b[layer]) + offset, meta->dt_mat, shape);
}
inline std::shared_ptr<Tensor> get_attn_o(JiugeMeta const *meta,
JiugeWeights const *w, size_t layer,
size_t idev, size_t ndev) {
auto nh = meta->nh;
auto dh = meta->dh;
auto d = meta->d;
size_t offset = idev * d * (nh / ndev * dh) * dsize(meta->dt_mat);
auto shape = std::vector<size_t>({d, nh / ndev * dh});
return Tensor::weight((char *)(w->attn_o[layer]) + offset, meta->dt_mat, shape)
->permute({1, 0});
}
inline std::shared_ptr<Tensor> get_ffn_norm(
JiugeMeta const *meta,
JiugeWeights const *w,
size_t layer) {
auto shape = std::vector<size_t>({meta->d});
return Tensor::weight((char *)(w->ffn_norm[layer]), meta->dt_norm, shape);
}
inline std::shared_ptr<Tensor> get_ffn_gate_up(
JiugeMeta const *meta,
JiugeWeights const *w,
size_t layer, size_t idev, size_t ndev) {
auto di = meta->di;
auto d = meta->d;
size_t offset = idev * (2 * di / ndev) * d * dsize(meta->dt_mat);
auto shape = std::vector<size_t>({2 * di / ndev, d});
return Tensor::weight((char *)(w->ffn_gate_up[layer]) + offset,
meta->dt_mat, shape)
->permute({1, 0});
}
inline std::shared_ptr<Tensor> get_ffn_down(
JiugeMeta const *meta,
JiugeWeights const *w,
size_t layer, size_t idev, size_t ndev) {
auto di = meta->di;
auto d = meta->d;
size_t offset = idev * d * (di / ndev) * dsize(meta->dt_mat);
auto shape = std::vector<size_t>({d, di / ndev});
return Tensor::weight((char *)(w->ffn_down[layer]) + offset, meta->dt_mat, shape)
->permute({1, 0});
}
inline std::shared_ptr<Tensor> get_sin_table(JiugeMeta const *meta) {
float *table = (float *)std::malloc(meta->dctx * meta->dh * sizeof(float));
auto half_dh = meta->dh / 2;
for (size_t i = 0; i < meta->dctx; i++) {
for (size_t j = 0; j < half_dh; j++) {
float _sin = std::sin(
static_cast<float>(i) / std::pow(meta->theta, static_cast<float>(j) / half_dh));
table[i * meta->dh + 2 * j] = _sin;
table[i * meta->dh + 2 * j + 1] = _sin;
}
}
auto shape = std::vector<size_t>({meta->dctx, meta->dh});
auto tensor = Tensor::weight(table, meta->dt_logits, shape);
std::free(table);
return tensor;
}
inline std::shared_ptr<Tensor> get_cos_table(JiugeMeta const *meta) {
float *table = (float *)std::malloc(meta->dctx * meta->dh * sizeof(float));
auto half_dh = meta->dh / 2;
for (size_t i = 0; i < meta->dctx; i++) {
for (size_t j = 0; j < half_dh; j++) {
float _cos = std::cos(
static_cast<float>(i) / std::pow(meta->theta, static_cast<float>(j) / half_dh));
table[i * meta->dh + 2 * j] = _cos;
table[i * meta->dh + 2 * j + 1] = _cos;
}
}
auto shape = std::vector<size_t>({meta->dctx, meta->dh});
auto tensor = Tensor::weight(table, meta->dt_logits, shape);
std::free(table);
return tensor;
}
#endif
#ifndef INFER_TENSOR_H
#define INFER_TENSOR_H
#include "infinicore_infer.h"
#include "utils.hpp"
#include <memory>
#include <string>
#include <vector>
struct Storage {
void *memory;
size_t size;
infiniDevice_t device_type;
int device_id;
static std::shared_ptr<Storage> create(size_t size);
static std::shared_ptr<Storage> createAsync(size_t size, infinirtStream_t stream = nullptr);
static std::shared_ptr<Storage> createHost(size_t size);
~Storage();
};
struct SliceParams {
size_t dim;
size_t start;
size_t len;
};
class TensorDesc {
private:
infiniopTensorDescriptor_t _desc;
public:
static std::shared_ptr<TensorDesc>
create(infiniDtype_t dtype, const std::vector<size_t> &shape,
const std::vector<ptrdiff_t> &strides);
infiniopTensorDescriptor_t get() const { return _desc; };
~TensorDesc();
};
class Tensor : public std::enable_shared_from_this<Tensor> {
private:
infiniDtype_t _dtype;
std::vector<size_t> _shape;
std::vector<ptrdiff_t> _strides;
void *_data;
ptrdiff_t _offset;
size_t _size;
std::shared_ptr<Storage> storage;
infiniopTensorDescriptor_t _desc;
void *data_impl(ptrdiff_t offset) const;
std::shared_ptr<Tensor>
slice_impl(const std::vector<SliceParams> &slices) const;
public:
static std::shared_ptr<Tensor> buffer(infiniDtype_t dtype,
const std::vector<size_t> &shape,
infinirtStream_t stream = nullptr);
static std::shared_ptr<Tensor> weight(void *host_data,
infiniDtype_t dtype,
const std::vector<size_t> &shape);
std::shared_ptr<Tensor> slice(size_t dim, size_t start, size_t len);
std::shared_ptr<Tensor const> slice(size_t dim, size_t start,
size_t len) const;
std::shared_ptr<Tensor> slice(const std::vector<SliceParams> &slices);
std::shared_ptr<Tensor const>
slice(const std::vector<SliceParams> &slices) const;
std::shared_ptr<Tensor> dim_merge(size_t dim_start, size_t dim_end);
std::shared_ptr<Tensor> dim_split(size_t dim,
const std::vector<size_t> &dims);
std::shared_ptr<Tensor> permute(const std::vector<size_t> &order);
void *data(ptrdiff_t offset = 0);
void const *data(ptrdiff_t offset = 0) const;
void copy_from(std::shared_ptr<Tensor const> src, infiniopHandle_t handle,
infinirtStream_t stream = nullptr);
const std::vector<size_t> &shape() const;
const std::vector<ptrdiff_t> &strides() const;
size_t ndim() const;
infiniDtype_t dtype() const;
std::shared_ptr<TensorDesc> desc() const;
size_t byte_size() const;
ptrdiff_t data_offset() const;
infiniDevice_t device_type() const;
int device_id() const;
bool is_contigous() const;
void debug(const std::string &filename) const;
void debug() const;
~Tensor();
};
inline size_t dsize(infiniDtype_t dtype) {
switch (dtype) {
case INFINI_DTYPE_INVALID:
return 0;
case INFINI_DTYPE_BYTE:
return 1;
case INFINI_DTYPE_BOOL:
return 1;
case INFINI_DTYPE_I8:
return 1;
case INFINI_DTYPE_I16:
return 2;
case INFINI_DTYPE_I32:
return 4;
case INFINI_DTYPE_I64:
return 8;
case INFINI_DTYPE_U8:
return 1;
case INFINI_DTYPE_U16:
return 2;
case INFINI_DTYPE_U32:
return 4;
case INFINI_DTYPE_U64:
return 8;
case INFINI_DTYPE_F8:
return 1;
case INFINI_DTYPE_F16:
return 2;
case INFINI_DTYPE_F32:
return 4;
case INFINI_DTYPE_F64:
return 8;
case INFINI_DTYPE_C16:
return 2;
case INFINI_DTYPE_C32:
return 4;
case INFINI_DTYPE_C64:
return 8;
case INFINI_DTYPE_C128:
return 16;
case INFINI_DTYPE_BF16:
return 2;
default:
return 0;
}
}
#endif
#include "../tensor.hpp"
std::shared_ptr<Storage> Storage::create(size_t size) {
auto storage = std::make_shared<Storage>();
RUN_INFINI(infinirtMalloc(&storage->memory, size));
storage->size = size;
RUN_INFINI(infinirtGetDevice(&storage->device_type, &storage->device_id));
return storage;
}
std::shared_ptr<Storage> Storage::createAsync(size_t size, infinirtStream_t stream) {
auto storage = std::make_shared<Storage>();
RUN_INFINI(infinirtMallocAsync(&storage->memory, size, stream));
storage->size = size;
RUN_INFINI(infinirtGetDevice(&storage->device_type, &storage->device_id));
return storage;
}
std::shared_ptr<Storage> Storage::createHost(size_t size) {
auto storage = std::make_shared<Storage>();
RUN_INFINI(infinirtMallocHost(&storage->memory, size));
storage->size = size;
storage->device_type = INFINI_DEVICE_CPU;
storage->device_id = 0;
return storage;
}
Storage::~Storage() {
if (device_type == INFINI_DEVICE_CPU) {
RUN_INFINI(infinirtFreeHost(memory));
} else {
RUN_INFINI(infinirtFree(memory));
}
}
#include "../tensor.hpp"
#include "../utils.hpp"
#include <fstream>
#include <iostream>
#include <numeric>
std::shared_ptr<TensorDesc>
TensorDesc::create(infiniDtype_t dtype, const std::vector<size_t> &shape,
const std::vector<ptrdiff_t> &strides) {
auto desc = std::make_shared<TensorDesc>();
infiniopCreateTensorDescriptor(&desc->_desc, shape.size(), shape.data(),
strides.data(), dtype);
return desc;
}
TensorDesc::~TensorDesc() {
infiniopDestroyTensorDescriptor(this->_desc);
}
const std::vector<size_t> &Tensor::shape() const { return this->_shape; }
const std::vector<ptrdiff_t> &Tensor::strides() const { return this->_strides; }
size_t Tensor::ndim() const { return this->_shape.size(); }
infiniDtype_t Tensor::dtype() const { return this->_dtype; }
size_t Tensor::byte_size() const { return this->_size; }
infiniDevice_t Tensor::device_type() const { return this->storage->device_type; }
int Tensor::device_id() const { return this->storage->device_id; }
Tensor::~Tensor() {}
ptrdiff_t Tensor::data_offset() const {
return (char *)(this->_data) - (char *)(this->storage->memory);
}
std::shared_ptr<TensorDesc> Tensor::desc() const { return TensorDesc::create(this->_dtype, this->_shape, this->_strides); }
std::shared_ptr<Tensor> Tensor::buffer(infiniDtype_t dtype,
const std::vector<size_t> &shape,
infinirtStream_t stream) {
std::shared_ptr<Tensor> tensor = std::make_shared<Tensor>();
tensor->_dtype = dtype;
auto ndim = shape.size();
if (shape.empty()) {
tensor->_shape = std::vector<size_t>{1};
ndim = 1;
} else {
tensor->_shape = std::vector<size_t>(shape);
}
size_t size = std::accumulate(shape.begin(), shape.end(), dsize(dtype), std::multiplies<size_t>());
auto strides = std::vector<ptrdiff_t>(ndim);
strides[ndim - 1] = 1;
for (int i = ndim - 2; i >= 0; i--) {
strides[i] = strides[i + 1] * shape[i + 1];
}
tensor->_strides = strides;
tensor->storage = Storage::createAsync(size, stream);
tensor->_size = size;
tensor->_data = tensor->storage->memory;
infiniopCreateTensorDescriptor(&tensor->_desc, ndim, tensor->_shape.data(),
strides.data(), dtype);
tensor->_offset = 0;
return tensor;
}
std::shared_ptr<Tensor> Tensor::weight(void *data, infiniDtype_t dtype,
const std::vector<size_t> &shape) {
std::shared_ptr<Tensor> tensor = std::make_shared<Tensor>();
tensor->_dtype = dtype;
auto ndim = shape.size();
if (shape.empty()) {
tensor->_shape = std::vector<size_t>{1};
ndim = 1;
} else {
tensor->_shape = std::vector<size_t>(shape);
}
size_t size = std::accumulate(shape.begin(), shape.end(), dsize(dtype), std::multiplies<size_t>());
auto strides = std::vector<ptrdiff_t>(ndim);
strides[ndim - 1] = 1;
for (int i = ndim - 2; i >= 0; i--) {
strides[i] = strides[i + 1] * shape[i + 1];
}
tensor->_strides = strides;
tensor->storage = Storage::create(size);
RUN_INFINI(infinirtMemcpy(tensor->storage->memory,
data, size, INFINIRT_MEMCPY_H2D));
tensor->_data = tensor->storage->memory;
tensor->_size = size;
infiniopCreateTensorDescriptor(&tensor->_desc, ndim, tensor->_shape.data(),
strides.data(), dtype);
tensor->_offset = 0;
return tensor;
}
void *Tensor::data_impl(ptrdiff_t offset) const {
ASSERT(offset * dsize(this->dtype()) < this->_size);
return (char *)(this->_data) + offset * dsize(this->dtype());
}
void *Tensor::data(ptrdiff_t offset) {
return this->data_impl(offset);
}
const void *Tensor::data(ptrdiff_t offset) const {
return this->data_impl(offset);
}
void Tensor::copy_from(std::shared_ptr<Tensor const> src,
infiniopHandle_t handle, infinirtStream_t stream) {
ASSERT_EQ(this->shape(), src->shape());
ASSERT_EQ(this->dtype(), src->dtype());
infiniopRearrangeDescriptor_t desc;
RUN_INFINI(infiniopCreateRearrangeDescriptor(
handle, &desc, this->desc()->get(), src->desc()->get()));
RUN_INFINI(infiniopRearrange(desc, this->data(), src->data(),
stream));
RUN_INFINI(infiniopDestroyRearrangeDescriptor(desc));
}
bool Tensor::is_contigous() const {
auto ndim = this->ndim();
auto shape = this->shape();
auto strides = std::vector<ptrdiff_t>(ndim);
strides[ndim - 1] = 1;
for (int i = ndim - 2; i >= 0; i--) {
strides[i] = strides[i + 1] * shape[i + 1];
}
ASSERT_EQ(strides.size(), this->_strides.size());
return std::equal(strides.begin(), strides.end(), this->_strides.begin());
}
template <typename T>
void print_data(T *data, const std::vector<size_t> &shape,
const std::vector<ptrdiff_t> &strides, size_t dim) {
if (dim == shape.size() - 1) {
for (size_t i = 0; i < shape[dim]; i++) {
std::cout << data[i * strides[dim]] << " ";
}
std::cout << std::endl;
} else if (dim < shape.size() - 1) {
for (size_t i = 0; i < shape[dim]; i++) {
print_data(data + i * strides[dim], shape, strides, dim + 1);
std::cout << std::endl;
}
}
}
template <>
void print_data(uint16_t const *data, const std::vector<size_t> &shape,
const std::vector<ptrdiff_t> &strides, size_t dim) {
if (dim == shape.size() - 1) {
for (size_t i = 0; i < shape[dim]; i++) {
std::cout << f16_to_f32(data[i * strides[dim]]) << " ";
}
} else if (dim < shape.size() - 1) {
for (size_t i = 0; i < shape[dim]; i++) {
print_data(data + i * strides[dim], shape, strides, dim + 1);
std::cout << std::endl;
}
}
}
void Tensor::debug(const std::string &filename) const {
RUN_INFINI(
infinirtDeviceSynchronize());
std::cout << "Tensor: "
<< "shape[ ";
for (auto s : this->shape()) {
std::cout << s << " ";
}
std::cout << "] strides[ ";
for (auto s : this->strides()) {
std::cout << s << " ";
}
std::cout << "] dtype=" << this->dtype()
<< " device=" << this->device_type()
<< " device_id=" << this->device_id() << std::endl;
auto dtype = this->dtype();
void const *cpu_data;
if (this->device_type() != INFINI_DEVICE_CPU) {
void *cpu_memory = std::malloc(this->storage->size);
RUN_INFINI(infinirtMemcpy(cpu_memory, this->storage->memory,
this->storage->size, INFINIRT_MEMCPY_D2H));
cpu_data = cpu_memory;
} else {
cpu_data = this->data();
}
if (!filename.empty()) {
std::ofstream outFile(filename, std::ios::binary);
if (!outFile) {
std::cerr << "Error opening file for writing: " << filename << "\n";
return;
}
outFile.write(reinterpret_cast<const char *>(cpu_data), this->storage->size);
outFile.close();
std::cout << "Data written to file: " << filename << "\n";
return;
}
switch (dtype) {
case INFINI_DTYPE_F16:
print_data((uint16_t const *)((char const *)cpu_data + data_offset()),
this->shape(), this->strides(), 0);
break;
case INFINI_DTYPE_F32:
print_data((float const *)((char const *)cpu_data + data_offset()),
this->shape(), this->strides(), 0);
break;
case INFINI_DTYPE_U64:
print_data((uint64_t const *)((char const *)cpu_data + data_offset()),
this->shape(), this->strides(), 0);
break;
case INFINI_DTYPE_I64:
print_data((int64_t const *)((char const *)cpu_data + data_offset()),
this->shape(), this->strides(), 0);
break;
case INFINI_DTYPE_U32:
print_data((uint32_t const *)((char const *)cpu_data + data_offset()),
this->shape(), this->strides(), 0);
break;
case INFINI_DTYPE_I32:
print_data((int32_t const *)((char const *)cpu_data + data_offset()),
this->shape(), this->strides(), 0);
break;
default:
PANIC("Unsupported data type");
}
}
void Tensor::debug() const { this->debug(""); }
#include "../tensor.hpp"
#include "../utils.hpp"
#include <algorithm>
#include <numeric>
#include <vector>
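// Slicing returns a view: the result shares this tensor's storage and strides;
// only the shape and the byte offset into the storage change.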
std::shared_ptr<Tensor> Tensor::slice_impl(const std::vector<SliceParams> &slices) const {
std::shared_ptr<Tensor> tensor = std::make_shared<Tensor>();
auto new_shape = std::vector<size_t>(this->_shape);
ptrdiff_t offset = 0;
for (const auto &slice : slices) {
ASSERT(slice.len > 0);
ASSERT(this->_shape[slice.dim] >= slice.start + slice.len);
new_shape[slice.dim] = slice.len;
offset += slice.start * this->_strides[slice.dim];
}
tensor->_dtype = this->_dtype;
tensor->_shape = new_shape;
tensor->_strides = std::vector<ptrdiff_t>(this->_strides);
tensor->_offset = offset * dsize(this->_dtype);
tensor->_data = static_cast<char *>(this->_data) + tensor->_offset;
tensor->_size = std::accumulate(new_shape.begin(), new_shape.end(),
dsize(this->_dtype), std::multiplies<size_t>());
tensor->storage = this->storage;
infiniopCreateTensorDescriptor(&tensor->_desc, tensor->_shape.size(), tensor->_shape.data(),
tensor->_strides.data(), tensor->_dtype);
return tensor;
}
std::shared_ptr<Tensor> Tensor::slice(size_t dim, size_t start, size_t len) {
return this->slice_impl({{dim, start, len}});
}
std::shared_ptr<Tensor const> Tensor::slice(size_t dim, size_t start, size_t len) const {
return this->slice_impl({{dim, start, len}});
}
std::shared_ptr<Tensor> Tensor::slice(const std::vector<SliceParams> &slices) {
return this->slice_impl(slices);
}
std::shared_ptr<Tensor const> Tensor::slice(const std::vector<SliceParams> &slices) const {
return this->slice_impl(slices);
}
std::shared_ptr<Tensor> Tensor::dim_merge(size_t dim_start, size_t dim_end) {
ASSERT(dim_start <= dim_end && dim_end < this->_shape.size());
if (dim_start == dim_end) {
return shared_from_this();
}
auto new_shape = std::vector<size_t>();
auto new_strides = std::vector<ptrdiff_t>();
for (size_t i = 0; i < dim_start; i++) {
new_shape.push_back(this->_shape[i]);
new_strides.push_back(this->_strides[i]);
}
for (size_t i = dim_start + 1; i <= dim_end; i++) {
ASSERT_EQ(this->_strides[i - 1], ptrdiff_t(this->_shape[i]) * this->_strides[i]);
}
new_shape.push_back(std::accumulate(this->_shape.begin() + dim_start, this->_shape.begin() + dim_end + 1, 1, std::multiplies<size_t>()));
new_strides.push_back(this->_strides[dim_end]);
for (size_t i = dim_end + 1; i < this->_shape.size(); i++) {
new_shape.push_back(this->_shape[i]);
new_strides.push_back(this->_strides[i]);
}
this->_shape = new_shape;
this->_strides = new_strides;
infiniopDestroyTensorDescriptor(this->_desc);
infiniopCreateTensorDescriptor(&this->_desc, this->_shape.size(), this->_shape.data(),
this->_strides.data(), this->_dtype);
return shared_from_this();
}
std::shared_ptr<Tensor> Tensor::dim_split(size_t dim, const std::vector<size_t> &dims) {
ASSERT_EQ(this->_shape[dim], std::accumulate(dims.begin(), dims.end(), size_t(1), std::multiplies<size_t>()));
auto new_shape = std::vector<size_t>();
auto new_strides = std::vector<ptrdiff_t>();
for (size_t i = 0; i < dim; i++) {
new_shape.push_back(this->_shape[i]);
new_strides.push_back(this->_strides[i]);
}
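// Row-major split of the chosen dimension: each new dimension's stride is the
// original stride times the product of the sub-dimensions to its right.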
for (size_t i = 0; i < dims.size(); i++) {
new_shape.push_back(dims[i]);
new_strides.push_back(this->_strides[dim] * this->_shape[dim] / std::accumulate(dims.begin(), dims.begin() + i + 1, 1, std::multiplies<size_t>()));
}
for (size_t i = dim + 1; i < this->_shape.size(); i++) {
new_shape.push_back(this->_shape[i]);
new_strides.push_back(this->_strides[i]);
}
this->_shape = new_shape;
this->_strides = new_strides;
infiniopDestroyTensorDescriptor(this->_desc);
infiniopCreateTensorDescriptor(&this->_desc, this->_shape.size(), this->_shape.data(),
this->_strides.data(), this->_dtype);
return shared_from_this();
}
std::shared_ptr<Tensor> Tensor::permute(const std::vector<size_t> &order) {
ASSERT_EQ(this->_shape.size(), order.size());
auto new_shape = std::vector<size_t>(order.size());
auto new_strides = std::vector<ptrdiff_t>(order.size());
for (size_t i = 0; i < order.size(); i++) {
ASSERT(std::find(order.begin(), order.end(), i) != order.end());
new_shape[i] = this->_shape[order[i]];
new_strides[i] = this->_strides[order[i]];
}
this->_shape = new_shape;
this->_strides = new_strides;
infiniopDestroyTensorDescriptor(this->_desc);
infiniopCreateTensorDescriptor(&this->_desc, this->_shape.size(), this->_shape.data(),
this->_strides.data(), this->_dtype);
return shared_from_this();
}
#ifndef INFINICORE_INFER_UTILS_H
#define INFINICORE_INFER_UTILS_H
#include <infinicore.h>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
inline void assert_true(int expr, const char *msg, const char *file, int line) {
if (!expr) {
fprintf(stderr, "\033[31mAssertion failed:\033[0m %s at file %s, line %d\n", msg, file, line);
exit(EXIT_FAILURE);
}
}
#define ASSERT(expr) assert_true((expr), #expr " is false", __FILE__, __LINE__)
#define ASSERT_EQ(a, b) assert_true((a) == (b), #a " != " #b, __FILE__, __LINE__)
#define ASSERT_VALID_PTR(a) assert_true((a) != nullptr, #a " is nullptr", __FILE__, __LINE__)
#define PANIC(EXPR) \
printf("Error at %s:%d - %s\n", __FILE__, __LINE__, #EXPR); \
exit(EXIT_FAILURE)
#define RUN_INFINI(API) \
do { \
auto api_result_ = (API); \
if (api_result_ != INFINI_STATUS_SUCCESS) { \
std::cerr << "Error Code " << api_result_ << " in `" << #API << "`" \
<< " from " << __func__ \
<< " at " << __FILE__ << ":" << __LINE__ << std::endl; \
exit(EXIT_FAILURE); \
} \
} while (0)
inline float f16_to_f32(uint16_t h) {
uint32_t sign = (h & 0x8000) << 16; // Extract the sign bit
int32_t exponent = (h >> 10) & 0x1F; // Extract the exponent
uint32_t mantissa = h & 0x3FF; // Extract the mantissa (fraction part)
if (exponent == 31) { // Special case for Inf and NaN
if (mantissa != 0) {
// NaN: Set float32 NaN
uint32_t f32 = sign | 0x7F800000 | (mantissa << 13);
return *(float *)&f32;
} else {
// Infinity
uint32_t f32 = sign | 0x7F800000;
return *(float *)&f32;
}
} else if (exponent == 0) { // Subnormal float16 or zero
if (mantissa == 0) {
// Zero (positive or negative)
uint32_t f32 = sign; // Just return signed zero
return *(float *)&f32;
} else {
// Subnormal: Convert to normalized float32
exponent = -14; // Set exponent for subnormal numbers
while ((mantissa & 0x400) == 0) { // Normalize mantissa
mantissa <<= 1;
exponent--;
}
mantissa &= 0x3FF; // Clear the leading 1 bit
uint32_t f32 = sign | ((exponent + 127) << 23) | (mantissa << 13);
return *(float *)&f32;
}
} else {
// Normalized float16
uint32_t f32 = sign | ((exponent + 127 - 15) << 23) | (mantissa << 13);
return *(float *)&f32;
}
}
#endif
local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")
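-- Build the shared library with `xmake`; `xmake install` copies it and the public
-- headers into INFINI_ROOT (defaulting to ~/.infini when the environment variable is unset).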
target("infinicore_infer")
set_kind("shared")
add_includedirs(INFINI_ROOT.."/include")
add_linkdirs(INFINI_ROOT.."/lib")
add_links("infiniop", "infinirt", "infiniccl")
set_languages("cxx17")
set_warnings("all", "error")
add_files("src/models/*/*.cpp")
add_files("src/tensor/*.cpp")
add_includedirs("include")
set_installdir(INFINI_ROOT)
add_installfiles("include/infinicore_infer.h", {prefixdir = "include"})
add_installfiles("include/infinicore_infer/*.h", {prefixdir = "include/infinicore_infer"})
target_end()