jerrrrry / infinilm / Commits / 22804eaa

Unverified commit 22804eaa, authored Sep 02, 2025 by blkmjsian; committed by GitHub, Sep 02, 2025.

Commit message:

    [T2-3-1]blkmjsian
    - deepseek - jiuge 4B awq

Parent: 5c6000ec
Showing 10 changed files with 783 additions and 40 deletions (+783 −40).
    src/models/inference_context.cpp            +87  −13
    src/models/inference_context.hpp            +51   −5
    src/models/jiuge/jiuge.cpp                  +12  −11
    src/models/jiuge/jiuge_impl.hpp              +3   −5
    src/models/jiuge_awq/jiuge_awq.cpp         +397   −0
    src/models/jiuge_awq/jiuge_awq.hpp          +82   −0
    src/models/jiuge_awq/jiuge_awq_weight.cpp  +128   −0
    src/tensor.hpp                               +2   −1
    src/tensor/tensor.cpp                       +19   −5
    xmake.lua                                    +2   −0
src/models/inference_context.cpp (view file @ 22804eaa)
```diff
@@ -2,12 +2,12 @@
 #include "../tensor.hpp"
 #include "../utils.hpp"
 
-InferenceContext::InferenceContext(DeviceResource *rsrc, CacheManager *cache_manager,
-                                   infinirtStream_t stream)
-    : rsrc(rsrc), cache_manager(cache_manager), stream(stream) {}
+InferenceContext::InferenceContext(infiniopHandle_t op_handle_,
+                                   std::shared_ptr<MemoryPool> memory_pool_,
+                                   CacheManager *cache_manager, infinirtStream_t stream)
+    : op_handle(op_handle_), memory_pool(memory_pool_),
+      cache_manager(cache_manager), stream(stream) {}
 
 void InferenceContext::ensure_workspace(size_t required_size) {
     if (required_size > current_workspace_size || !workspace_storage) {
-        workspace_storage = Storage::createFromPool(required_size, rsrc->memory_pool);
+        workspace_storage = Storage::createFromPool(required_size, memory_pool);
         current_workspace_size = required_size;
     }
 }
```
```diff
@@ -19,7 +19,7 @@ void InferenceContext::add(std::shared_ptr<Tensor> c,
     infiniopAddDescriptor_t desc;
     if (!cache_manager->getAddDescriptor(key, desc)) {
-        RUN_INFINI(infiniopCreateAddDescriptor(rsrc->handle, &desc, c->desc(), a->desc(), b->desc()));
+        RUN_INFINI(infiniopCreateAddDescriptor(op_handle, &desc, c->desc(), a->desc(), b->desc()));
         cache_manager->putAddDescriptor(key, desc);
     }
```
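Every operator wrapper in this file gets the same mechanical change: descriptor creation now uses the context's own `op_handle` instead of reaching through `rsrc->handle`, which is what frees `InferenceContext` from the model-specific `DeviceResource`. The caching idea itself is untouched: descriptors are keyed by the participating tensors' metadata and reused across calls. A minimal sketch of that pattern under assumed semantics (simplified names, not the library's actual classes):

```cpp
#include <cstddef>
#include <unordered_map>

// Sketch only: a descriptor cache keyed by a hash of the tensors'
// shapes/strides/dtypes. Desc stands for any infiniop descriptor handle;
// makeDesc builds one when the key is missing.
template <typename Desc, typename MakeFn>
Desc getOrCreate(std::unordered_map<size_t, Desc> &cache, size_t key, MakeFn makeDesc) {
    auto it = cache.find(key);
    if (it == cache.end()) {
        it = cache.emplace(key, makeDesc()).first; // build once, reuse on later calls
    }
    return it->second;
}
```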
```diff
@@ -42,7 +42,7 @@ void InferenceContext::rmsnorm(std::shared_ptr<Tensor> y,
     infiniopRMSNormDescriptor_t desc;
     if (!cache_manager->getRMSNormDescriptor(key, desc)) {
-        RUN_INFINI(infiniopCreateRMSNormDescriptor(rsrc->handle, &desc, y->desc(),
-                                                   x->desc(), w->desc(), epsilon));
+        RUN_INFINI(infiniopCreateRMSNormDescriptor(op_handle, &desc, y->desc(),
+                                                   x->desc(), w->desc(), epsilon));
         cache_manager->putRMSNormDescriptor(key, desc);
     }
```
```diff
@@ -64,7 +64,7 @@ void InferenceContext::gemm(std::shared_ptr<Tensor> c,
     infiniopGemmDescriptor_t desc;
     if (!cache_manager->getGemmDescriptor(key, desc)) {
-        RUN_INFINI(infiniopCreateGemmDescriptor(rsrc->handle, &desc, c->desc(), a->desc(), b->desc()));
+        RUN_INFINI(infiniopCreateGemmDescriptor(op_handle, &desc, c->desc(), a->desc(), b->desc()));
         cache_manager->putGemmDescriptor(key, desc);
     }
```
```diff
@@ -84,7 +84,7 @@ void InferenceContext::rearrange(std::shared_ptr<Tensor> dst,
     infiniopRearrangeDescriptor_t desc;
     if (!cache_manager->getRearrangeDescriptor(key, desc)) {
-        RUN_INFINI(infiniopCreateRearrangeDescriptor(rsrc->handle, &desc, dst->desc(), src->desc()));
+        RUN_INFINI(infiniopCreateRearrangeDescriptor(op_handle, &desc, dst->desc(), src->desc()));
         cache_manager->putRearrangeDescriptor(key, desc);
     }
```
```diff
@@ -105,7 +105,7 @@ void InferenceContext::rope(std::shared_ptr<Tensor> q,
     infiniopRoPEDescriptor_t desc;
     if (!cache_manager->getRoPEDescriptor(key, desc)) {
-        RUN_INFINI(infiniopCreateRoPEDescriptor(rsrc->handle, &desc, q->desc(), k->desc(),
+        RUN_INFINI(infiniopCreateRoPEDescriptor(op_handle, &desc, q->desc(), k->desc(),
                                                 pos->desc(), sin->desc(), cos->desc()));
         cache_manager->putRoPEDescriptor(key, desc);
     }
```
```diff
@@ -121,6 +121,32 @@ void InferenceContext::rope(std::shared_ptr<Tensor> q,
                             sin->data(), cos->data(), stream));
 }
 
+void InferenceContext::rope_v2(std::shared_ptr<Tensor> q, std::shared_ptr<Tensor> k,
+                               std::shared_ptr<Tensor> pos, std::shared_ptr<Tensor> sin,
+                               std::shared_ptr<Tensor> cos) {
+    size_t key = CacheManager::createDescriptorKey(q, k, pos, sin, cos);
+    infiniopRoPEv2Descriptor_t desc;
+    if (!cache_manager->getRoPEv2Descriptor(key, desc)) {
+        RUN_INFINI(infiniopCreateRoPEv2Descriptor(op_handle, &desc, q->desc(), k->desc(),
+                                                  pos->desc(), sin->desc(), cos->desc()));
+        cache_manager->putRoPEv2Descriptor(key, desc);
+    }
+    size_t workspace_size = 0;
+    RUN_INFINI(infiniopGetRoPEv2WorkspaceSize(desc, &workspace_size));
+    ensure_workspace(workspace_size);
+    void *workspace = workspace_storage->memory();
+    RUN_INFINI(infiniopRoPEv2(desc, workspace, workspace_size, q->data(), k->data(),
+                              pos->data(), sin->data(), cos->data(), stream));
+}
```
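The three workspace-using operators added by this commit (`rope_v2` above, `topkrouter` and `dequant` below) all follow the same protocol: query the descriptor's workspace size, grow the context's shared scratch buffer if needed, then hand the raw buffer to the kernel. Because `ensure_workspace` only ever grows the allocation and every call is issued on the same stream, one scratch buffer can be shared across all three operators. A condensed sketch of the call sequence, where `infiniopGetXWorkspaceSize` / `infiniopX` are placeholders standing in for any of the three ops:

```cpp
// Sketch of the common protocol; "X" stands for RoPEv2 / Topkrouter / Dequantize.
size_t workspace_size = 0;
RUN_INFINI(infiniopGetXWorkspaceSize(desc, &workspace_size)); // 1. ask the op for its scratch size
ensure_workspace(workspace_size);                             // 2. grow (never shrink) the shared buffer
void *workspace = workspace_storage->memory();                // 3. pass the raw pointer to the kernel
RUN_INFINI(infiniopX(desc, workspace, workspace_size, /* tensor args... */ stream));
```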
```diff
@@ -128,7 +154,7 @@ void InferenceContext::causalSoftmax(std::shared_ptr<Tensor> y, std::shared_ptr<Tensor> x)
     size_t key = CacheManager::createDescriptorKey(y, x);
     infiniopCausalSoftmaxDescriptor_t desc;
     if (!cache_manager->getCausalSoftmaxDescriptor(key, desc)) {
-        RUN_INFINI(infiniopCreateCausalSoftmaxDescriptor(rsrc->handle, &desc, y->desc(), x->desc()));
+        RUN_INFINI(infiniopCreateCausalSoftmaxDescriptor(op_handle, &desc, y->desc(), x->desc()));
         cache_manager->putCausalSoftmaxDescriptor(key, desc);
     }
```
```diff
@@ -141,6 +167,31 @@ void InferenceContext::causalSoftmax(std::shared_ptr<Tensor> y,
                                       y->data(), x->data(), stream));
 }
 
+void InferenceContext::topkrouter(std::shared_ptr<Tensor> values,  // F32
+                                  std::shared_ptr<Tensor> indices, // I32
+                                  std::shared_ptr<Tensor> x,
+                                  std::shared_ptr<Tensor> correction_bias, // F32
+                                  float routed_scaling_factor,
+                                  size_t topk) {
+    size_t key = CacheManager::createDescriptorKey(values, indices, x, correction_bias);
+    infiniopTopkrouterDescriptor_t desc;
+    if (!cache_manager->getTopkrouterDescriptor(key, desc)) {
+        RUN_INFINI(infiniopCreateTopkrouterDescriptor(op_handle, &desc, x->desc(),
+                                                      correction_bias->desc()));
+        cache_manager->putTopkrouterDescriptor(key, desc);
+    }
+    size_t workspace_size = 0;
+    RUN_INFINI(infiniopGetTopkrouterWorkspaceSize(desc, &workspace_size));
+    ensure_workspace(workspace_size);
+    void *workspace = workspace_storage->memory();
+    RUN_INFINI(infiniopTopkrouter(desc, workspace, workspace_size, values->data(),
+                                  indices->data(), x->data(), correction_bias->data(),
+                                  routed_scaling_factor, topk, stream));
+}
```
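`topkrouter` is new plumbing for mixture-of-experts routing: given per-token hidden states `x` and a `correction_bias`, it writes routing weights into `values` (F32) and the selected expert indices into `indices` (I32), scaled by `routed_scaling_factor`. The commit message mentions DeepSeek, whose routers use exactly this bias-corrected top-k selection, but nothing in this diff actually calls the function, so the kernel semantics beyond the signature are an inference. A hypothetical call site, with made-up shapes purely for illustration:

```cpp
// Hypothetical usage (not in this diff): route ntok tokens, picking 8 experts each.
// The {ntok, 8} output shapes are assumptions about the kernel's layout.
auto values  = Tensor::buffer(INFINI_DTYPE_F32, {ntok, 8}, ctx.memory_pool);
auto indices = Tensor::buffer(INFINI_DTYPE_I32, {ntok, 8}, ctx.memory_pool);
ctx.topkrouter(values, indices, x, correction_bias,
               /*routed_scaling_factor=*/2.5f, /*topk=*/8);
```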
```diff
@@ -149,7 +200,7 @@ void InferenceContext::swiglu(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> up, std::shared_ptr<Tensor> gate)
     infiniopSwiGLUDescriptor_t desc;
     if (!cache_manager->getSwiGLUDescriptor(key, desc)) {
-        RUN_INFINI(infiniopCreateSwiGLUDescriptor(rsrc->handle, &desc, out->desc(), up->desc(), gate->desc()));
+        RUN_INFINI(infiniopCreateSwiGLUDescriptor(op_handle, &desc, out->desc(), up->desc(), gate->desc()));
         cache_manager->putSwiGLUDescriptor(key, desc);
     }
```
```diff
@@ -170,7 +221,7 @@ void InferenceContext::randomSample(std::shared_ptr<Tensor> out,
     infiniopRandomSampleDescriptor_t desc;
     if (!cache_manager->getRandomSampleDescriptor(key, desc)) {
-        RUN_INFINI(infiniopCreateRandomSampleDescriptor(rsrc->handle, &desc, out->desc(), prob->desc()));
+        RUN_INFINI(infiniopCreateRandomSampleDescriptor(op_handle, &desc, out->desc(), prob->desc()));
         cache_manager->putRandomSampleDescriptor(key, desc);
     }
```
```diff
@@ -209,8 +260,8 @@ void InferenceContext::linear(std::shared_ptr<Tensor> c,
     if (beta == 0.0) {
         gemm(c, a, b, alpha, 1.0);
     } else {
-        auto c_copy = Tensor::buffer(c->dtype(), c->shape(), rsrc->memory_pool);
-        c_copy->copyFrom(c, rsrc->handle, stream);
+        auto c_copy = Tensor::buffer(c->dtype(), c->shape(), memory_pool);
+        c_copy->copyFrom(c, op_handle, stream);
         gemm(c, a, b, alpha, beta);
         add(c, c, c_copy);
     }
```
```diff
@@ -231,3 +282,26 @@ void InferenceContext::linear(std::shared_ptr<Tensor> c,
         add(c, c, bias->view_as(c->shape(), strides));
     }
 }
+
+void InferenceContext::dequant(std::shared_ptr<Tensor> weight, std::shared_ptr<Tensor> in_w,
+                               std::shared_ptr<Tensor> in_s, std::shared_ptr<Tensor> in_z) {
+    size_t key = CacheManager::createDescriptorKey(weight, in_w, in_s, in_z);
+    infiniopDequantizeDescriptor_t desc;
+    if (!cache_manager->getDequantizeDescriptor(key, desc)) {
+        RUN_INFINI(infiniopCreateDequantizeDescriptor(op_handle, &desc, weight->desc(),
+                                                      in_w->desc(), in_s->desc(), in_z->desc()));
+        cache_manager->putDequantizeDescriptor(key, desc);
+    }
+    size_t workspace_size = 0;
+    RUN_INFINI(infiniopGetDequantizeWorkspaceSize(desc, &workspace_size));
+    ensure_workspace(workspace_size);
+    void *workspace = workspace_storage->memory();
+    RUN_INFINI(infiniopDequantize(desc, workspace, workspace_size, weight->data(),
+                                  in_w->data(), in_s->data(), in_z->data(), 0, 0, 0, stream));
+}
```
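`dequant` expands a packed AWQ weight back to the activation dtype: `in_w` carries the packed 4-bit values, `in_s` the per-group F16 scales, and `in_z` the packed zero points (see `QuantInt4Weight` and the shapes registered in `jiuge_awq_weight.cpp` below). Assuming the usual AWQ group-quantization rule with group size g (`quant_group_size`), the recovered element is:

```latex
% Assumed AWQ dequantization rule, group size g; q and z are the 4-bit
% values unpacked from in_w and in_z respectively:
W_{i,j} = \bigl(q_{i,j} - z_{\lfloor i/g\rfloor,\,j}\bigr)\, s_{\lfloor i/g\rfloor,\,j}
```

The three literal `0` scalar arguments to `infiniopDequantize` are not explained anywhere in this diff.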
src/models/inference_context.hpp (view file @ 22804eaa)
```diff
 #pragma once
-#include "cache_manager.hpp"
-#include "jiuge/jiuge_impl.hpp"
-#include "jiuge/jiuge_weight.hpp"
+#include "../cache_manager/opcache_manager.hpp"
 #include <cassert>
 
 struct InferenceContext {
-    DeviceResource *rsrc;
+    infiniopHandle_t op_handle;
+    std::shared_ptr<MemoryPool> memory_pool;
     CacheManager *cache_manager;
     infinirtStream_t stream;
+    std::shared_ptr<Storage> workspace_storage;
+    size_t current_workspace_size = 0;
 
-    InferenceContext(DeviceResource *rsrc, CacheManager *cache_manager, infinirtStream_t stream);
+    InferenceContext(infiniopHandle_t op_handle, std::shared_ptr<MemoryPool> memory_pool,
+                     CacheManager *cache_manager, infinirtStream_t stream);
     void ensure_workspace(size_t required_size);
```
```diff
@@ -34,8 +34,21 @@ struct InferenceContext {
               std::shared_ptr<Tensor> pos, std::shared_ptr<Tensor> sin,
               std::shared_ptr<Tensor> cos);
+    void rope_v2(std::shared_ptr<Tensor> q, std::shared_ptr<Tensor> k,
+                 std::shared_ptr<Tensor> pos, std::shared_ptr<Tensor> sin,
+                 std::shared_ptr<Tensor> cos);
     void causalSoftmax(std::shared_ptr<Tensor> y, std::shared_ptr<Tensor> x);
+    void topkrouter(std::shared_ptr<Tensor> values,  // F32
+                    std::shared_ptr<Tensor> indices, // I32
+                    std::shared_ptr<Tensor> x,
+                    std::shared_ptr<Tensor> correction_bias, // F32
+                    float routed_scaling_factor,
+                    size_t topk);
     void swiglu(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> up,
                 std::shared_ptr<Tensor> gate);
```
```diff
@@ -49,6 +62,10 @@ struct InferenceContext {
                 float alpha, float beta,
                 std::shared_ptr<Tensor> residual, std::shared_ptr<Tensor> bias);
+    void dequant(std::shared_ptr<Tensor> weight, std::shared_ptr<Tensor> in_w,
+                 std::shared_ptr<Tensor> in_s, std::shared_ptr<Tensor> in_z);
 };
```
```diff
 namespace {
@@ -88,10 +105,31 @@ inline void rope(std::shared_ptr<Tensor> q, std::shared_ptr<Tensor> k,
     getInferenceContext().rope(q, k, pos, sin, cos);
 }
 
+inline void rope_v2(std::shared_ptr<Tensor> q, std::shared_ptr<Tensor> k,
+                    std::shared_ptr<Tensor> pos, std::shared_ptr<Tensor> sin,
+                    std::shared_ptr<Tensor> cos) {
+    getInferenceContext().rope_v2(q, k, pos, sin, cos);
+}
+
 inline void causalSoftmax(std::shared_ptr<Tensor> y, std::shared_ptr<Tensor> x) {
     getInferenceContext().causalSoftmax(y, x);
 }
 
+inline void topkrouter(std::shared_ptr<Tensor> values,  // F32
+                       std::shared_ptr<Tensor> indices, // I32
+                       std::shared_ptr<Tensor> x,
+                       std::shared_ptr<Tensor> correction_bias, // F32
+                       float routed_scaling_factor,
+                       size_t topk) {
+    getInferenceContext().topkrouter(values, indices, x, correction_bias,
+                                     routed_scaling_factor, topk);
+}
+
 inline void swiglu(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> up,
                    std::shared_ptr<Tensor> gate) {
     getInferenceContext().swiglu(out, up, gate);
```
```diff
@@ -107,3 +145,11 @@ inline void linear(std::shared_ptr<Tensor> c, std::shared_ptr<Tensor> a,
                    std::shared_ptr<Tensor> residual, std::shared_ptr<Tensor> bias) {
     getInferenceContext().linear(c, a, b, alpha, beta, residual, bias);
 }
+
+inline void dequant_linear(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> x,
+                           std::shared_ptr<Tensor> w_w, std::shared_ptr<Tensor> w_s,
+                           std::shared_ptr<Tensor> w_z, float alpha, float beta,
+                           std::shared_ptr<Tensor> residual, std::shared_ptr<Tensor> bias) {
+    auto w = Tensor::buffer(x->dtype(), {x->shape()[1], out->shape()[1]},
+                            getInferenceContext().memory_pool);
+    getInferenceContext().dequant(w, w_w, w_s, w_z);
+    getInferenceContext().linear(out, x, w, alpha, beta, residual, bias);
+}
```
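`dequant_linear` is the workhorse the AWQ model uses for every projection: it materializes a full-precision weight `w` of shape `{in_features, out_features}` (read off `x->shape()[1]` and `out->shape()[1]`) from the memory pool, expands the packed int4 data into it with `dequant`, then runs an ordinary `linear`. Dequantizing on the fly means the fp16 weight exists only for the duration of the call, trading extra per-layer compute for a much smaller resident footprint. The AWQ forward pass in `jiuge_awq.cpp` below invokes it like this for the q-projection:

```cpp
// From the AWQ forward pass below: q_proj as a dequantized linear, optional bias.
dequant_linear(q_buf, logits_out,
               weight->w_attn_q[layer]->w, weight->w_attn_q[layer]->s, weight->w_attn_q[layer]->z,
               1.0, 0.0, nullptr,
               has_qkv_bias ? weight->b_attn_q[layer] : nullptr);
```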
src/models/jiuge/jiuge.cpp (view file @ 22804eaa)
```diff
@@ -10,7 +10,7 @@
 #include <thread>
 #include <vector>
 
-void createDeviceResource(DeviceResource *rsrc, const JiugeMeta *meta,
+void createDeviceResource(JiugeDeviceResource *rsrc, const JiugeMeta *meta,
                           const JiugeWeights *weights, infiniDevice_t device,
                           int idev, int ndev, int dev_id,
```
```diff
@@ -44,7 +44,7 @@ void createDeviceResource(DeviceResource *rsrc, const JiugeMeta *meta,
     auto memory_pool = std::make_shared<MemoryPool>(128 * 1024 * 1024);
-    *rsrc = DeviceResource{
+    *rsrc = JiugeDeviceResource{
         device,
         dev_id,
         handle,
```
```diff
@@ -67,7 +67,7 @@ void createDeviceResource(DeviceResource *rsrc, const JiugeMeta *meta,
     RUN_INFINI(infinirtDeviceSynchronize());
 }
 
-void releaseDeviceResource(DeviceResource &res) {
+void releaseDeviceResource(JiugeDeviceResource &res) {
     infinirtDeviceSynchronize();
     // Release individual Tensors
     res.w_in_embd.reset();
```
```diff
@@ -111,7 +111,7 @@ void releaseDeviceResource(DeviceResource &res) {
     res.comm = nullptr;
 }
 
-void inferDeviceBatch(const JiugeMeta &meta, DeviceResource &rsrc,
+void inferDeviceBatch(const JiugeMeta &meta, JiugeDeviceResource &rsrc,
                       uint32_t idev, uint32_t ndev,
                       const uint32_t *tokens, uint32_t ntok,
                       const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
```
```diff
@@ -298,7 +298,7 @@ void inferDeviceBatch(const JiugeMeta &meta, DeviceResource &rsrc,
 }
 
 __C void
-inferBatch(struct JiugeModel *model,
+inferBatchJiuge(struct JiugeModel *model,
            const uint32_t *tokens, uint32_t ntok,
            const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
            struct KVCache **kv_caches,
```
```diff
@@ -331,7 +331,7 @@ inferBatch(struct JiugeModel *model,
 }
 
 __C void
-forwardBatch(struct JiugeModel *model,
+forwardBatchJiuge(struct JiugeModel *model,
              const uint32_t *tokens, uint32_t ntok,
              const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
              struct KVCache **kv_caches,
```
```diff
@@ -362,16 +362,17 @@ forwardBatch(struct JiugeModel *model,
     }
 }
 
-void launchDevice(const JiugeMeta &meta, const JiugeWeights *weights, DeviceResource *rsrc,
-                  InferState &state, InferRequest &req,
+void launchDevice(const JiugeMeta &meta, const JiugeWeights *weights, JiugeDeviceResource *rsrc,
+                  InferState &state, InferRequest &req,
                   infiniDevice_t device, int idev, int ndev, int dev_id, infinicclComm_t comm) {
+    // Create Device Resource
+    createDeviceResource(rsrc, &meta, weights, device, idev, ndev, dev_id, comm);
     CacheManager cache_manager(100);
-    InferenceContext ctx(rsrc, &cache_manager, rsrc->stream);
+    InferenceContext ctx(rsrc->handle, rsrc->memory_pool, &cache_manager, rsrc->stream);
     // Set the inference context for this thread
     setInferenceContext(&ctx);
-    // Create Device Resource
-    createDeviceResource(rsrc, &meta, weights, device, idev, ndev, dev_id, comm);
     {
         std::unique_lock<std::mutex> lock(state.mtx);
         state.loaded = true;
```
```diff
@@ -406,7 +407,7 @@ JiugeModel::JiugeModel(const JiugeMeta *_meta, const JiugeWeights *weights, infi
     int ndev = int(device_ids.size());
     device = device_;
     dev_ids = device_ids;
-    dev_resources = std::vector<DeviceResource>(ndev);
+    dev_resources = std::vector<JiugeDeviceResource>(ndev);
     states = std::vector<InferState>(ndev);
     threads.resize(ndev);
     RUN_INFINI(infinirtInit());
```
src/models/jiuge/jiuge_impl.hpp (view file @ 22804eaa)
```diff
@@ -12,7 +12,7 @@
 #include <thread>
 #include <vector>
 
-struct DeviceResource {
+struct JiugeDeviceResource {
     // Device
     infiniDevice_t device;
    int device_id;
```
```diff
@@ -56,7 +56,7 @@ struct JiugeModel {
     JiugeMeta meta;
     infiniDevice_t device;
     std::vector<int> dev_ids;
-    std::vector<DeviceResource> dev_resources;
+    std::vector<JiugeDeviceResource> dev_resources;
     std::vector<InferState> states;
     std::vector<std::thread> threads;
     InferRequest req;
```
```diff
@@ -64,8 +64,6 @@ struct JiugeModel {
     JiugeModel(const JiugeMeta *, const JiugeWeights *, infiniDevice_t device,
                std::vector<int> device_ids);
 };
 
 struct KVCache {
     std::vector<std::vector<std::shared_ptr<Tensor>>> k, v;
 };
 
 #include "../../cache.hpp"
 #endif
```
src/models/jiuge_awq/jiuge_awq.cpp (new file, 0 → 100644; view file @ 22804eaa)
```cpp
#include "jiuge_awq.hpp"

#include "../../tensor.hpp"
#include "../../utils.hpp"
#include "../inference_context.hpp"

#include <random>
#include <thread>
#include <vector>

void createDeviceResource(DeviceResource *rsrc, const JiugeAWQMeta *meta,
                          std::shared_ptr<JiugeAWQDeviceWeight> weights,
                          infiniDevice_t device, int idev, int ndev, int dev_id,
                          infinicclComm_t comm) {
    RUN_INFINI(infinirtSetDevice(device, dev_id));
    infiniopHandle_t handle;
    infiniopCreateHandle(&handle);
    infinirtStream_t stream;
    infinirtStreamCreate(&stream);

    auto memory_pool = std::make_shared<MemoryPool>(128 * 1024 * 1024);

    *rsrc = DeviceResource{
        device,
        dev_id,
        handle,
        weights,
        stream,
        comm,
        memory_pool,
    };
    RUN_INFINI(infinirtDeviceSynchronize());
}

void releaseDeviceResource(DeviceResource &res) {
    infinirtDeviceSynchronize();
    // Release individual Tensors
    infiniopDestroyHandle(res.handle);
    res.handle = nullptr;
    infinirtStreamDestroy(res.stream);
    res.stream = nullptr;
    infinicclCommDestroy(res.comm);
    res.comm = nullptr;
}

void inferDeviceBatch(const JiugeAWQMeta *meta, DeviceResource &rsrc,
                      uint32_t idev, uint32_t ndev,
                      const uint32_t *tokens, uint32_t ntok,
                      const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
                      struct KVCache **kv_caches,
                      const float *temperature, const uint32_t *topk, const float *topp,
                      uint32_t *output, void *last_logits) {
    auto nlayer = meta->nlayer;
    auto nkvh = meta->nkvh / ndev;
    auto nh = meta->nh / ndev;
    auto ngroup = nh / nkvh;
    // auto dctx = meta.dctx;
    auto dh = meta->dh;
    auto d = meta->d;
    auto dt_logits = meta->dt_logits;
    auto di = meta->di / ndev;
    auto dvoc = meta->dvoc;
    auto stream = rsrc.stream;
    auto weight = rsrc.weights;
    bool has_qkv_bias = meta->has_qkv_bias;

    // Allocate buffers
    auto logits_in = Tensor::buffer(dt_logits, {ntok, d}, rsrc.memory_pool);
    auto logits_out = Tensor::buffer(dt_logits, {ntok, d}, rsrc.memory_pool);
    auto q_buf = Tensor::buffer(dt_logits, {ntok, nh * dh}, rsrc.memory_pool);
    auto k_buf = Tensor::buffer(dt_logits, {ntok, nkvh * dh}, rsrc.memory_pool);
    auto v_buf = Tensor::buffer(dt_logits, {ntok, nkvh * dh}, rsrc.memory_pool);
    auto gate_buf = Tensor::buffer(dt_logits, {ntok, di}, rsrc.memory_pool);
    auto up_buf = Tensor::buffer(dt_logits, {ntok, di}, rsrc.memory_pool);
    auto o_buf = Tensor::buffer(dt_logits, {ntok, nh * dh}, rsrc.memory_pool);
    auto prob_buf = Tensor::buffer(dt_logits, {nreq, dvoc}, rsrc.memory_pool);
    auto result_buf = Tensor::buffer(INFINI_DTYPE_I64, {nreq}, rsrc.memory_pool);
    auto result_cpu = std::vector<int64_t>(nreq);

    // Prepare inputs
    auto batch_pos_ids = std::vector<uint32_t>(ntok);
    size_t req_start = 0;
    for (uint32_t req = 0; req < nreq; req++) {
        for (uint32_t i = 0; i < req_lens[req]; i++) {
            batch_pos_ids[req_start + i] = req_pos[req] + i;
        }
        req_start += req_lens[req];
    }

    std::shared_ptr<Tensor> pos_ids_buf;
    if (rsrc.device == INFINI_DEVICE_CPU) {
        pos_ids_buf = Tensor::weight(batch_pos_ids.data(), INFINI_DTYPE_U32, {ntok});
    } else {
        pos_ids_buf = Tensor::buffer(INFINI_DTYPE_U32, {ntok}, rsrc.memory_pool);
        RUN_INFINI(infinirtMemcpyAsync(pos_ids_buf->data(), batch_pos_ids.data(),
                                       sizeof(uint32_t) * ntok, INFINIRT_MEMCPY_H2D, stream));
    }
    for (uint32_t i = 0; i < ntok; i++) {
        RUN_INFINI(infinirtMemcpyAsync(logits_in->data(i * d),
                                       weight->w_in_embd->data(tokens[i] * d),
                                       dsize(dt_logits) * d, INFINIRT_MEMCPY_D2D, stream));
    }
```
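Two details of this setup are worth noting. Position ids are staged differently per backend: on CPU the host vector is wrapped directly via `Tensor::weight`, while on accelerators they are copied host-to-device asynchronously on the compute stream. The embedding lookup is then a gather implemented as one async device-to-device row copy per token, with `Tensor::data(offset)` evidently acting as an element-offset pointer:

```cpp
// What the gather loop above computes (d = hidden size):
//   logits_in[i, :] = w_in_embd[tokens[i], :]   for each i in [0, ntok)
// i.e. each iteration enqueues one d-element embedding-row copy on `stream`.
```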
```cpp
    // Attention
    // attention inner
    size_t max_qk_size = 0;
    size_t max_seq_len = 0;
    for (uint32_t req = 0; req < nreq; req++) {
        auto past_len = req_pos[req];
        auto seq_len = req_lens[req];
        auto total_len = past_len + seq_len;
        max_qk_size = std::max(max_qk_size, size_t(seq_len * total_len));
        max_seq_len = std::max(max_seq_len, size_t(seq_len));
    }
    auto qk_buf = Tensor::buffer(dt_logits, {nh, max_qk_size}, rsrc.memory_pool);
    auto rearrange_q_buf = Tensor::buffer(dt_logits, {nkvh, ngroup * max_seq_len, dh}, rsrc.memory_pool);
    auto q_rearrange = rearrange_q_buf->view({nkvh, ngroup, max_seq_len, dh});
    auto attn_val_buf = Tensor::buffer(dt_logits, {nkvh, ngroup * max_seq_len, dh}, rsrc.memory_pool);
    auto attn_val_gemm = attn_val_buf->view({nkvh, ngroup, max_seq_len, dh});

    // Compute
    for (uint32_t layer = 0; layer < nlayer; layer++) {
        // 1. Attention
        // rms norm
        rmsnorm(logits_out, logits_in, weight->w_attn_norm[layer], meta->epsilon);
        // qkv_proj
        dequant_linear(q_buf, logits_out,
                       weight->w_attn_q[layer]->w, weight->w_attn_q[layer]->s, weight->w_attn_q[layer]->z,
                       1.0, 0.0, nullptr,
                       has_qkv_bias ? weight->b_attn_q[layer] : nullptr);
        dequant_linear(k_buf, logits_out,
                       weight->w_attn_k[layer]->w, weight->w_attn_k[layer]->s, weight->w_attn_k[layer]->z,
                       1.0, 0.0, nullptr,
                       has_qkv_bias ? weight->b_attn_k[layer] : nullptr);
        dequant_linear(v_buf, logits_out,
                       weight->w_attn_v[layer]->w, weight->w_attn_v[layer]->s, weight->w_attn_v[layer]->z,
                       1.0, 0.0, nullptr,
                       has_qkv_bias ? weight->b_attn_v[layer] : nullptr);
        // rope
        rope_v2(q_buf->view({ntok, nh, dh}), q_buf->view({ntok, nh, dh}),
                pos_ids_buf, weight->sin_table, weight->cos_table);
        rope_v2(k_buf->view({ntok, nkvh, dh}), k_buf->view({ntok, nkvh, dh}),
                pos_ids_buf, weight->sin_table, weight->cos_table);

        size_t token_offset = 0;
        for (uint32_t req = 0; req < nreq; req++) {
            auto past_len = req_pos[req];
            auto seq_len = req_lens[req];
            auto total_len = past_len + seq_len;
            auto o = o_buf->slice({{0, token_offset, seq_len}})
                         ->view({seq_len, nkvh, ngroup, dh})
                         ->permute({1, 2, 0, 3});
            auto q = q_buf->slice({{0, token_offset, seq_len}})
                         ->view({seq_len, nkvh, ngroup, dh})
                         ->permute({1, 2, 0, 3});
            auto k = k_buf->slice({{0, token_offset, seq_len}})->view({seq_len, nkvh, dh});
            auto v = v_buf->slice({{0, token_offset, seq_len}})->view({seq_len, nkvh, dh});
            // self attention
            // concat
            rearrange(kv_caches[req]->k[idev][layer]->slice(0, past_len, seq_len), k);
            rearrange(kv_caches[req]->v[idev][layer]->slice(0, past_len, seq_len), v);
            // qk
            rearrange(q_rearrange->slice(2, 0, seq_len), q);
            auto qk_gemm = qk_buf->slice(1, 0, seq_len * total_len)
                               ->view({nkvh, ngroup * seq_len, total_len});
            auto k_gemm = kv_caches[req]->k[idev][layer]->slice(0, 0, total_len)->permute({1, 2, 0});
            linear(qk_gemm, rearrange_q_buf->slice(1, 0, ngroup * seq_len), k_gemm,
                   1.f / float(sqrt(dh)), 0.f, nullptr, nullptr);
            // softmax
            auto qk_softmax = qk_buf->slice(1, 0, seq_len * total_len)
                                  ->view({nh, seq_len, total_len});
            causalSoftmax(qk_softmax, qk_softmax);
            auto v_gemm = kv_caches[req]->v[idev][layer]->slice(0, 0, total_len)->permute({1, 0, 2});
            linear(attn_val_buf->slice(1, 0, ngroup * seq_len), qk_gemm, v_gemm,
                   1.f, 0.f, nullptr, nullptr);
            // rearrange attn val
            rearrange(o, attn_val_gemm->slice(2, 0, seq_len));
            token_offset += seq_len;
        }
```
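The per-request attention uses a grouped-query layout: queries are viewed as `{nkvh, ngroup, seq_len, dh}` with `ngroup = nh / nkvh`, so the score and value products each run as one batched GEMM per KV head covering all of its query groups, operating directly against the KV-cache slices. Per head this is ordinary causal scaled dot-product attention, with the `1.f / float(sqrt(dh))` scale folded into the first `linear` call's alpha:

```latex
% What the qk-GEMM, causalSoftmax, and value-GEMM above compute per head:
O = \operatorname{causalSoftmax}\!\left(\frac{Q K^{\top}}{\sqrt{d_h}}\right) V
```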
```cpp
        // o_proj
        dequant_linear(logits_in, o_buf,
                       weight->w_attn_out[layer]->w, weight->w_attn_out[layer]->s, weight->w_attn_out[layer]->z,
                       1.0, 0.0, idev == 0 ? logits_in : nullptr, nullptr);
        // only rank 0 adds residual
        // All_reduce if distributed
        if (rsrc.comm != nullptr) {
            RUN_INFINI(infinicclAllReduce(logits_in->data(), logits_in->data(), ntok * d,
                                          dt_logits, INFINICCL_SUM, rsrc.comm, stream));
            RUN_INFINI(infinirtStreamSynchronize(stream));
        }

        // 2. FFN
        rmsnorm(logits_out, logits_in, weight->w_ffn_norm[layer], meta->epsilon);
        dequant_linear(gate_buf, logits_out,
                       weight->w_ffn_gate[layer]->w, weight->w_ffn_gate[layer]->s, weight->w_ffn_gate[layer]->z,
                       1.0, 0.0, nullptr, nullptr);
        dequant_linear(up_buf, logits_out,
                       weight->w_ffn_up[layer]->w, weight->w_ffn_up[layer]->s, weight->w_ffn_up[layer]->z,
                       1.0, 0.0, nullptr, nullptr);
        swiglu(gate_buf, up_buf, gate_buf);
        dequant_linear(logits_in, gate_buf,
                       weight->w_ffn_down[layer]->w, weight->w_ffn_down[layer]->s, weight->w_ffn_down[layer]->z,
                       1.0, 0.0, idev == 0 ? logits_in : nullptr, nullptr);
        // only rank 0 adds residual
        // All_reduce if distributed
        if (rsrc.comm != nullptr) {
            RUN_INFINI(infinicclAllReduce(logits_in->data(), logits_in->data(), ntok * d,
                                          dt_logits, INFINICCL_SUM, rsrc.comm, stream));
            RUN_INFINI(infinirtStreamSynchronize(stream));
        }
    }

    // Sample and Output
    if (idev == 0) {
        if (last_logits != nullptr) {
            rmsnorm(logits_out, logits_in, weight->w_out_norm, meta->epsilon);
            auto last_logits_buf = Tensor::buffer(dt_logits, {ntok, dvoc}, rsrc.memory_pool);
            linear(last_logits_buf, logits_out, weight->w_out_embd, 1.0, 0.0, nullptr, nullptr);
            RUN_INFINI(infinirtStreamSynchronize(stream));
            RUN_INFINI(infinirtMemcpy(last_logits, last_logits_buf->data(),
                                      dsize(dt_logits) * ntok * dvoc, INFINIRT_MEMCPY_D2H));
        }
        if (output != nullptr) {
            size_t token_offset = 0;
            for (uint32_t req = 0; req < nreq; req++) {
                auto seq_len = req_lens[req];
                token_offset += seq_len;
                rmsnorm(logits_out->slice(0, req, 1),
                        logits_in->slice(0, token_offset - 1, 1),
                        weight->w_out_norm, meta->epsilon);
            }
            linear(prob_buf, logits_out->slice(0, 0, nreq), weight->w_out_embd,
                   1.0, 0.0, nullptr, nullptr);
            std::random_device _rd;
            std::mt19937 gen(_rd());
            token_offset = 0;
            for (uint32_t req = 0; req < nreq; req++) {
                auto seq_len = req_lens[req];
                float random_val = std::uniform_real_distribution<float>(0, 1)(gen);
                randomSample(result_buf->slice(0, req, 1)->view_as({}, {}),
                             prob_buf->slice(0, req, 1)->view_as({dvoc}, {1}),
                             random_val, topp[req], topk[req], temperature[req]);
                token_offset += seq_len;
            }
            RUN_INFINI(infinirtStreamSynchronize(stream));
            RUN_INFINI(infinirtMemcpy(result_cpu.data(), result_buf->data(),
                                      sizeof(int64_t) * nreq, INFINIRT_MEMCPY_D2H));
            for (uint32_t req = 0; req < nreq; req++) {
                output[req] = uint32_t(result_cpu[req]);
            }
        }
    }
}
```
```cpp
__C void
inferBatchJiugeAWQ(struct JiugeAWQModel *model,
                   const uint32_t *tokens, uint32_t ntok,
                   const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
                   struct KVCache **kv_caches,
                   const float *temperature, const uint32_t *topk, const float *topp,
                   uint32_t *output) {
    model->req.tokens = tokens;
    model->req.ntok = ntok;
    model->req.req_lens = req_lens;
    model->req.nreq = nreq;
    model->req.req_pos = req_pos;
    model->req.kv_caches = kv_caches;
    model->req.output = output;
    model->req.logits = nullptr;
    model->req.temperature = temperature;
    model->req.topk = topk;
    model->req.topp = topp;

    for (size_t idev = 0; idev < model->dev_ids.size(); idev++) {
        std::unique_lock<std::mutex> lock(model->states[idev].mtx);
        model->states[idev].proceed = true;
        lock.unlock();
        model->states[idev].cv_start.notify_one();
    }
    for (size_t i = model->dev_ids.size(); i > 0; i--) {
        auto idev = i - 1;
        std::unique_lock<std::mutex> lock(model->states[idev].mtx);
        model->states[idev].cv_done.wait(lock, [&] { return !(model->states[idev].proceed); });
        lock.unlock();
    }
}

__C void
forwardBatchJiugeAWQ(struct JiugeAWQModel *model,
                     const uint32_t *tokens, uint32_t ntok,
                     const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
                     struct KVCache **kv_caches,
                     void *logits) {
    model->req.tokens = tokens;
    model->req.ntok = ntok;
    model->req.req_lens = req_lens;
    model->req.nreq = nreq;
    model->req.req_pos = req_pos;
    model->req.kv_caches = kv_caches;
    model->req.output = nullptr;
    model->req.logits = logits;
    model->req.temperature = nullptr;
    model->req.topk = nullptr;
    model->req.topp = nullptr;

    for (size_t idev = 0; idev < model->dev_ids.size(); idev++) {
        std::unique_lock<std::mutex> lock(model->states[idev].mtx);
        model->states[idev].proceed = true;
        lock.unlock();
        model->states[idev].cv_start.notify_one();
    }
    for (size_t i = model->dev_ids.size(); i > 0; i--) {
        auto idev = i - 1;
        std::unique_lock<std::mutex> lock(model->states[idev].mtx);
        model->states[idev].cv_done.wait(lock, [&] { return !(model->states[idev].proceed); });
        lock.unlock();
    }
}

void launchDevice(const JiugeAWQMeta *meta, std::shared_ptr<JiugeAWQDeviceWeight> weights,
                  DeviceResource *rsrc, InferState &state, InferRequest &req,
                  infiniDevice_t device, int idev, int ndev, int dev_id, infinicclComm_t comm) {
    // Create Device Resource
    createDeviceResource(rsrc, meta, weights, device, idev, ndev, dev_id, comm);
    CacheManager cache_manager(100);
    InferenceContext ctx(rsrc->handle, rsrc->memory_pool, &cache_manager, rsrc->stream);
    // Set the inference context for this thread
    setInferenceContext(&ctx);

    {
        std::unique_lock<std::mutex> lock(state.mtx);
        state.loaded = true;
        lock.unlock();
        state.cv_load.notify_one();
    }

    // Infer Loop
    while (true) {
        std::unique_lock<std::mutex> lock(state.mtx);
        state.cv_start.wait(lock, [&] { return state.proceed || state.exit_flag; });
        // quit if exit_flag is set
        if (state.exit_flag) {
            break;
        }
        inferDeviceBatch(meta, *rsrc, idev, ndev,
                         req.tokens, req.ntok, req.req_lens, req.nreq, req.req_pos,
                         req.kv_caches, req.temperature, req.topk, req.topp,
                         req.output, req.logits);
        state.proceed = false;
        lock.unlock();
        state.cv_done.notify_one();
    }

    // Clean-Up
    releaseDeviceResource(*rsrc);
    setInferenceContext(nullptr); // Clear the context when done
}
```
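Each device runs `launchDevice` on its own thread; `inferBatchJiugeAWQ` and `forwardBatchJiugeAWQ` wake every worker by setting `proceed` under the per-device mutex, then block on `cv_done` until each worker clears it again. Stripped to its synchronization skeleton, the handshake is (a self-contained sketch, assumed semantics):

```cpp
#include <condition_variable>
#include <mutex>

// Minimal sketch of the per-device request/response handshake used above.
struct Handshake {
    std::mutex mtx;
    std::condition_variable cv_start, cv_done;
    bool proceed = false;

    void submit() { // caller: publish one request and wake the worker
        { std::lock_guard<std::mutex> g(mtx); proceed = true; }
        cv_start.notify_one();
    }
    void wait_done() { // caller: block until the worker resets `proceed`
        std::unique_lock<std::mutex> lk(mtx);
        cv_done.wait(lk, [&] { return !proceed; });
    }
    template <typename Work>
    void serve_once(Work work) { // worker: wait, run the batch, acknowledge
        std::unique_lock<std::mutex> lk(mtx);
        cv_start.wait(lk, [&] { return proceed; });
        work();
        proceed = false;
        lk.unlock();
        cv_done.notify_one();
    }
};
```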
```cpp
JiugeAWQModel::JiugeAWQModel(const JiugeAWQMeta *meta, const ModelWeights *weights_) {
    auto weights = (JiugeAWQWeights *)(weights_);
    device = weights->device();
    dev_ids = weights->dev_ids();
    int ndev = int(dev_ids.size());
    dev_resources = std::vector<DeviceResource>(ndev);
    states = std::vector<InferState>(ndev);
    threads.resize(ndev);
    auto comms = std::vector<infinicclComm_t>(ndev, nullptr);
    if (ndev > 1) {
        RUN_INFINI(infinicclCommInitAll(device, comms.data(), ndev, dev_ids.data()));
    }
    for (int i = 0; i < ndev; i++) {
        threads[i] = std::thread(launchDevice, meta, weights->device_weights()[i],
                                 &dev_resources[i], std::ref(states[i]), std::ref(req),
                                 device, i, ndev, dev_ids[i], comms[i]);
    }
    for (int i = 0; i < ndev; i++) {
        std::unique_lock<std::mutex> lock(states[i].mtx);
        states[i].cv_load.wait(lock, [&] { return states[i].loaded; });
        lock.unlock();
    }
}

__C struct JiugeAWQModel *
createJiugeAWQModel(const JiugeAWQMeta *meta, const ModelWeights *weights) {
    JiugeAWQModel *model = new JiugeAWQModel(meta, weights);
    return model;
}

__C void destroyJiugeAWQModel(struct JiugeAWQModel *model) {
    auto ndev = model->dev_resources.size();
    for (size_t idev = 0; idev < ndev; idev++) {
        std::unique_lock<std::mutex> lock(model->states[idev].mtx);
        model->states[idev].exit_flag = true;
        lock.unlock();
        model->states[idev].cv_start.notify_one();
    }
    for (size_t idev = 0; idev < ndev; idev++) {
        model->threads[idev].join();
    }
    delete model;
}
```
src/models/jiuge_awq/jiuge_awq.hpp (new file, 0 → 100644; view file @ 22804eaa)
```cpp
#pragma once

#include "infinicore_infer/models/jiuge_awq.h"

#include "../../cache.hpp"
#include "../../dataloader/weights_loader.hpp"

#include <condition_variable>
#include <mutex>
#include <thread>

struct QuantInt4Weight {
    std::shared_ptr<Tensor> w, s, z;
};

struct JiugeAWQDeviceWeight {
    std::shared_ptr<Tensor> w_in_embd, w_out_norm, w_out_embd, sin_table, cos_table;
    std::vector<std::shared_ptr<Tensor>> w_attn_norm, b_attn_q, b_attn_k, b_attn_v, w_ffn_norm;
    std::vector<std::shared_ptr<QuantInt4Weight>> w_attn_q, w_attn_k, w_attn_v, w_attn_out,
        w_ffn_gate, w_ffn_up, w_ffn_down;
};

class JiugeAWQWeights : public infinicore::WeightsLoader {
private:
    std::vector<std::shared_ptr<JiugeAWQDeviceWeight>> _device_weights;

public:
    JiugeAWQWeights(const JiugeAWQMeta *meta, infiniDevice_t device,
                    const std::vector<int> &dev_ids);
    std::vector<std::shared_ptr<JiugeAWQDeviceWeight>> &device_weights() {
        return _device_weights;
    }
};

struct DeviceResource {
    // Device
    infiniDevice_t device;
    int device_id;
    infiniopHandle_t handle;
    // Weights
    std::shared_ptr<JiugeAWQDeviceWeight> weights;
    // Streams
    infinirtStream_t stream;
    // Communicator
    infinicclComm_t comm;
    std::shared_ptr<MemoryPool> memory_pool;
};

struct InferRequest {
    const uint32_t *tokens;
    uint32_t ntok;
    const uint32_t *req_lens;
    uint32_t nreq;
    const uint32_t *req_pos;
    struct KVCache **kv_caches;
    const float *temperature;
    const uint32_t *topk;
    const float *topp;
    uint32_t *output;
    void *logits;
};

struct InferState {
    std::mutex mtx;
    std::condition_variable cv_load, cv_start, cv_done;
    bool loaded = false;
    bool proceed = false;
    bool exit_flag = false;
};

struct JiugeAWQModel {
    JiugeAWQMeta meta;
    infiniDevice_t device;
    std::vector<int> dev_ids;
    std::vector<DeviceResource> dev_resources;
    std::vector<InferState> states;
    std::vector<std::thread> threads;
    InferRequest req;

    JiugeAWQModel(const JiugeAWQMeta *, const ModelWeights *);
};
\ No newline at end of file
```
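`QuantInt4Weight` groups the three tensors AWQ needs per projection. Matching the shapes registered in `jiuge_awq_weight.cpp` below (`qweight: {in, out*nbit/32}` I32, `scales: {in/group, out}` F16, `qzeros: {in/group, out*nbit/32}` I32), each int32 packs `32/nbit = 8` four-bit values along the output dimension. A hedged host-side sketch of recovering one element, assuming plain low-to-high nibble order (AWQ exporters sometimes reorder nibbles, and the real kernel's packing order is not shown in this diff):

```cpp
#include <cstdint>
#include <cstddef>

// Sketch: recover the fp weight element (i, j) from AWQ-packed buffers.
// Assumes 8 nibbles per int32, packed low-to-high along the output axis,
// and float scales for simplicity (the real scales tensor is F16).
float dequant_element(const int32_t *qweight, const int32_t *qzeros, const float *scales,
                      size_t i, size_t j, size_t out, size_t group_size) {
    size_t packed_out = out / 8; // 8 four-bit values per int32 (nbit = 4)
    uint32_t wpack = uint32_t(qweight[i * packed_out + j / 8]);
    uint32_t zpack = uint32_t(qzeros[(i / group_size) * packed_out + j / 8]);
    int q = int((wpack >> (4 * (j % 8))) & 0xF); // unpack the weight nibble
    int z = int((zpack >> (4 * (j % 8))) & 0xF); // unpack the zero-point nibble
    return float(q - z) * scales[(i / group_size) * out + j];
}
```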
src/models/jiuge_awq/jiuge_awq_weight.cpp (new file, 0 → 100644; view file @ 22804eaa)
```cpp
#include "jiuge_awq.hpp"

#include <cmath>

inline std::shared_ptr<Tensor> getSinTable(size_t dctx, size_t dh, float theta) {
    auto half_dh = dh / 2;
    auto unit = dsize(INFINI_DTYPE_F16);
    void *table = std::malloc(dctx * half_dh * unit);
    for (size_t i = 0; i < dctx; i++) {
        for (size_t j = 0; j < half_dh; j++) {
            float _sin = std::sin(static_cast<float>(i)
                                  / std::pow(theta, static_cast<float>(j) / half_dh));
            ((uint16_t *)table)[i * half_dh + j] = f32_to_f16(_sin);
        }
    }
    auto shape = std::vector<size_t>({dctx, half_dh});
    auto tensor = Tensor::weight(table, INFINI_DTYPE_F16, shape);
    std::free(table);
    return tensor;
}

inline std::shared_ptr<Tensor> getCosTable(size_t dctx, size_t dh, float theta) {
    auto half_dh = dh / 2;
    auto unit = dsize(INFINI_DTYPE_F16);
    void *table = std::malloc(dctx * half_dh * unit);
    for (size_t i = 0; i < dctx; i++) {
        for (size_t j = 0; j < half_dh; j++) {
            float _cos = std::cos(static_cast<float>(i)
                                  / std::pow(theta, static_cast<float>(j) / half_dh));
            ((uint16_t *)table)[i * half_dh + j] = f32_to_f16(_cos);
        }
    }
    auto shape = std::vector<size_t>({dctx, half_dh});
    auto tensor = Tensor::weight(table, INFINI_DTYPE_F16, shape);
    std::free(table);
    return tensor;
}
```
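Both helpers tabulate the rotary-embedding angles once per device, for all `dctx` positions. Entry `(i, j)` of each table is:

```latex
% Angle tables built by getSinTable / getCosTable, with half_dh = d_h / 2:
\mathrm{sin\_table}[i][j] = \sin\!\left(\frac{i}{\theta^{\,j/\mathrm{half\_dh}}}\right),
\qquad
\mathrm{cos\_table}[i][j] = \cos\!\left(\frac{i}{\theta^{\,j/\mathrm{half\_dh}}}\right)
```

They are stored as F16 with one row per position, which is presumably what lets `rope_v2` pick rows via `pos_ids_buf` at run time.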
```cpp
JiugeAWQWeights::JiugeAWQWeights(const JiugeAWQMeta *meta, infiniDevice_t device,
                                 const std::vector<int> &dev_ids)
    : infinicore::WeightsLoader(device, dev_ids) {
    auto ndev = dev_ids.size();
    _device_weights.resize(ndev);
    infiniDtype_t dt_logits = meta->dt_logits;
    infiniDtype_t dt_norm_w = meta->dt_norm_w;
    size_t nlayer = meta->nlayer;
    size_t d = meta->d;
    size_t nh = meta->nh / ndev;
    size_t nkvh = meta->nkvh / ndev;
    size_t dh = meta->dh;
    size_t di = meta->di / ndev;
    size_t dctx = meta->dctx;
    size_t dvoc = meta->dvoc;
    size_t nbit = meta->nbit;
    size_t quant_group_size = meta->quant_group_size;

    for (size_t i = 0; i < ndev; i++) {
        RUN_INFINI(infinirtSetDevice(device, dev_ids[i]));
        auto weight = std::make_shared<JiugeAWQDeviceWeight>();
        _device_weights[i] = weight;

        auto w_in_embd = Tensor::weight(nullptr, dt_logits, {dvoc, d});
        this->resigter("model.embed_tokens.weight", w_in_embd, i);
        weight->w_in_embd = w_in_embd;

        auto w_out_norm = Tensor::weight(nullptr, dt_norm_w, {d});
        this->resigter("model.norm.weight", w_out_norm, i);
        weight->w_out_norm = w_out_norm;

        auto w_out_embd = Tensor::weight(nullptr, dt_logits, {dvoc, d})->permute({1, 0});
        this->resigter("lm_head.weight", w_out_embd, i);
        weight->w_out_embd = w_out_embd;

        weight->sin_table = getSinTable(dctx, dh, meta->theta);
        weight->cos_table = getCosTable(dctx, dh, meta->theta);

        for (size_t layer = 0; layer < nlayer; layer++) {
#define RIGISTER_LAYER_WEIGHT(W_NAME, W_VAR, W_SHAPE, W_DTYPE) \
    auto W_VAR = Tensor::weight(nullptr, W_DTYPE, W_SHAPE);    \
    this->resigter(W_NAME, W_VAR, i);                          \
    weight->W_VAR.push_back(W_VAR);

            RIGISTER_LAYER_WEIGHT("model.layers." + std::to_string(layer) + ".input_layernorm.weight",
                                  w_attn_norm, {d}, dt_norm_w);

#define REGISTER_LAYER_QUANT_WEIGHT(W_NAME, W_VAR, W_IN, W_OUT)                                           \
    auto W_VAR = std::make_shared<QuantInt4Weight>();                                                     \
    W_VAR->w = Tensor::weight(nullptr, INFINI_DTYPE_I32, {W_IN, (W_OUT)*nbit / 32});                      \
    this->resigter(W_NAME + ".qweight", W_VAR->w, i);                                                     \
    W_VAR->s = Tensor::weight(nullptr, INFINI_DTYPE_F16, {(W_IN) / quant_group_size, (W_OUT)});           \
    this->resigter(W_NAME + ".scales", W_VAR->s, i);                                                      \
    W_VAR->z = Tensor::weight(nullptr, INFINI_DTYPE_I32, {(W_IN) / quant_group_size, (W_OUT)*nbit / 32}); \
    this->resigter(W_NAME + ".qzeros", W_VAR->z, i);                                                      \
    weight->W_VAR.push_back(W_VAR);

            REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.q_proj",
                                        w_attn_q, d, nh * dh);
            REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.k_proj",
                                        w_attn_k, d, nkvh * dh);
            REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.v_proj",
                                        w_attn_v, d, nkvh * dh);
            RIGISTER_LAYER_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.q_proj.bias",
                                  b_attn_q, {nh * dh}, INFINI_DTYPE_F16);
            RIGISTER_LAYER_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.k_proj.bias",
                                  b_attn_k, {nkvh * dh}, INFINI_DTYPE_F16);
            RIGISTER_LAYER_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.v_proj.bias",
                                  b_attn_v, {nkvh * dh}, INFINI_DTYPE_F16);
            REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.o_proj",
                                        w_attn_out, nh * dh, d);
            RIGISTER_LAYER_WEIGHT("model.layers." + std::to_string(layer) + ".post_attention_layernorm.weight",
                                  w_ffn_norm, {d}, dt_norm_w);
            REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".mlp.gate_proj",
                                        w_ffn_gate, d, di);
            REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".mlp.up_proj",
                                        w_ffn_up, d, di);
            REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".mlp.down_proj",
                                        w_ffn_down, di, d);
        }
    }
#undef RIGISTER_LAYER_WEIGHT
#undef REGISTER_LAYER_QUANT_WEIGHT
}

__C struct ModelWeights *
createJiugeAWQWeights(const JiugeAWQMeta *meta,
                      infiniDevice_t device, int ndev, const int *dev_ids) {
    JiugeAWQWeights *weights = new JiugeAWQWeights(meta, device,
                                                   std::vector<int>(dev_ids, dev_ids + ndev));
    return (struct ModelWeights *)weights;
}
```
src/tensor.hpp (view file @ 22804eaa)
```diff
@@ -2,7 +2,6 @@
 #define INFER_TENSOR_H
 #include "allocator.hpp"
-#include "infinicore_infer.h"
 #include "utils.hpp"
 
 #include <memory>
 #include <string>
```

```diff
@@ -101,6 +100,7 @@ public:
     static std::shared_ptr<Tensor> weight(void *host_data, infiniDtype_t dtype,
                                           const std::vector<size_t> &shape);
+    void load(const void *host_data, infinirtStream_t stream = nullptr);
     std::shared_ptr<Tensor> memShare(const std::vector<size_t> &shape,
                                      infiniDtype_t dtype = INFINI_DTYPE_INVALID) const;
     std::shared_ptr<Tensor> slice(size_t dim, size_t start, size_t len);
```

```diff
@@ -126,6 +126,7 @@ public:
     ptrdiff_t dataOffset() const;
     infiniDevice_t deviceType() const;
     int deviceId() const;
+    size_t numel() const;
     void debug(const std::string &filename) const;
     void debug() const;
```
src/tensor/tensor.cpp (view file @ 22804eaa)
```diff
@@ -113,6 +113,10 @@ infiniDevice_t Tensor::deviceType() const { return this->_storage->deviceType();
 int Tensor::deviceId() const { return this->_storage->deviceId(); }
 
 Tensor::~Tensor() {}
 
+size_t Tensor::numel() const {
+    return std::accumulate(this->shape().begin(), this->shape().end(),
+                           size_t(1), std::multiplies<size_t>());
+}
+
 ptrdiff_t Tensor::dataOffset() const { return _offset; }
```

```diff
@@ -154,16 +158,26 @@ std::shared_ptr<Tensor> Tensor::weight(void *data, infiniDtype_t dtype,
     tensor->_storage = Storage::create(size);
     tensor->_desc = TensorDesc::create(dtype, shape, strides);
+    if (data != nullptr) {
+        tensor->load(data);
+    }
+    tensor->_offset = 0;
+    return tensor;
+}
+
+void Tensor::load(const void *data, infinirtStream_t stream) {
+    if (stream) {
+        RUN_INFINI(infinirtMemcpyAsync(this->_storage->memory(), data,
+                                       this->_storage->size(), INFINIRT_MEMCPY_H2D, stream));
+        return;
+    }
     // NOTE: workaround for some platforms (MetaX) where multiple threads
     // running memcpy concurrently on the same host data can hang
     static std::mutex mutex;
     {
         std::lock_guard<std::mutex> lock(mutex);
-        RUN_INFINI(infinirtMemcpy(tensor->_storage->memory(), data, size,
-                                  INFINIRT_MEMCPY_H2D));
+        RUN_INFINI(infinirtMemcpy(this->_storage->memory(), data,
+                                  this->_storage->size(), INFINIRT_MEMCPY_H2D));
     }
-    tensor->_offset = 0;
-    return tensor;
 }
 
 std::shared_ptr<Tensor> Tensor::memShare(const std::vector<size_t> &shape,
                                          infiniDtype_t dtype_) const {
```
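The refactor splits host-to-device upload out of `Tensor::weight` into a reusable `Tensor::load`, with an asynchronous path when a stream is supplied and a mutex-serialized synchronous path otherwise; `Tensor::weight(nullptr, ...)` now just allocates, which is what lets the AWQ weights loader register empty tensors and fill them later. A hypothetical caller (`rows`, `cols`, `host_ptr`, and `stream` are made-up names for illustration):

```cpp
// Hypothetical usage of the new API: allocate an empty weight, then upload.
auto t = Tensor::weight(nullptr, INFINI_DTYPE_F16, {rows, cols});
t->load(host_ptr);          // synchronous, serialized by the internal mutex
t->load(host_ptr, stream);  // asynchronous H2D copy enqueued on `stream`
```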
xmake.lua (view file @ 22804eaa)
```diff
@@ -16,6 +16,8 @@ target("infinicore_infer")
     add_files("src/models/*/*.cpp")
     add_files("src/tensor/*.cpp")
     add_files("src/allocator/*.cpp")
+    add_files("src/dataloader/*.cpp")
+    add_files("src/cache_manager/*.cpp")
     add_includedirs("include")
     set_installdir(INFINI_ROOT)
```