Commit 098fab15 authored by yaoht

Adapt to DCU

parent cfe4b1a8
Pipeline #3511 failed
#include "paged_compiler.hpp"
#include <spdlog/spdlog.h>
namespace {
// Todo: replace with Tensor::zeros when it is available
inline void set_zeros(infinicore::Tensor &tensor) {
......@@ -31,6 +33,11 @@ PagedCompiler::PagedCompiler(const std::shared_ptr<InfinilmModel> &model, RankBa
void PagedCompiler::compile() {
if (model_->get_cache_config() != nullptr && dynamic_cast<const cache::PagedKVCacheConfig *>(model_->get_cache_config())) {
size_t nblocks = dynamic_cast<const cache::PagedKVCacheConfig *>(model_->get_cache_config())->num_blocks();
// Read the KV block size from the paged-cache config (e.g. 16 or 32).
int block_size = dynamic_cast<const cache::PagedKVCacheConfig *>(model_->get_cache_config())->block_size();
size_t max_batch_size = *std::max_element(decode_batch_sizes_.begin(), decode_batch_sizes_.end());
compiled_map_decode_.clear();
block_tables_holder_ = infinicore::Tensor::empty(
......@@ -59,6 +66,15 @@ void PagedCompiler::compile() {
input.slot_mapping = infinicore::Tensor::empty({b}, infinicore::DataType::I64, infinicore::context::getDevice());
set_zeros(input.slot_mapping.value());
// Derive the physical upper bounds directly from the dummy tensors' shapes.
// 1. For varlen prefill, the max query length is just the total token count of
//    input_ids (here 1*b, i.e. b). If the prefill loop allocates input_ids as
//    {seq_len, b}, this line still adapts automatically.
input.max_seqlen_q = input.input_ids.value()->size(0); // assumes dim 0 is seq_len, dim 1 is batch
// 2. Hard safe bound for max_seqlen_k = blocks allocated per request * tokens per block.
input.max_seqlen_k = block_per_req * block_size;
barrier_->wait();
infinicore::context::startGraphRecording();
auto output = model_->forward(input);
......
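As a side note (not part of this diff), a tiny Python sketch of the bound arithmetic the compiler relies on when it builds the dummy prefill input for graph capture; all numbers and names below are illustrative assumptions:

    # assumed: b captured requests, block_per_req KV blocks reserved per request
    b, block_per_req, block_size = 4, 17, 16
    max_seqlen_q = 1 * b                        # the dummy input_ids hold one token per request here
    max_seqlen_k = block_per_req * block_size   # 17 * 16 = 272 tokens is the most K the kernel may address
    print(max_seqlen_q, max_seqlen_k)           # -> 4 272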
#include "static_batching_compiler.hpp"
#include "../../cache/cache.hpp"
#include <spdlog/spdlog.h>
namespace infinilm::engine {
StaticBatchingCompiler::StaticBatchingCompiler(const std::shared_ptr<InfinilmModel> &model, RankBarrier *barrier)
......
#include "infer_engine.hpp"
#include "../cache/cache.hpp"
#include "spdlog/spdlog.h"
#include <algorithm>
namespace infinilm::engine {
//------------------------------------------------------
......@@ -109,14 +112,14 @@ std::vector<std::unordered_map<std::string, infinicore::nn::Parameter>> InferEng
// forward
//------------------------------------------------------
infinilm::InfinilmModel::Input
InferEngine::Input::to_model_input(infinicore::Device device) const {
InferEngine::Input::to_model_input(infinicore::Device device, const cache::CacheConfig *cache_config) const {
auto to_device = [&](const std::optional<infinicore::Tensor> &t)
-> std::optional<infinicore::Tensor> {
return t.has_value() ? t.value()->to(device) : t;
};
return {
InfinilmModel::Input out{
to_device(input_ids), // @todo: on device in the future
to_device(position_ids),
to_device(past_sequence_lengths), // @todo: on device in the future
......@@ -126,6 +129,27 @@ InferEngine::Input::to_model_input(infinicore::Device device) const {
to_device(block_tables),
to_device(slot_mapping),
};
// Paged prefill FA: bounds without reading GPU tensor payloads (shape / cache config only).
if (cache_config != nullptr) {
if (const auto *paged = dynamic_cast<const cache::PagedKVCacheConfig *>(cache_config)) {
if (out.block_tables.has_value()) {
out.max_seqlen_k = static_cast<int>(out.block_tables.value()->size(1) * paged->block_size());
}
if (out.slot_mapping.has_value()) {
out.max_seqlen_q = static_cast<int>(out.slot_mapping.value()->size(0));
} else if (out.input_ids.has_value()) {
const auto &sh = out.input_ids.value()->shape();
if (sh.size() == 1) {
out.max_seqlen_q = static_cast<int>(sh[0]);
} else if (sh.size() >= 2) {
out.max_seqlen_q = static_cast<int>(std::max(sh[0], sh[1]));
}
}
}
}
return out;
}
InferEngine::Output InferEngine::forward(const InferEngine::Input &input) {
......
......@@ -47,7 +47,11 @@ RankWorker::RankWorker(const InfinilmModel::Config &model_config,
// Wait until the worker thread finishes initialization (model created)
std::unique_lock<std::mutex> lk(mutex_);
cv_.wait(lk, [&] { return init_done_; });
cv_.wait(lk, [&] { return init_done_ || should_exit_; });
if (should_exit_) {
thread_.join();
throw std::runtime_error("RankWorker initialization failed (see error above)");
}
}
RankWorker::RankWorker(
......@@ -75,7 +79,11 @@ RankWorker::RankWorker(
thread_ = std::thread(&RankWorker::thread_loop, this);
// Wait until the worker thread finishes initialization (model created)
std::unique_lock<std::mutex> lk(mutex_);
cv_.wait(lk, [&] { return init_done_; });
cv_.wait(lk, [&] { return init_done_ || should_exit_; });
if (should_exit_) {
thread_.join();
throw std::runtime_error("RankWorker initialization failed (see error above)");
}
}
std::string RankWorker::info() const {
......@@ -327,7 +335,8 @@ void RankWorker::thread_loop() {
infinicore::Tensor logits;
// Try to get compiled graph
if (compiler_ != nullptr) {
auto [graph, output] = compiler_->get_compiled(local_args.to_model_input(infinicore::Device::cpu()));
auto [graph, output] = compiler_->get_compiled(
local_args.to_model_input(infinicore::Device::cpu(), model_->get_cache_config()));
if (graph != nullptr && output != nullptr) {
graph->run();
logits = output->logits;
......@@ -335,7 +344,7 @@ void RankWorker::thread_loop() {
}
// Fall back to eager mode
if (!logits) {
auto model_args = local_args.to_model_input(rank_info_.device);
auto model_args = local_args.to_model_input(rank_info_.device, model_->get_cache_config());
logits = model_->forward(model_args).logits;
}
......@@ -436,10 +445,12 @@ void RankWorker::thread_loop() {
compiler_.reset();
} catch (const std::exception &e) {
// Top-level exception: ensure any waiters are woken and the thread exits cleanly.
// Must set init_done_=true so constructor's cv_.wait(init_done_) can unblock.
{
std::lock_guard<std::mutex> lk(mutex_);
should_exit_ = true;
job_done_ = true;
init_done_ = true;
}
cv_.notify_all();
spdlog::error("[{}] fatal exception in thread_loop: {} \n", info(), e.what());
......
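Side note (illustration only, not the C++ code above): the startup handshake the worker now uses can be sketched in Python as follows; all names here are assumptions for the sketch.

    import threading

    class Worker:
        # Constructor blocks until the worker thread either finishes init or records a failure.
        def __init__(self, fail=False):
            self._cv = threading.Condition()
            self._init_done = False
            self._should_exit = False
            self._fail = fail
            self._thread = threading.Thread(target=self._loop)
            self._thread.start()
            with self._cv:
                # Waiting only on init_done would hang forever if initialization throws.
                self._cv.wait_for(lambda: self._init_done or self._should_exit)
            if self._should_exit:
                self._thread.join()
                raise RuntimeError("worker initialization failed")

        def _loop(self):
            try:
                if self._fail:
                    raise RuntimeError("model creation failed")  # stand-in for model setup
                with self._cv:
                    self._init_done = True
                    self._cv.notify_all()
            except Exception:
                with self._cv:
                    self._should_exit = True
                    self._init_done = True  # wake any waiter even on failure
                    self._cv.notify_all()

    Worker(fail=False)   # returns normally; Worker(fail=True) raises from the constructor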
......@@ -53,7 +53,9 @@ public:
float top_p{1};
infinilm::InfinilmModel::Input to_model_input(infinicore::Device device) const;
/// Fills max_seqlen_q/k for paged FA prefill using tensor shapes + cache config only (no tensor D2H).
infinilm::InfinilmModel::Input to_model_input(infinicore::Device device,
const cache::CacheConfig *cache_config = nullptr) const;
};
struct Output {
......
......@@ -33,6 +33,9 @@ public:
std::optional<infinicore::Tensor> block_tables;
/// Slot ids for each token `[seq]`. Used for paged cache.
std::optional<infinicore::Tensor> slot_mapping;
// [Added] Set by the upper-level engine when it builds the model input.
int max_seqlen_q = 0;
int max_seqlen_k = 0;
};
struct Output {
......
......@@ -181,6 +181,7 @@ infinicore::Tensor LlamaAttention::forward_(const infinicore::Tensor &hidden_sta
// 4. Apply RoPE to Q and K
auto q_rope = infinicore::Tensor::empty({batch_size, num_attention_heads_, seq_len, head_dim_}, q_reshaped->dtype(), q_reshaped->device())->permute({0, 2, 1, 3});
rotary_emb_->forward(q_rope, q_reshaped, pos_ids_for_rope); // [bs, seq_len, n_q_head, head_dim]
rotary_emb_->forward(k_reshaped, pos_ids_for_rope, true); // [bs, seq_len, n_kv_head, head_dim]
......@@ -248,7 +249,7 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd
std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> cu_seqlens,
std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mapping) const {
std::optional<infinicore::Tensor> slot_mapping, int max_seqlen_q, int max_seqlen_k) const {
ASSERT(block_tables.has_value());
ASSERT(slot_mapping.has_value());
......@@ -305,9 +306,25 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd
// 6. Compute attention
infinicore::Tensor attn_output = infinicore::Tensor::empty({seq_len, num_attention_heads_, head_dim_}, q_reshaped->dtype(), q_reshaped->device());
if (is_prefill) {
if (attention_backend_ == backends::AttentionBackend::FlashAttn) {
// Compute actual max sequence lengths from the cumulative seqlen tensors.
// Passing max_position_embeddings_ here causes flash-attn's splitkv kernel to
// compute far too many K-block iterations, reading block_table entries that do
// not exist and then using the garbage values as KV-cache block indices,
// resulting in an out-of-bounds GPU memory access (VMFault).
// Alternative (kept for reference): read total_sequence_lengths back to the CPU and compute the bounds there.
// auto total_lens_cpu = total_sequence_lengths.value()->to(infinicore::Device::cpu());
// const auto *total_lens_ptr = reinterpret_cast<const int32_t *>(total_lens_cpu->data());
// int n_reqs = static_cast<int>(total_sequence_lengths.value()->shape()[0]);
// int max_seqlen_k = 0;
// for (int i = 0; i < n_reqs; ++i) {
// max_seqlen_k = std::max(max_seqlen_k, total_lens_ptr[i]);
// }
// // max_seqlen_q: with batch_size==1 the flattened seq_len equals the per-request length.
// int max_seqlen_q = static_cast<int>(seq_len);
infinicore::op::mha_varlen_(
attn_output,
q_reshaped,
......@@ -316,8 +333,8 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd
input_offsets.value(),
cu_seqlens.value(),
block_tables.value(),
max_position_embeddings_,
max_position_embeddings_,
static_cast<int>(seq_len),
max_seqlen_k,
std::nullopt,
scaling_);
} else {
......@@ -377,16 +394,15 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> cu_seqlens,
std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mapping) const {
std::optional<infinicore::Tensor> slot_mapping, int max_seqlen_q, int max_seqlen_k) const {
if (!rotary_emb_) {
throw std::runtime_error("LlamaAttention: rotary_emb not configured");
}
infinicore::Tensor output;
if (auto paged_kv_cache = std::dynamic_pointer_cast<cache::PagedKVCache>(kv_cache)) {
output = forward_paged_(hidden_states, position_ids, paged_kv_cache, total_sequence_lengths, input_offsets, cu_seqlens, block_tables, slot_mapping);
output = forward_paged_(hidden_states, position_ids, paged_kv_cache, total_sequence_lengths, input_offsets, cu_seqlens, block_tables, slot_mapping, max_seqlen_q, max_seqlen_k);
} else {
output = forward_(hidden_states, position_ids, kv_cache, past_sequence_lengths, total_sequence_lengths);
}
return output;
......
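To make the overrun concrete, a rough back-of-the-envelope sketch (all numbers assumed for illustration): the varlen/splitkv path sizes its K iteration from max_seqlen_k, so it walks roughly ceil(max_seqlen_k / block_size) block_table columns per request.

    from math import ceil

    block_size = 64                    # tokens per KV block (the Hygon runs below use 64)
    actual_max_k = 4096 + 256          # longest cached prefix any request can actually reach
    model_max_pos = 131072             # an assumed long-context max_position_embeddings_

    cols_allocated = ceil(actual_max_k / block_size)     # 68 block_table columns really exist
    cols_old_bound = ceil(model_max_pos / block_size)    # 2048 columns the kernel would try to read
    cols_new_bound = ceil(actual_max_k / block_size)     # 68 with the tightened max_seqlen_k
    print(cols_allocated, cols_old_bound, cols_new_bound)  # -> 68 2048 68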
......@@ -78,7 +78,7 @@ public:
std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> cu_seqlens,
std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mapping) const;
std::optional<infinicore::Tensor> slot_mapping, int max_seqlen_q, int max_seqlen_k) const;
/**
* @brief Get the layer index
......@@ -110,7 +110,7 @@ private:
std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> cu_seqlens,
std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mapping) const;
std::optional<infinicore::Tensor> slot_mapping, int max_seqlen_q, int max_seqlen_k) const;
protected:
// Projection layers
......
......@@ -61,13 +61,13 @@ LlamaDecoderLayer::forward(infinicore::Tensor &hidden_states,
std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> cu_seqlens,
std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mapping) const {
std::optional<infinicore::Tensor> slot_mapping, int max_seqlen_q, int max_seqlen_k) const {
// 1. Attention layer normalization
input_layernorm_->forward_inplace(hidden_states, residual);
// 2. Self-attention
hidden_states = self_attn_->forward(
hidden_states, position_ids, kv_cache, past_sequence_lengths, total_sequence_lengths, input_offsets, cu_seqlens, block_tables, slot_mapping);
hidden_states, position_ids, kv_cache, past_sequence_lengths, total_sequence_lengths, input_offsets, cu_seqlens, block_tables, slot_mapping, max_seqlen_q, max_seqlen_k);
// 3. Post-attention layer normalization
post_attention_layernorm_->forward_inplace(hidden_states, residual);
......
......@@ -77,7 +77,7 @@ public:
std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> cu_seqlens,
std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mappin) const;
std::optional<infinicore::Tensor> slot_mappin, int max_seqlen_q, int max_seqlen_k) const;
/**
* @brief Get the layer index
......
......@@ -61,10 +61,13 @@ LlamaForCausalLM::Output LlamaForCausalLM::forward(const Input &input) const {
auto cu_seqlens = input.cu_seqlens;
auto block_tables = input.block_tables;
auto slot_mapping = input.slot_mapping;
// Pass the precomputed varlen bounds through from the engine input.
int max_seqlen_q = input.max_seqlen_q;
int max_seqlen_k = input.max_seqlen_k;
// 1. Forward through base model to get hidden states
auto hidden_states = model_->forward(
input_ids, position_ids, past_sequence_lengths, total_sequence_length, input_offsets, cu_seqlens, block_tables, slot_mapping);
input_ids, position_ids, past_sequence_lengths, total_sequence_length, input_offsets, cu_seqlens, block_tables, slot_mapping, max_seqlen_q, max_seqlen_k);
// 2. Apply language modeling head to get logits
auto logits = lm_head_->forward(hidden_states);
......
......@@ -96,7 +96,8 @@ infinicore::Tensor LlamaModel::forward(const infinicore::Tensor &input_ids,
std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> cu_seqlens,
std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mapping) const {
std::optional<infinicore::Tensor> slot_mapping,
int max_seqlen_q, int max_seqlen_k) const {
// 1. Embed tokens: input_ids -> [batch, seq_len, hidden_size]
auto hidden_states = embed_tokens_->forward(input_ids);
......@@ -114,7 +115,7 @@ infinicore::Tensor LlamaModel::forward(const infinicore::Tensor &input_ids,
input_offsets,
cu_seqlens,
block_tables,
slot_mapping);
slot_mapping, max_seqlen_q, max_seqlen_k);
}
norm_->forward_inplace(hidden_states, residual);
......
......@@ -77,7 +77,7 @@ public:
std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> cu_seqlens,
std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mapping) const;
std::optional<infinicore::Tensor> slot_mapping, int max_seqlen_q, int max_seqlen_k) const;
void reset_cache(const cache::CacheConfig *cache_config);
......
......@@ -9,6 +9,11 @@ namespace py = pybind11;
PYBIND11_MODULE(_infinilm, m) {
m.doc() = "InfiniLM Llama model Python bindings";
// Import _infinicore first so that its pybind11 types
// (DataType, Device::Type, etc.) are registered in the
// global type map before _infinilm tries to use them.
py::module_::import("infinicore");
infinilm::cache::bind_cache(m);
infinilm::models::llama::bind_llama(m);
infinilm::engine::distributed::bind_dist_config(m);
......
......@@ -34,14 +34,14 @@ inline void bind_infer_engine(py::module &m) {
.def(py::init([](
const InfinilmModel::Config &cfg,
const distributed::DistConfig &dist,
infinicore::Device::Type dev,
infinicore::Device::Type device_type,
std::shared_ptr<const infinilm::cache::CacheConfig> cache_cfg,
bool enable_graph_compiling,
const std::string &attention_backend) {
return std::make_shared<InferEngine>(
cfg,
dist,
dev,
device_type,
cache_cfg ? cache_cfg.get() : nullptr,
enable_graph_compiling,
infinilm::backends::parse_attention_backend(attention_backend));
......@@ -80,14 +80,14 @@ inline void bind_infer_engine(py::module &m) {
.def(py::init([](
const std::string &model_path,
const distributed::DistConfig &dist,
infinicore::Device::Type dev,
infinicore::Device::Type device_type,
std::shared_ptr<const infinilm::cache::CacheConfig> cache_cfg,
bool enable_graph_compiling,
const std::string &attention_backend) {
return std::make_shared<InferEngine>(
model_path,
dist,
dev,
device_type,
cache_cfg ? cache_cfg.get() : nullptr,
enable_graph_compiling,
infinilm::backends::parse_attention_backend(attention_backend));
......
......@@ -46,7 +46,6 @@ inline void bind_llama(py::module &m) {
py::class_<LlamaConfig, InfinilmModel::Config> llama_config(m, "LlamaConfig");
llama_config
.def(py::init<>())
// TODO: Change this to `dtype` after updating InfiniCore pybind11 exposing mechanism.
.def_readwrite("_dtype", &LlamaConfig::dtype)
.def_readwrite("vocab_size", &LlamaConfig::vocab_size)
.def_readwrite("hidden_size", &LlamaConfig::hidden_size)
......
......@@ -360,9 +360,16 @@ class TestModel:
# Autoregressive generation
# ---------------------------------------------------------------------------- #
input_ids_infini = infinicore.from_list(input_ids_list)
import os
import ctypes
# Pull roctracer start/stop out of the HIP runtime library so tracing can be limited to generate().
rocm_path = os.environ.get('ROCM_PATH')
lib_path = os.path.join(rocm_path, 'hip/lib/libgalaxyhip.so')
lib = ctypes.CDLL(lib_path)
roctracer_stop = lib.roctracer_stop
roctracer_start = lib.roctracer_start
t1 = time.time()
print("=================== start generate ====================")
roctracer_start()
output_ids = self.model.generate(
input_ids_infini,
GenerationConfig(
......@@ -375,6 +382,7 @@ class TestModel:
),
_measure_and_log_time=True,
)
roctracer_stop()
t2 = time.time()
numpy_output_ids = np.array(
......
import infinicore
from transformers import AutoTokenizer
from infinilm.modeling_utils import load_model_state_dict_by_file
from infinilm.distributed import DistConfig
from infinilm.infer_engine import GenerationConfig, InferEngine
from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig
import argparse
import sys
import time
import os
import json
from collections import OrderedDict
import numpy as np
from tqdm import tqdm
import torch
from torch.profiler import ProfilerActivity
from torch.profiler import profile as torch_prof
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python"))
DATA_TYPE_BYTES = {
"bfloat16": 2,
"float16": 2,
"float32": 4,
}
_PAGED_KV_BLOCK_SIZE = 256
# BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128]
# INPUT_LENS = [32, 256, 1024, 4096]
# OUTPUT_LENS = [256, 1024, 4096]
def read_json_file(file_path):
"""Load and return JSON content from file_path."""
with open(file_path, "r") as file:
return json.load(file)
def parse_list(value: str):
"""Parse parse_list argument: can be a single int or a list of ints.
Examples:
"1" -> 1
"[1,2,4]" -> [1, 2, 4]
"1,2,4" -> [1, 2, 4]
"""
value = value.strip()
# Try to parse as JSON list first
if value.startswith("[") and value.endswith("]"):
try:
result = json.loads(value)
if isinstance(result, list):
return [int(x) for x in result]
return int(result)
except (json.JSONDecodeError, ValueError):
pass
# Try to parse as comma-separated values
if "," in value:
try:
return [int(x.strip()) for x in value.split(",")]
except ValueError:
pass
# Try to parse as a single integer
try:
return int(value)
except ValueError:
raise argparse.ArgumentTypeError(
f"batch-size must be an int or list[int], got: {value}"
)
def get_test_cases(
model_path: str,
batch_size_list: list[int],
input_len_list: list[int],
output_len_list: list[int],
):
"""Generate cases ordered by ascending KV cache memory usage."""
model_path = os.path.expanduser(model_path)
# Load model config to derive attention dimensions
config = read_json_file(os.path.join(model_path, "config.json"))
head_dim = config.get(
"head_dim", config.get("hidden_size") // config.get("num_attention_heads")
)
# KV heads and layers drive cache size
num_key_value_heads = config.get("num_key_value_heads")
num_hidden_layers = config.get("num_hidden_layers")
# Enumerate all batch/input/output combinations and compute KV cache size
case_list = []
for batch_size in batch_size_list:
for input_len in input_len_list:
for output_len in output_len_list:
for data_type in ["bfloat16"]:
data_type_bytes = DATA_TYPE_BYTES[data_type]
total_seq_len = input_len + output_len
kvcache_memory_bytes = (
data_type_bytes
* (batch_size * total_seq_len * num_key_value_heads * head_dim)
* num_hidden_layers
)
kvcache_memory_gb = kvcache_memory_bytes / (1024 * 1024 * 1024)
case_list.append(
{
"idx": len(case_list),
"batch_size": batch_size,
"input_len": input_len,
"output_len": output_len,
"data_type": data_type,
"kvcache_memory": round(kvcache_memory_gb, 3),
}
)
# Sort by KV cache size and wrap in OrderedDict with index keys
case_dict = OrderedDict(
(idx, case)
for idx, case in enumerate(
sorted(case_list, key=lambda case: case["kvcache_memory"])
)
)
return case_dict
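For intuition, one worked instance of the sizing formula above; the config numbers are assumed (8 KV heads, head_dim 128, 32 layers), and the script only uses this value to order the cases.

    data_type_bytes = 2                                   # bfloat16
    batch_size, input_len, output_len = 1, 4096, 256
    num_key_value_heads, head_dim, num_hidden_layers = 8, 128, 32
    total_seq_len = input_len + output_len                # 4352
    kvcache_memory_bytes = (
        data_type_bytes
        * (batch_size * total_seq_len * num_key_value_heads * head_dim)
        * num_hidden_layers
    )
    print(round(kvcache_memory_bytes / 1024**3, 3))       # -> 0.266 (GiB)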
def get_args():
parser = argparse.ArgumentParser(description="run Llama args")
parser.add_argument(
"--cpu",
action="store_true",
help="Run cpu test",
)
parser.add_argument(
"--nvidia",
action="store_true",
help="Run nvidia test",
)
parser.add_argument(
"--qy",
action="store_true",
help="Run qy test",
)
parser.add_argument(
"--metax",
action="store_true",
help="Run metax test",
)
parser.add_argument(
"--moore",
action="store_true",
help="Run moore test",
)
parser.add_argument(
"--iluvatar",
action="store_true",
help="Run iluvatar test",
)
parser.add_argument(
"--cambricon",
action="store_true",
help="Run cambricon test",
)
parser.add_argument(
"--ali",
action="store_true",
help="Run alippu test",
)
parser.add_argument(
"--hygon",
action="store_true",
help="Run hygon test",
)
parser.add_argument(
"--model",
type=str,
required=True,
help="model path",
)
parser.add_argument(
"--batch-size",
type=parse_list,
default=1,
help="number of prompts in a batch (can be an int or a list of ints, e.g., '1' or '[1,2,4]' or '1,2,4')",
)
parser.add_argument(
"--tensor-parallel-size",
"--tp",
type=int,
default=1,
help="total rank for tensor parallel",
)
parser.add_argument(
"--input-len",
type=parse_list,
default=10,
help="output tokens",
)
parser.add_argument(
"--output-len",
type=parse_list,
default=20,
help="output tokens",
)
parser.add_argument(
"--skip-load",
action="store_true",
help="skip loading model weights",
)
parser.add_argument(
"--top-k",
type=int,
default=1,
help="top k sampling",
)
parser.add_argument(
"--top-p",
type=float,
default=1.0,
help="top p sampling",
)
parser.add_argument(
"--temperature",
type=float,
default=1.0,
help="sampling temperature",
)
parser.add_argument(
"--enable-paged-attn",
action="store_true",
help="use paged cache",
)
parser.add_argument(
"--paged_kv_block_size",
type=int,
default=256,
help="num tokens each kv block can hold",
)
parser.add_argument(
"--enable-graph",
action="store_true",
help="enable graph compiling",
)
parser.add_argument(
"--warmup",
action="store_true",
help="Perform a warmup run before benchmarking/inference.",
)
parser.add_argument(
"--attn",
type=str,
default="default",
choices=["default", "flash-attn"],
help="attention backend to use: 'default' or 'flash-attn'",
)
return parser.parse_args()
with open("examples/bench_prompt.md", "r") as f:
prompt = f.read()
def repeat_prompt(input_ids: list[int], target_length: int):
num = len(input_ids)
repeat_times = (target_length + num - 1) // num
return (input_ids * repeat_times)[:target_length]
class TestModel:
model: infinicore.nn.Module
tokenizer: AutoTokenizer
input_ids_list: list[int]
def __init__(
self,
model_path,
infini_device=infinicore.device("cpu", 0),
tp=1,
skip_load=False,
cache_config=None,
enable_graph=False,
attn_backend="default",
) -> None:
model_path = os.path.expanduser(model_path)
# ---------------------------------------------------------------------------- #
# Create the model
# ---------------------------------------------------------------------------- #
model = InferEngine(
model_path,
device=infini_device,
distributed_config=DistConfig(tp),
cache_config=cache_config,
enable_graph_compiling=enable_graph,
attention_backend=attn_backend,
)
# ---------------------------------------------------------------------------- #
# Load weights
# ---------------------------------------------------------------------------- #
if not skip_load:
load_model_state_dict_by_file(model, model_path, dtype=model.config.dtype)
# ---------------------------------------------------------------------------- #
# Create the tokenizer
# ---------------------------------------------------------------------------- #
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
if tokenizer.pad_token is None:
if tokenizer.eos_token is not None:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
else:
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
# ---------------------------------------------------------------------------- #
# Tokenize the input
# ---------------------------------------------------------------------------- #
input_content = [
tokenizer.apply_chat_template(
conversation=[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
]
# print(input_content, end="", flush=True)
# Call the tokenizer directly (batch_encode_plus is deprecated in Transformers >= 5.0)
encoding = tokenizer(
input_content,
padding=True,
truncation=True,
max_length=8192,
)
input_ids_list = encoding["input_ids"]
self.model = model
self.tokenizer = tokenizer
self.input_ids_list = input_ids_list
def run(
self,
batch_size: int,
input_len: int,
output_len: int,
top_k=1,
top_p=1.0,
temperature=1.0,
):
input_ids = repeat_prompt(self.input_ids_list[0], target_length=input_len)
input_ids_list = [input_ids] * batch_size
# ---------------------------------------------------------------------------- #
# Autoregressive generation
# ---------------------------------------------------------------------------- #
input_ids_infini = infinicore.from_list(input_ids_list)
t1 = time.time()
print("=================== start generate ====================")
output_ids = self.model.generate(
input_ids_infini,
GenerationConfig(
max_new_tokens=output_len,
eos_token_id=[],
top_k=top_k,
top_p=top_p,
temperature=temperature,
stop_on_eos=False,
),
_measure_and_log_time=True,
)
t2 = time.time()
numpy_output_ids = np.array(
[output_id.to_numpy()[0] for output_id in output_ids]
)
print(self.tokenizer.decode(numpy_output_ids, skip_special_tokens=True))
print(
f"total_time: {round((t2 - t1) * 1000, 2)} ms",
)
if __name__ == "__main__":
args = get_args()
print(args)
# Parse command line arguments
device_str = "cpu"
if args.cpu:
device_str = "cpu"
elif args.nvidia:
device_str = "cuda"
elif args.qy:
device_str = "cuda"
elif args.metax:
device_str = "cuda"
elif args.moore:
device_str = "musa"
elif args.iluvatar:
device_str = "cuda"
elif args.cambricon:
device_str = "mlu"
elif args.ali:
device_str = "cuda"
elif args.hygon:
device_str = "cuda"
else:
print(
"python examples/bench.py --nvidia --model=~/TinyLlama-1.1B-Chat-v1.0/ --batch-size=2 --tp=1 --input-len=50 --output-len=50"
)
sys.exit(1)
_PAGED_KV_BLOCK_SIZE = args.paged_kv_block_size
# -------------------------------------------------------- #
# Parse arguments
# -------------------------------------------------------- #
model_path = args.model
infini_device = infinicore.device(device_str, 0)
tp = args.tensor_parallel_size
skip_load = args.skip_load
batch_size = args.batch_size
input_len = args.input_len
output_len = args.output_len
enable_paged_attn = args.enable_paged_attn
enable_graph = args.enable_graph
if isinstance(batch_size, int):
batch_size = [batch_size]
if isinstance(input_len, int):
input_len = [input_len]
if isinstance(output_len, int):
output_len = [output_len]
cases_dict = get_test_cases(model_path, batch_size, input_len, output_len)
# -------------------------------------------------------- #
# Run the test cases
# -------------------------------------------------------- #
if enable_paged_attn:
paged_kv_block_size = _PAGED_KV_BLOCK_SIZE
max_num_blocks = max(
[
(
(c_["input_len"] + c_["output_len"] + (paged_kv_block_size - 1))
// paged_kv_block_size
)
* c_["batch_size"]
for _, c_ in cases_dict.items()
]
)
cache_config = PagedKVCacheConfig(max_num_blocks, paged_kv_block_size)
else:
cache_config = None
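For one concrete case (numbers assumed for illustration), the paged block budget computed above works out as:

    paged_kv_block_size = 256
    input_len, output_len, batch_size = 4096, 256, 1
    blocks_per_req = (input_len + output_len + paged_kv_block_size - 1) // paged_kv_block_size
    max_num_blocks = blocks_per_req * batch_size
    print(blocks_per_req, max_num_blocks)   # -> 17 17  (4352 tokens fit in 17 blocks of 256)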
test = TestModel(
model_path,
infini_device=infini_device,
tp=tp,
skip_load=skip_load,
cache_config=cache_config,
enable_graph=enable_graph,
attn_backend=args.attn,
)
# ---------------------------------------------------------------------------- #
# Warmup
# ---------------------------------------------------------------------------- #
if args.warmup:
warmup_steps = 1
# warmup cache capacity
warmup_cache_len = 128
warmup_batch = len(test.input_ids_list)
test.model.reset_cache(
StaticKVCacheConfig(
max_batch_size=warmup_batch,
max_cache_len=warmup_cache_len,
)
)
avg_prompt_len = min(64, max(len(ids) for ids in test.input_ids_list))
warmup_ids = [
ids[:avg_prompt_len] if len(ids) >= avg_prompt_len else ids
for ids in test.input_ids_list
]
input_ids_infini = infinicore.from_list(warmup_ids)
print("=================== warmup start ===================")
for _ in range(warmup_steps):
_ = test.model.generate(
input_ids_infini,
GenerationConfig(
max_new_tokens=5, # decode kernel warmup
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p,
stop_on_eos=False,
),
_measure_and_log_time=False,
)
print("=================== warmup done ====================")
# reset cache back to benchmark config
if cache_config is not None:
test.model.reset_cache(cache_config)
# ---------------------------------------------------------------------------- #
# Warmup done
# ---------------------------------------------------------------------------- #
for idx, case in tqdm(cases_dict.items(), desc="Processing cases"):
tqdm.write(f"\033[92mProcessing : {case}\033[0m")
batch_size = case["batch_size"]
input_len = case["input_len"]
output_len = case["output_len"]
if not enable_paged_attn:
# reset cache if static kvcache is used
initial_capacity = input_len + output_len
test.model.reset_cache(
StaticKVCacheConfig(
max_batch_size=batch_size, max_cache_len=initial_capacity
)
)
# run test one case
# test.run(
# batch_size=batch_size,
# input_len=input_len,
# output_len=output_len,
# top_k=args.top_k,
# top_p=args.top_p,
# temperature=args.temperature,
# )
# run test one case
log_file = "/home/qi-yuan-2026/z_prof_logs/8b_fa_1_4096_256_prof.log"
json_file = "/home/qi-yuan-2026/z_prof_logs/8b_fa_1_4096_256_prof.json"
# log_file=f"/root/qi_yuan/z_prof_logs/70b_1_32_256_prof.log"
# json_file=f"/root/qi_yuan/z_prof_logs/70b_1_32_256_prof.json"
with torch_prof(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, profile_memory=False, with_stack=True) as prof:
with torch.no_grad():
test.run(
batch_size=batch_size,
input_len=input_len,
output_len=output_len,
top_k=args.top_k,
top_p=args.top_p,
temperature=args.temperature,
)
with open(log_file, "w") as f:
f.write(prof.key_averages().table(sort_by="self_cuda_time_total"))
print(prof.key_averages().table(sort_by="self_cuda_time_total"))
prof.export_chrome_trace(json_file)
#!/bin/bash
xmake f --use-kv-caching=true -cv
pip install -e . --no-build-isolation
\ No newline at end of file
cd InfiniLM-fa
# Basic inference
python examples/jiuge.py --hygon --model_path=../models/9g_8b_thinking_llama/
# With paged attention + flash-attn (use block_size=64 for Hygon)
python examples/jiuge.py --hygon --enable-paged-attn --paged_kv_block_size=64 --attn=flash-attn --model_path=../models/9g_8b_thinking_llama/
# With graph compilation
python examples/jiuge.py --hygon --enable-paged-attn --paged_kv_block_size=64 --attn=flash-attn --enable-graph --model_path=../models/9g_8b_thinking_llama/
# Multi-GPU (tensor parallel)
python examples/jiuge.py --hygon --tp=4 --model_path=../models/9g_8b_thinking_llama/
# Custom options
python examples/jiuge.py --hygon --model_path=<path> \
--max_new_tokens=100 \
--batch_size=1 \
--prompt="How are you" \
--top_k=1 --top_p=1.0 --temperature=1.0
python examples/bench.py --hygon --warmup --model=../models/9g_8b_thinking_llama/ \
--enable-paged-attn --attn=flash-attn --enable-graph \
--input-len=32,256,4096 --output-len=256,1024,2048,4096 \
--batch-size=1 --skip-load >> perf_hygon_bs_1.txt
python examples/bench.py --hygon --warmup --model=../models/9g_8b_thinking_llama/ \
--enable-paged-attn --attn=flash-attn --enable-graph \
--input-len=32,256 --output-len=256,1024,2048,4096 \
--batch-size=16 --skip-load >> perf_hygon_bs_16.txt
\ No newline at end of file