Commit db19cc0b authored by PanZezhong's avatar PanZezhong
Browse files

issue/168 use n_blocks to init paged kv cache config, support fixed paged caching api

parent 831e8a67
...@@ -111,9 +111,9 @@ StaticKVCache::update(size_t layer_idx, ...@@ -111,9 +111,9 @@ StaticKVCache::update(size_t layer_idx,
// PagedKVCacheConfig // PagedKVCacheConfig
// ========================== // ==========================
PagedKVCacheConfig::PagedKVCacheConfig( PagedKVCacheConfig::PagedKVCacheConfig(
size_t max_kv_memory_bytes, size_t num_blocks,
size_t block_size) size_t block_size)
: max_kv_memory_bytes_(max_kv_memory_bytes), : num_blocks_(num_blocks),
block_size_(block_size) { block_size_(block_size) {
} }
...@@ -123,8 +123,8 @@ PagedKVCacheConfig::unique_copy() const { ...@@ -123,8 +123,8 @@ PagedKVCacheConfig::unique_copy() const {
} }
size_t size_t
PagedKVCacheConfig::max_kv_memory_bytes() const { PagedKVCacheConfig::num_blocks() const {
return max_kv_memory_bytes_; return num_blocks_;
} }
size_t size_t
...@@ -151,16 +151,8 @@ PagedKVCache::PagedKVCache( ...@@ -151,16 +151,8 @@ PagedKVCache::PagedKVCache(
num_rank_v_heads_(num_v_heads / rank_info.tp_size), num_rank_v_heads_(num_v_heads / rank_info.tp_size),
rank_num_layers_(num_layers), rank_num_layers_(num_layers),
dtype_(dtype), dtype_(dtype),
num_blocks_per_layer_(config.num_blocks()),
block_size_(config.block_size()) { block_size_(config.block_size()) {
num_blocks_per_layer_ = config.max_kv_memory_bytes()
/ (k_dim * num_rank_k_heads_ + v_dim * num_rank_v_heads_)
/ block_size_
/ rank_num_layers_
/ infinicore::dsize(dtype_);
if (num_blocks_per_layer_ == 0) {
throw std::runtime_error("Not enough memory for KV cache");
}
// [num_layers, num_blocks, num_rank_k_heads, block_size, k_dim] // [num_layers, num_blocks, num_rank_k_heads, block_size, k_dim]
k_caches_ = infinicore::Tensor::empty( k_caches_ = infinicore::Tensor::empty(
{rank_num_layers_, {rank_num_layers_,
...@@ -190,11 +182,12 @@ std::tuple<infinicore::Tensor, infinicore::Tensor> PagedKVCache::update( ...@@ -190,11 +182,12 @@ std::tuple<infinicore::Tensor, infinicore::Tensor> PagedKVCache::update(
auto &&[k_cache_layer, v_cache_layer] = get_paged_kv(layer_idx); auto &&[k_cache_layer, v_cache_layer] = get_paged_kv(layer_idx);
infinicore::op::paged_caching_(k, infinicore::op::paged_caching_(
v, k_cache_layer,
k_cache_layer, v_cache_layer,
v_cache_layer, k,
slot_mapping); v,
slot_mapping);
return {k_cache_layer, v_cache_layer}; return {k_cache_layer, v_cache_layer};
} }
......
...@@ -85,15 +85,15 @@ private: ...@@ -85,15 +85,15 @@ private:
class PagedKVCacheConfig final : public CacheConfig { class PagedKVCacheConfig final : public CacheConfig {
public: public:
PagedKVCacheConfig( PagedKVCacheConfig(
size_t max_kv_memory_bytes, size_t num_blocks,
size_t block_size = 16); size_t block_size = 16);
std::unique_ptr<CacheConfig> unique_copy() const override; std::unique_ptr<CacheConfig> unique_copy() const override;
size_t max_kv_memory_bytes() const; size_t num_blocks() const;
size_t block_size() const; size_t block_size() const;
private: private:
size_t max_kv_memory_bytes_; size_t num_blocks_;
size_t block_size_; size_t block_size_;
}; };
......
...@@ -36,11 +36,11 @@ inline void bind_cache(py::module &m) { ...@@ -36,11 +36,11 @@ inline void bind_cache(py::module &m) {
std::shared_ptr<infinilm::cache::PagedKVCacheConfig>>(m, "PagedKVCacheConfig") std::shared_ptr<infinilm::cache::PagedKVCacheConfig>>(m, "PagedKVCacheConfig")
.def( .def(
py::init<size_t, size_t>(), py::init<size_t, size_t>(),
py::arg("max_kv_memory_bytes"), py::arg("num_blocks"),
py::arg("block_size") = 16) py::arg("block_size") = 16)
.def( .def(
"max_kv_memory_bytes", "num_blocks",
&infinilm::cache::PagedKVCacheConfig::max_kv_memory_bytes) &infinilm::cache::PagedKVCacheConfig::num_blocks)
.def( .def(
"block_size", "block_size",
&infinilm::cache::PagedKVCacheConfig::block_size) &infinilm::cache::PagedKVCacheConfig::block_size)
......
...@@ -89,13 +89,6 @@ def get_args(): ...@@ -89,13 +89,6 @@ def get_args():
help="use paged cache", help="use paged cache",
) )
parser.add_argument(
"--max-kvcache-size",
type=int,
default=8 * 1024 * 1024 * 1024,
help="max size (in bytes) allocated to paged kv cache",
)
return parser.parse_args() return parser.parse_args()
...@@ -109,7 +102,7 @@ def test( ...@@ -109,7 +102,7 @@ def test(
): ):
model_path = os.path.expanduser(model_path) model_path = os.path.expanduser(model_path)
# ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- #
# 创建模型, # Create Model
# ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- #
model = InferEngine( model = InferEngine(
model_path, model_path,
...@@ -118,12 +111,12 @@ def test( ...@@ -118,12 +111,12 @@ def test(
) )
# ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- #
# 加载权重 # Load Weights
# ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- #
load_model_state_dict_by_file(model, model_path, dtype=model.config.dtype) load_model_state_dict_by_file(model, model_path, dtype=model.config.dtype)
# ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- #
# 创建 tokenizer # create tokenizer
# ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- #
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
...@@ -146,7 +139,7 @@ def test( ...@@ -146,7 +139,7 @@ def test(
) )
# ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- #
# token编码 # tokenize
# ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- #
# prompt = "山东最高的山是?" # prompt = "山东最高的山是?"
if isinstance(prompts, str): if isinstance(prompts, str):
...@@ -165,11 +158,13 @@ def test( ...@@ -165,11 +158,13 @@ def test(
] # List: [[1, 1128, 526, 366, 29892]] ] # List: [[1, 1128, 526, 366, 29892]]
# ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- #
# 创建KVCache # Create KVCache
# ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- #
if enable_paged_attn: if enable_paged_attn:
batch_size = 1 if prompts is str else len(prompts)
max_total_tokens = max_new_tokens + len(input_ids_list[0])
cache_config = PagedKVCacheConfig( cache_config = PagedKVCacheConfig(
max_kv_memory_bytes=args.max_kvcache_size, block_size=16 num_blocks=(max_total_tokens // 16 + 1) * batch_size, block_size=16
) )
else: else:
batch_size = 1 if prompts is str else len(prompts) batch_size = 1 if prompts is str else len(prompts)
...@@ -181,7 +176,7 @@ def test( ...@@ -181,7 +176,7 @@ def test(
model.reset_cache(cache_config) model.reset_cache(cache_config)
# ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- #
# 自回归生成 # Generate
# ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- #
print(input_contents[0], end="", flush=True) print(input_contents[0], end="", flush=True)
input_ids_infini = infinicore.from_list(input_ids_list) input_ids_infini = infinicore.from_list(input_ids_list)
......
...@@ -16,11 +16,11 @@ class StaticKVCacheConfig(CacheConfig, _infinilm.StaticKVCacheConfig): ...@@ -16,11 +16,11 @@ class StaticKVCacheConfig(CacheConfig, _infinilm.StaticKVCacheConfig):
class PagedKVCacheConfig(CacheConfig, _infinilm.PagedKVCacheConfig): class PagedKVCacheConfig(CacheConfig, _infinilm.PagedKVCacheConfig):
def __init__( def __init__(
self, self,
max_kv_memory_bytes: int, num_blocks: int,
block_size: int = 16, block_size: int = 16,
): ):
_infinilm.PagedKVCacheConfig.__init__( _infinilm.PagedKVCacheConfig.__init__(
self, self,
max_kv_memory_bytes, num_blocks,
block_size, block_size,
) )
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment