Unverified Commit a41d2163 authored by wang jiahao's avatar wang jiahao Committed by GitHub
Browse files

Merge pull request #1013 from kvcache-ai/work-concurrent

In version v0.2.4, we’ve added the highly desired multi-concurrency support to the community through a major refactor of the whole architecture.
parents f142f4df 4ed9744e
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 09:52:48
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-25 07:51:09
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
// Concurrency test: three OpenMP threads simultaneously call lookup_to_gpu on
// the same fully matching token sequence, then verify both the returned handle
// data and the raw on-GPU cache blocks against the originally inserted K/V data.
int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  auto kvc2 = kvc2::create_kvc2(config);
  std::mt19937 gen(123);
  // Insert 10 pages of random K/V cache; fixed seed keeps the test reproducible.
  auto ids1 = random_ids(10 * config.num_token_per_page, gen);
  auto k1 = random_kvcache(10, gen);
  auto v1 = random_kvcache(10, gen);
  kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
  // complete same
  #pragma omp parallel for
  for (size_t ti = 0; ti < 3; ti++) {
    auto h = kvc2->lookup_to_gpu(test_model_name, test_quant_type, ids1.data(), ids1.size(),
                                 ids1.size() + 2 * config.num_token_per_page);
    auto k = h->handle_data(true);
    auto v = h->handle_data(false);
    // All 10 inserted pages must match through the handle view.
    cmp_handle_data(k1, k, 10);
    cmp_handle_data(v1, v, 10);
    // Copy the matched blocks back from every GPU into host buffers so the
    // on-GPU contents themselves are checked, not just the handle pointers.
    auto block_idx = h->get_gpu_block_idx();
    auto [kcache, vcache] = kvc2->get_kvcache();
    auto k_from_gpu = empty_kvcache(15);
    auto v_from_gpu = empty_kvcache(15);
    // Each page is striped across all GPUs; element_size_per_gpu is one GPU's share.
    size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
    size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
    for (size_t i = 0; i < k_from_gpu.size(); i++) {
      for (size_t j = 0; j < block_idx.size(); j++) {
        size_t b_idx = block_idx[j];
        for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
          {
            auto kt = kcache[gpu_idx][i][b_idx].to(torch::kCPU);
            void* src = kt.data_ptr();
            void* dst = offset_by_bytes(k_from_gpu[i][j], gpu_idx * element_size_per_gpu);
            memcpy(dst, src, element_size_per_gpu);
          }
          {
            auto vt = vcache[gpu_idx][i][b_idx].to(torch::kCPU);
            void* src = vt.data_ptr();
            void* dst = offset_by_bytes(v_from_gpu[i][j], gpu_idx * element_size_per_gpu);
            memcpy(dst, src, element_size_per_gpu);
          }
        }
      }
    }
    // Only the first 10 pages were inserted; compare that prefix.
    cmp_handle_data(k1, k_from_gpu, 10);
    cmp_handle_data(v1, v_from_gpu, 10);
  }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 09:52:48
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-25 08:38:33
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
// lookup_to_gpu tests: (1) exact-match lookup of a fully inserted sequence,
// (2) a prefix lookup that requests the entire GPU cache capacity, forcing
// page eviction. Both paths verify handle data AND the raw GPU block contents.
int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  auto kvc2 = kvc2::create_kvc2(config);
  std::mt19937 gen(123);
  // Insert 10 pages of deterministic random K/V data.
  auto ids1 = random_ids(10 * config.num_token_per_page, gen);
  auto k1 = random_kvcache(10, gen);
  auto v1 = random_kvcache(10, gen);
  kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
  // complete same
  {
    auto h = kvc2->lookup_to_gpu(test_model_name, test_quant_type, ids1.data(), ids1.size(),
                                 ids1.size() + 5 * config.num_token_per_page);
    auto k = h->handle_data(true);
    auto v = h->handle_data(false);
    cmp_handle_data(k1, k, 10);
    cmp_handle_data(v1, v, 10);
    // Read the matched pages back from every GPU to verify on-device contents.
    auto block_idx = h->get_gpu_block_idx();
    auto [kcache, vcache] = kvc2->get_kvcache();
    auto k_from_gpu = empty_kvcache(15);
    auto v_from_gpu = empty_kvcache(15);
    // Pages are striped across GPUs; each GPU holds element_size_per_gpu bytes per page.
    size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
    size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
    for (size_t i = 0; i < k_from_gpu.size(); i++) {
      for (size_t j = 0; j < block_idx.size(); j++) {
        size_t b_idx = block_idx[j];
        for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
          {
            auto kt = kcache[gpu_idx][i][b_idx].to(torch::kCPU);
            void* src = kt.data_ptr();
            void* dst = offset_by_bytes(k_from_gpu[i][j], gpu_idx * element_size_per_gpu);
            memcpy(dst, src, element_size_per_gpu);
          }
          {
            auto vt = vcache[gpu_idx][i][b_idx].to(torch::kCPU);
            void* src = vt.data_ptr();
            void* dst = offset_by_bytes(v_from_gpu[i][j], gpu_idx * element_size_per_gpu);
            memcpy(dst, src, element_size_per_gpu);
          }
        }
      }
    }
    cmp_handle_data(k1, k_from_gpu, 10);
    cmp_handle_data(v1, v_from_gpu, 10);
  }
  // prefix and evict: look up only 3 pages but ask for the whole GPU cache,
  // which must evict previously resident pages to satisfy the estimate.
  {
    auto h = kvc2->lookup_to_gpu(test_model_name, test_quant_type, ids1.data(), config.num_token_per_page * 3,
                                 config.gpu_cache_config->total_kvcache_pages * config.num_token_per_page);
    auto k = h->handle_data(true);
    auto v = h->handle_data(false);
    cmp_handle_data(k1, k, 3);
    cmp_handle_data(v1, v, 3);
    auto block_idx = h->get_gpu_block_idx();
    auto [kcache, vcache] = kvc2->get_kvcache();
    auto k_from_gpu = empty_kvcache(3);
    auto v_from_gpu = empty_kvcache(3);
    size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
    size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
    for (size_t i = 0; i < k_from_gpu.size(); i++) {
      // Only the 3 matched pages are validated here.
      for (size_t j = 0; j < 3; j++) {
        size_t b_idx = block_idx[j];
        for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
          {
            auto kt = kcache[gpu_idx][i][b_idx].to(torch::kCPU);
            void* src = kt.data_ptr();
            void* dst = offset_by_bytes(k_from_gpu[i][j], gpu_idx * element_size_per_gpu);
            memcpy(dst, src, element_size_per_gpu);
          }
          {
            auto vt = vcache[gpu_idx][i][b_idx].to(torch::kCPU);
            void* src = vt.data_ptr();
            void* dst = offset_by_bytes(v_from_gpu[i][j], gpu_idx * element_size_per_gpu);
            memcpy(dst, src, element_size_per_gpu);
          }
        }
      }
    }
    cmp_handle_data(k1, k_from_gpu, 3);
    cmp_handle_data(v1, v_from_gpu, 3);
  }
  // The scenarios below are disabled for this binary; kept for reference.
  // // complete prefix
  // {
  //   std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
  //   auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(),
  //                         ids2.size() + 3 * config.num_token_per_page);
  //   auto k = h->handle_data(true);
  //   auto v = h->handle_data(false);
  //   cmp_handle_data(k1, k, 3);
  //   cmp_handle_data(v1, v, 3);
  // }
  // // common prefix
  // {
  //   std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
  //   auto rids = random_ids(config.num_token_per_page * 2 + config.num_token_per_page / 2, gen);
  //   ids2.insert(ids2.end(), rids.begin(), rids.end());
  //   auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
  //   auto k = h->handle_data(true);
  //   auto v = h->handle_data(false);
  //   cmp_handle_data(k1, k, 3);
  //   cmp_handle_data(v1, v, 3);
  // }
  // // no prefix
  // {
  //   std::vector<Token> ids2 = random_ids(config.num_token_per_page, gen);
  //   auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
  //   assert(h->matched_length() == 0);
  // }
  // // insert partly new
  // auto k2 = random_kvcache(10, gen);
  // auto v2 = random_kvcache(10, gen);
  // copy_kvcache(k1, k2, 0, 5);
  // copy_kvcache(v1, v2, 0, 5);
  // auto ids2 = random_ids(10 * config.num_token_per_page, gen);
  // for (size_t i = 0; i < 5 * config.num_token_per_page; i++) {
  //   ids2[i] = ids1[i];
  // }
  // kvc2->raw_insert(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
  // // read new part
  // {
  //   std::vector<Token> ids(ids2.begin(), ids2.begin() + 7 * config.num_token_per_page);
  //   auto h = kvc2->lookup(test_model_name, test_quant_type, ids.data(), ids.size(),
  //                         ids.size() + 7 * config.num_token_per_page);
  //   auto k = h->handle_data(true);
  //   auto v = h->handle_data(false);
  //   cmp_handle_data(k, k2, 7);
  //   cmp_handle_data(v, v2, 7);
  // }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 08:48:40
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-22 09:53:06
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
/// Run `f` concurrently on 10 threads and block until all of them finish.
/// @param f  Callable with no arguments; copied once per thread.
template <typename F>
void test_multi(F f) {
  std::vector<std::thread> threads;
  threads.reserve(10);
  for (size_t i = 0; i < 10; i++) {
    // Bug fix: push_back(lambda) does not compile because std::thread's
    // converting constructor is explicit; emplace_back constructs the
    // thread in place from the callable.
    threads.emplace_back(f);
  }
  for (auto& t : threads) {
    t.join();
  }
}
// Concurrent lookup test: 10 OpenMP threads look up the same fully inserted
// 3-page sequence at once. Under contention lookup may return nullptr, which
// is tolerated (logged) rather than treated as failure.
// NOTE(review): test_multi above is defined but not used here — the OpenMP
// pragma drives the concurrency instead; confirm whether test_multi is dead code.
int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  auto kvc2 = kvc2::create_kvc2(config);
  std::mt19937 gen(123);
  // Insert 3 pages of deterministic random K/V data.
  auto ids1 = random_ids(3 * config.num_token_per_page, gen);
  auto k1 = random_kvcache(3, gen);
  auto v1 = random_kvcache(3, gen);
  kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
  // complete same
  {
  #pragma omp parallel for
    for (size_t i = 0; i < 10; i++) {
      auto h = kvc2->lookup(test_model_name, test_quant_type, ids1.data(), ids1.size(),
                            ids1.size() + 10 * config.num_token_per_page);
      if (h == nullptr) {
        // Acceptable under concurrency: resources may be temporarily exhausted.
        SPDLOG_WARN("Thread[{}]: h is nullptr", i);
      } else {
        auto k = h->handle_data(true);
        auto v = h->handle_data(false);
        cmp_handle_data(k1, k, 3);
        cmp_handle_data(v1, v, 3);
      }
    }
  }
  // The scenarios below are disabled for this binary; kept for reference.
  // // complete prefix
  // {
  //   std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
  //   auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size() + 3 *
  //   config.num_token_per_page); auto k = h->handle_data(true); auto v = h->handle_data(false); cmp_handle_data(k1,
  //   k, 3); cmp_handle_data(v1, v, 3);
  // }
  // // common prefix
  // {
  //   std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
  //   auto rids = random_ids(config.num_token_per_page * 2 + config.num_token_per_page / 2, gen);
  //   ids2.insert(ids2.end(), rids.begin(), rids.end());
  //   auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
  //   auto k = h->handle_data(true);
  //   auto v = h->handle_data(false);
  //   cmp_handle_data(k1, k, 3);
  //   cmp_handle_data(v1, v, 3);
  // }
  // // no prefix
  // {
  //   std::vector<Token> ids2 = random_ids(config.num_token_per_page, gen);
  //   auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
  //   assert(h->matched_length() == 0);
  // }
  // // insert partly new
  // auto k2 = random_kvcache(10, gen);
  // auto v2 = random_kvcache(10, gen);
  // copy_kvcache(k1, k2, 0, 5);
  // copy_kvcache(v1, v2, 0, 5);
  // auto ids2 = random_ids(10 * config.num_token_per_page, gen);
  // for (size_t i = 0; i < 5 * config.num_token_per_page; i++) {
  //   ids2[i] = ids1[i];
  // }
  // kvc2->raw_insert(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
  // // read new part
  // {
  //   std::vector<Token> ids(ids2.begin(), ids2.begin() + 7 * config.num_token_per_page);
  //   auto h = kvc2->lookup(test_model_name, test_quant_type, ids.data(), ids.size(), ids.size() + 7 *
  //   config.num_token_per_page); auto k = h->handle_data(true); auto v = h->handle_data(false); cmp_handle_data(k,
  //   k2, 7); cmp_handle_data(v, v2, 7);
  // }
  kvc2->debug();
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 08:29:45
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-22 09:56:12
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
// Key-cache-only test: disables the value cache (v_cache_on = false) and runs
// the standard lookup scenarios — exact match, complete prefix, common prefix,
// no prefix, and partial re-insert — checking only the K side.
int main(int argc, char* argv[]) {
  // Must be configured before init(): turn off the value cache globally.
  qw25_7B_gpu_config.v_cache_on = false;
  config.gpu_cache_config = qw25_7B_gpu_config;
  config.v_cache_on = false;
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  auto kvc2 = kvc2::create_kvc2(config);
  std::mt19937 gen(123);
  // Insert 10 pages of key cache; V side passed as empty.
  auto ids1 = random_ids(10 * config.num_token_per_page, gen);
  auto k1 = random_kvcache(10, gen);
  // auto v1 = random_kvcache(10, gen);
  kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, {});
  // complete same
  {
    auto h = kvc2->lookup(test_model_name, test_quant_type, ids1.data(), ids1.size(),
                          ids1.size() + 10 * config.num_token_per_page);
    auto k = h->handle_data(true);
    cmp_handle_data(k1, k, 10);
  }
  // complete prefix
  {
    std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
    auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(),
                          ids2.size() + 3 * config.num_token_per_page);
    auto k = h->handle_data(true);
    cmp_handle_data(k1, k, 3);
  }
  // common prefix: 3 matching pages followed by random tokens.
  {
    std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
    auto rids = random_ids(config.num_token_per_page * 2 + config.num_token_per_page / 2, gen);
    ids2.insert(ids2.end(), rids.begin(), rids.end());
    auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
    auto k = h->handle_data(true);
    cmp_handle_data(k1, k, 3);
  }
  // no prefix: entirely random ids must match nothing.
  {
    std::vector<Token> ids2 = random_ids(config.num_token_per_page, gen);
    auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
    assert(h->matched_length() == 0);
  }
  // insert partly new: first 5 pages shared with ids1, last 5 pages fresh.
  auto k2 = random_kvcache(10, gen);
  copy_kvcache(k1, k2, 0, 5);
  auto ids2 = random_ids(10 * config.num_token_per_page, gen);
  for (size_t i = 0; i < 5 * config.num_token_per_page; i++) {
    ids2[i] = ids1[i];
  }
  kvc2->raw_insert(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, {});
  // read new part
  {
    std::vector<Token> ids(ids2.begin(), ids2.begin() + 7 * config.num_token_per_page);
    auto h = kvc2->lookup(test_model_name, test_quant_type, ids.data(), ids.size(),
                          ids.size() + 7 * config.num_token_per_page);
    auto k = h->handle_data(true);
    cmp_handle_data(k, k2, 7);
  }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 08:29:45
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-22 09:56:12
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
// Full K+V lookup test: exact match, complete prefix, common prefix, no
// prefix, and partial re-insert, validating both key and value caches.
int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  auto kvc2 = kvc2::create_kvc2(config);
  std::mt19937 gen(123);
  // Insert 10 pages of deterministic random K/V data.
  auto ids1 = random_ids(10 * config.num_token_per_page, gen);
  auto k1 = random_kvcache(10, gen);
  auto v1 = random_kvcache(10, gen);
  kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
  // complete same
  {
    auto h = kvc2->lookup(test_model_name, test_quant_type, ids1.data(), ids1.size(),
                          ids1.size() + 10 * config.num_token_per_page);
    auto k = h->handle_data(true);
    auto v = h->handle_data(false);
    cmp_handle_data(k1, k, 10);
    cmp_handle_data(v1, v, 10);
  }
  // complete prefix
  {
    std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
    auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(),
                          ids2.size() + 3 * config.num_token_per_page);
    auto k = h->handle_data(true);
    auto v = h->handle_data(false);
    cmp_handle_data(k1, k, 3);
    cmp_handle_data(v1, v, 3);
  }
  // common prefix: 3 matching pages followed by random tokens.
  {
    std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
    auto rids = random_ids(config.num_token_per_page * 2 + config.num_token_per_page / 2, gen);
    ids2.insert(ids2.end(), rids.begin(), rids.end());
    auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
    auto k = h->handle_data(true);
    auto v = h->handle_data(false);
    cmp_handle_data(k1, k, 3);
    cmp_handle_data(v1, v, 3);
  }
  // no prefix: entirely random ids must match nothing.
  {
    std::vector<Token> ids2 = random_ids(config.num_token_per_page, gen);
    auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
    assert(h->matched_length() == 0);
  }
  // insert partly new: first 5 pages shared with ids1, last 5 pages fresh.
  auto k2 = random_kvcache(10, gen);
  auto v2 = random_kvcache(10, gen);
  copy_kvcache(k1, k2, 0, 5);
  copy_kvcache(v1, v2, 0, 5);
  auto ids2 = random_ids(10 * config.num_token_per_page, gen);
  for (size_t i = 0; i < 5 * config.num_token_per_page; i++) {
    ids2[i] = ids1[i];
  }
  kvc2->raw_insert(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
  // read new part
  {
    std::vector<Token> ids(ids2.begin(), ids2.begin() + 7 * config.num_token_per_page);
    auto h = kvc2->lookup(test_model_name, test_quant_type, ids.data(), ids.size(),
                          ids.size() + 7 * config.num_token_per_page);
    auto k = h->handle_data(true);
    auto v = h->handle_data(false);
    cmp_handle_data(k, k2, 7);
    cmp_handle_data(v, v2, 7);
  }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 06:00:16
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-22 07:30:46
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
// raw_read test: inserts K/V data, then reads it back through raw_read and
// checks the reported matched length for exact match, complete prefix,
// common prefix, no prefix, and a partial re-insert.
int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  auto kvc2 = kvc2::create_kvc2(config);
  std::mt19937 gen(123);
  // Insert 10 pages of deterministic random K/V data.
  auto ids1 = random_ids(10 * config.num_token_per_page, gen);
  auto k1 = random_kvcache(10, gen);
  auto v1 = random_kvcache(10, gen);
  kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
  // complete same: all tokens must be matched and data identical.
  {
    auto k2 = empty_kvcache(10);
    auto v2 = empty_kvcache(10);
    auto l2 = kvc2->raw_read(test_model_name, test_quant_type, ids1.data(), ids1.size(), k2, v2);
    assert(l2 == ids1.size());
    cmp_handle_data(k1, k2);
    cmp_handle_data(v1, v2);
  }
  // complete prefix: only the first 3 pages are requested and must match.
  {
    auto k2 = empty_kvcache(10);
    auto v2 = empty_kvcache(10);
    std::vector<Token> ids2 = std::vector<Token>(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
    auto l2 = kvc2->raw_read(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
    assert(l2 == 3 * config.num_token_per_page);
    cmp_handle_data(k1, k2, 3);
    cmp_handle_data(v1, v2, 3);
  }
  // common prefix: 3 matching pages then random tokens; match stops at 3 pages.
  {
    auto k2 = empty_kvcache(10);
    auto v2 = empty_kvcache(10);
    std::vector<Token> ids2 = std::vector<Token>(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
    auto rids = random_ids(config.num_token_per_page * 2 + config.num_token_per_page / 2, gen);
    ids2.insert(ids2.end(), rids.begin(), rids.end());
    auto l2 = kvc2->raw_read(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
    assert(l2 == 3 * config.num_token_per_page);
    cmp_handle_data(k1, k2, 3);
    cmp_handle_data(v1, v2, 3);
  }
  // no prefix: random ids match zero tokens.
  {
    auto k2 = empty_kvcache(1);
    auto v2 = empty_kvcache(1);
    std::vector<Token> ids2 = random_ids(config.num_token_per_page, gen);
    auto l2 = kvc2->raw_read(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
    assert(l2 == 0);
  }
  // insert partly new: first 5 pages shared with ids1, last 5 pages fresh.
  auto k2 = random_kvcache(10, gen);
  auto v2 = random_kvcache(10, gen);
  copy_kvcache(k1, k2, 0, 5);
  copy_kvcache(v1, v2, 0, 5);
  auto ids2 = random_ids(10 * config.num_token_per_page, gen);
  for (size_t i = 0; i < 5 * config.num_token_per_page; i++) {
    ids2[i] = ids1[i];
  }
  kvc2->raw_insert(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
  // read new part: 7 pages of the new sequence must all be readable.
  {
    auto k = empty_kvcache(10);
    auto v = empty_kvcache(10);
    std::vector<Token> ids = std::vector<Token>(ids2.begin(), ids2.begin() + 7 * config.num_token_per_page);
    auto l = kvc2->raw_read(test_model_name, test_quant_type, ids.data(), ids.size(), k, v);
    assert(l == 7 * config.num_token_per_page);
    cmp_handle_data(k, k2, 7);
    cmp_handle_data(v, v2, 7);
  }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
#include "kvcache_test_utils.cpp"
// Handle-based API test (older KVC2 interface): raw_insert/raw_read with
// KVCacheHandle objects, checking match_length for the standard scenarios.
int main(int argc, char* argv[]) {
  parse_and_check(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  std::mt19937 gen(123);
  KVC2 kvc2(FLAGS_disk_cache_path);
  // auto io = kvc2.io_dealer->start_io_thread();
  // IO thread is detached here, so it cannot be joined on shutdown; stop()
  // below is what terminates it.
  kvc2.io_dealer->start_io_thread().detach();
  auto h1 = random_kvcache(qwen_cache_info, 10, gen);
  h1.ids = random_ids(10 * BlockLength, gen);
  kvc2.raw_insert(h1);
  // complete same
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    h2.ids = h1.ids;
    kvc2.raw_read(h2);
    assert(static_cast<size_t>(h2.match.match_length) == h1.ids.size());
    cmp_handle_data(h1, h2);
  }
  // complete prefix
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    h2.ids = std::vector<ID>(h1.ids.begin(), h1.ids.begin() + 3 * BlockLength);
    kvc2.raw_read(h2);
    assert(h2.match.match_length == 3 * BlockLength);
    cmp_handle_data(h1, h2, 3);
  }
  // common prefix: 5 matching blocks then random ids; match stops at 5.
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    h2.ids = std::vector<ID>(h1.ids.begin(), h1.ids.begin() + 5 * BlockLength);
    auto rids = random_ids(BlockLength * 2 + BlockLength / 2, gen);
    h2.ids.insert(h2.ids.end(), rids.begin(), rids.end());
    kvc2.raw_read(h2);
    assert(h2.match.match_length == 5 * BlockLength);
    cmp_handle_data(h1, h2, 5);
  }
  // no prefix
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    h2.ids = random_ids(10 * BlockLength, gen);
    kvc2.raw_read(h2);
    assert(h2.match.match_length == 0);
  }
  // insert partly new: first 5 blocks shared with h1, rest fresh.
  auto h2 = random_kvcache(qwen_cache_info, 10, gen);
  copy_kvcache(h1, h2, 0, 5);
  h2.ids = random_ids(10 * BlockLength, gen);
  for (size_t i = 0; i < 5 * BlockLength; i++) {
    h2.ids[i] = h1.ids[i];
  }
  kvc2.raw_insert(h2);
  // read new part: 7 blocks plus one unmatched trailing id.
  {
    auto h = empty_kvcache(qwen_cache_info, 10);
    h.ids = std::vector<ID>(h2.ids.begin(), h2.ids.begin() + 7 * BlockLength);
    h.ids.push_back(123);
    kvc2.raw_read(h);
    assert(h.match.match_length == 7 * BlockLength);
    cmp_handle_data(h, h2, 7);
  }
  kvc2.tree->debug();
  kvc2.io_dealer->stop();
  // io.join();
  SPDLOG_WARN("{} Test Passed", __FILE__);
  return 0;
}
\ No newline at end of file
#include "kvcache_test_utils.cpp"
// Cache-eviction test: inserts 10 sequences, then (1) looks each up one at a
// time — eviction lets all succeed — and (2) holds handles alive so that once
// capacity is reached, later lookups must fail with nullptr.
int main(int argc, char* argv[]) {
  parse_and_check(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  std::mt19937 gen(123);
  KVC2 kvc2(FLAGS_disk_cache_path);
  auto io = kvc2.io_dealer->start_io_thread();
  SPDLOG_WARN("Insert 10 x 10 KVCache");
  std::vector<KVCacheHandle> handles(10);
  for (int i = 0; i < 10; i++) {
    handles[i] = random_kvcache(qwen_cache_info, 10, gen);
    auto& h1 = handles[i];
    h1.ids = random_ids(10 * BlockLength, gen);
    kvc2.raw_insert(h1);
  }
  SPDLOG_WARN("Cache Eviction Test");
  {
    // Handles are dropped immediately, so eviction can always make room.
    for (int i = 0; i < 10; i++) {
      auto& h = handles[i];
      SPDLOG_WARN("Lookup {}", i);
      auto x = kvc2.lookup(qwen_cache_info, h.ids.data(), h.ids.size());
      cmp_handle_data(h, *x);
    }
    SPDLOG_WARN("Simple Eviction OK");
  }
  {
    // Keep the first lookups alive; after 5 the cache is pinned full and
    // further lookups must fail (nullptr) because nothing can be evicted.
    std::vector<std::shared_ptr<KVCacheHandle>> lookup_handles;
    for (int i = 0; i < 10; i++) {
      auto& h = handles[i];
      SPDLOG_WARN("Lookup {}", i);
      auto x = kvc2.lookup(qwen_cache_info, h.ids.data(), h.ids.size());
      if (i >= 5) {
        assert(x == nullptr);
        continue;
      }
      lookup_handles.push_back(x);
      cmp_handle_data(h, *x);
    }
    SPDLOG_WARN("Cannot Eviction OK");
  }
  kvc2.io_dealer->stop();
  io.join();
  SPDLOG_WARN("{} Test Passed", __FILE__);
  return 0;
}
\ No newline at end of file
#include "kvcache_test_utils.cpp"
// Disk + memory test: exercises raw_insert/raw_read against disk, then the
// in-memory block cache via lookup, dumping block-cache state along the way.
int main(int argc, char* argv[]) {
  parse_and_check(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  std::mt19937 gen(123);
  KVC2 kvc2(FLAGS_disk_cache_path);
  auto io = kvc2.io_dealer->start_io_thread();
  SPDLOG_INFO("Disk Test");
  auto h1 = random_kvcache(qwen_cache_info, 10, gen);
  h1.ids = random_ids(10 * BlockLength, gen);
  kvc2.raw_insert(h1);
  // complete same
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    h2.ids = h1.ids;
    kvc2.raw_read(h2);
    assert(static_cast<size_t>(h2.match.match_length) == h1.ids.size());
    cmp_handle_data(h1, h2);
  }
  // complete prefix
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    h2.ids = std::vector<ID>(h1.ids.begin(), h1.ids.begin() + 3 * BlockLength);
    kvc2.raw_read(h2);
    assert(h2.match.match_length == 3 * BlockLength);
    cmp_handle_data(h1, h2, 3);
  }
  // common prefix: 5 matching blocks then random ids; match stops at 5.
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    h2.ids = std::vector<ID>(h1.ids.begin(), h1.ids.begin() + 5 * BlockLength);
    auto rids = random_ids(BlockLength * 2 + BlockLength / 2, gen);
    h2.ids.insert(h2.ids.end(), rids.begin(), rids.end());
    kvc2.raw_read(h2);
    assert(h2.match.match_length == 5 * BlockLength);
    cmp_handle_data(h1, h2, 5);
  }
  // no prefix
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    h2.ids = random_ids(10 * BlockLength, gen);
    kvc2.raw_read(h2);
    assert(h2.match.match_length == 0);
  }
  // insert partly new: first 5 blocks shared with h1, rest fresh.
  auto h2 = random_kvcache(qwen_cache_info, 10, gen);
  copy_kvcache(h1, h2, 0, 5);
  h2.ids = random_ids(10 * BlockLength, gen);
  for (size_t i = 0; i < 5 * BlockLength; i++) {
    h2.ids[i] = h1.ids[i];
  }
  kvc2.raw_insert(h2);
  // read new part: 7 blocks plus one unmatched trailing id.
  {
    auto h = empty_kvcache(qwen_cache_info, 10);
    h.ids = std::vector<ID>(h2.ids.begin(), h2.ids.begin() + 7 * BlockLength);
    h.ids.push_back(123);
    kvc2.raw_read(h);
    assert(h.match.match_length == 7 * BlockLength);
    cmp_handle_data(h, h2, 7);
  }
  SPDLOG_WARN("Memory Test");
  {
    // Full lookup should hit the in-memory block cache.
    auto h = kvc2.lookup(qwen_cache_info, h1.ids.data(), h1.ids.size());
    assert(h);
    cmp_handle_data(h1, *h);
    kvc2.block_cache->debug();
  }
  // Debug dump after the handle above has been released.
  kvc2.block_cache->debug();
  {
    // Partial (5-block) lookup should also be served from cache.
    auto h = kvc2.lookup(qwen_cache_info, h1.ids.data(), 5 * BlockLength);
    assert(h);
    cmp_handle_data(h1, *h, 5);
    kvc2.block_cache->debug();
  }
  kvc2.block_cache->debug();
  kvc2.io_dealer->stop();
  io.join();
  SPDLOG_WARN("{} Test Passed", __FILE__);
  return 0;
}
\ No newline at end of file
#include "kvcache_test_utils.cpp"
// Persistence test: insert 10 sequences into one KVC2 instance and save();
// then construct a fresh instance from the same path, load(), and verify all
// raw_read scenarios still pass against the reloaded data.
int main(int argc, char* argv[]) {
  parse_and_check(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  std::mt19937 gen(123);
  std::vector<KVCacheHandle> handles(10);
  {
    // First instance: populate and persist.
    KVC2 kvc2(FLAGS_disk_cache_path);
    auto io = kvc2.io_dealer->start_io_thread();
    SPDLOG_WARN("Insert 10 x 10 KVCache");
    for (int i = 0; i < 10; i++) {
      handles[i] = random_kvcache(qwen_cache_info, 10, gen);
      auto& h1 = handles[i];
      h1.ids = random_ids(10 * BlockLength, gen);
      kvc2.raw_insert(h1);
    }
    kvc2.save();
    kvc2.tree->debug();
    kvc2.io_dealer->stop();
    io.join();
  }
  {
    // Second instance: reload from disk and re-run the read scenarios.
    KVC2 kvc2(FLAGS_disk_cache_path);
    auto io = kvc2.io_dealer->start_io_thread();
    kvc2.load();
    kvc2.tree->debug();
    auto& h1 = handles[0];
    // complete same
    {
      auto h2 = empty_kvcache(qwen_cache_info, 10);
      h2.ids = h1.ids;
      kvc2.raw_read(h2);
      assert(static_cast<size_t>(h2.match.match_length) == h1.ids.size());
      cmp_handle_data(h1, h2);
    }
    // complete prefix
    {
      auto h2 = empty_kvcache(qwen_cache_info, 10);
      h2.ids = std::vector<ID>(h1.ids.begin(), h1.ids.begin() + 3 * BlockLength);
      kvc2.raw_read(h2);
      assert(h2.match.match_length == 3 * BlockLength);
      cmp_handle_data(h1, h2, 3);
    }
    // common prefix: 5 matching blocks then random ids; match stops at 5.
    {
      auto h2 = empty_kvcache(qwen_cache_info, 10);
      h2.ids = std::vector<ID>(h1.ids.begin(), h1.ids.begin() + 5 * BlockLength);
      auto rids = random_ids(BlockLength * 2 + BlockLength / 2, gen);
      h2.ids.insert(h2.ids.end(), rids.begin(), rids.end());
      kvc2.raw_read(h2);
      assert(h2.match.match_length == 5 * BlockLength);
      cmp_handle_data(h1, h2, 5);
    }
    // no prefix
    {
      auto h2 = empty_kvcache(qwen_cache_info, 10);
      h2.ids = random_ids(10 * BlockLength, gen);
      kvc2.raw_read(h2);
      assert(h2.match.match_length == 0);
    }
    // insert partly new into the reloaded store.
    auto h2 = random_kvcache(qwen_cache_info, 10, gen);
    copy_kvcache(h1, h2, 0, 5);
    h2.ids = random_ids(10 * BlockLength, gen);
    for (size_t i = 0; i < 5 * BlockLength; i++) {
      h2.ids[i] = h1.ids[i];
    }
    kvc2.raw_insert(h2);
    // read new part: 7 blocks plus one unmatched trailing id.
    {
      auto h = empty_kvcache(qwen_cache_info, 10);
      h.ids = std::vector<ID>(h2.ids.begin(), h2.ids.begin() + 7 * BlockLength);
      h.ids.push_back(123);
      kvc2.raw_read(h);
      assert(h.match.match_length == 7 * BlockLength);
      cmp_handle_data(h, h2, 7);
    }
    kvc2.io_dealer->stop();
    io.join();
  }
  SPDLOG_WARN("{} Test Passed", __FILE__);
  return 0;
}
\ No newline at end of file
#include <iostream>
#include <thread>
#include <vector>
#include <random>
#include <unistd.h>
#include "page_aligned_memory_pool.cpp"
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
// Worker body for the pool stress test: repeatedly allocate a random
// page-multiple sized chunk from the shared pool and immediately return it.
void thread_task(PageAlignedMemoryPool& pool) {
  std::mt19937 rng(123);
  // Chunks deliberately freed right away; this list stays empty.
  std::vector<std::pair<void*, size_t>> held;
  const size_t kIterations = 40000;
  for (size_t iter = 0; iter != kIterations; ++iter) {
    // Random size: 1..100 units of 16 KiB (4096 * 4 bytes).
    const size_t bytes = (rng() % 100 + 1) * 4096 * 4;
    if (void* chunk = pool.alloc(bytes)) {
      pool.free(chunk, bytes);
    }
  }
  // Drain anything still held (no-op with the immediate-free strategy above).
  for (auto& entry : held) {
    pool.free(entry.first, entry.second);
  }
}
int main(int argc, char* argv[]) {
spdlog::set_level(spdlog::level::debug);
// 创建一个内存池
PageAlignedMemoryPool pool(40ll * 1024 * 1024 * 1024); // 40 G
// 创建线程
const int num_threads = 32;
std::vector<std::thread> threads;
for (int i = 0; i < num_threads; ++i) {
threads.emplace_back(thread_task, std::ref(pool));
}
// 等待所有线程完成
for (auto& t : threads) {
t.join();
}
// 输出调试信息
std::cout << pool.debug() << std::endl;
return 0;
}
\ No newline at end of file
# Lookup-only smoke test: open an existing kvc2 store, load its persisted
# index, and look up a fixed random id sequence at growing prefix lengths.
# NOTE(review): indentation below was reconstructed from a whitespace-mangled
# source — verify block structure against the original file.
import sys
sys.path.append('./build')
sys.path.append('./src')
import torch
import kvc2_ext
from kvc2_utils import get_tensor_from_data_ptr
# Create a kvc2 instance
path = "/mnt/data/kvc2"
kvc2_instance = kvc2_ext.create_kvc2(path,int(10e9)) # 10 G memory pool
kvc2_ext.load(kvc2_instance)
# Start IO thread
print("Start IO thread")
kvc2_ext.start_io_thread(kvc2_instance)
print("IO thread started")
# Create CacheInfoInput
test_info = kvc2_ext.CacheInfoInput()
test_info.model_type = kvc2_ext.ModelType.MT_DeepseekV2
test_info.cache_type = kvc2_ext.CacheType.CT_KeyCache
test_info.quant_type = kvc2_ext.QuantType.QT_F32
print("Element size: ", test_info.element_size())
# Generate random test IDs (length = 2560)
# Fixed seed: must match the ids used by the insert script that populated the store.
torch.manual_seed(123)
length = 2560
test_id = torch.randint(0, 65536, (length,), dtype=torch.uint16).contiguous()
block_count = (length+255) // 256
# print("Test ID: ", test_id)
# Generate test data based on element size and hidden layer count
element_size = test_info.element_size()
hidden_layer_count = test_info.hidden_layer_count()
def read_cmp_and_release(kvc2_instance,cache_info,ids,length):
    # Look up `length` tokens, report the matched length, materialize the
    # matched data as tensors, then release the handle.
    handle = kvc2_ext.lookup(kvc2_instance, cache_info, ids, length)
    if kvc2_ext.is_nullptr(handle):
        print("Handle is nullptr.")
        exit()
    matched_length = kvc2_ext.matched_length(handle)
    matched_data = kvc2_ext.handle_data(handle)
    print('Matched length: ', matched_length)
    if matched_length >0:
        print(f'First layer address {[hex(x) for x in matched_data[0]]}')
        read_data = get_tensor_from_data_ptr(matched_data,element_size)
        print("Just read check ok.")
    kvc2_ext.release(handle)
# Probe the store at every 128-token prefix length up to the full sequence.
l = 128
while l<=length:
    read_cmp_and_release(kvc2_instance,test_info,test_id.data_ptr(),l)
    l+=128
kvc2_ext.destroy_kvc2(kvc2_instance)
print("Test completed successfully.")
# Insert-then-lookup test: write random key-cache data for a fixed id
# sequence, then look it up at growing prefix lengths and byte-compare the
# read-back tensors; finally persist the store with save().
# NOTE(review): indentation below was reconstructed from a whitespace-mangled
# source — verify block structure against the original file.
import sys
sys.path.append('./build')
sys.path.append('./src')
import torch
import kvc2_ext
from kvc2_utils import alloc_aligned_cache,dealloc_aligned_cache,get_tensor_ptr,get_tensor_from_data_ptr
# Create a kvc2 instance
path = "/mnt/data/kvc2"
kvc2_instance = kvc2_ext.create_kvc2(path,int(10e9)) # 10 G memory pool
# Start IO thread
print("Start IO thread")
kvc2_ext.start_io_thread(kvc2_instance)
print("IO thread started")
# Create CacheInfoInput
test_info = kvc2_ext.CacheInfoInput()
test_info.model_type = kvc2_ext.ModelType.MT_DeepseekV2
test_info.cache_type = kvc2_ext.CacheType.CT_KeyCache
test_info.quant_type = kvc2_ext.QuantType.QT_F32
print("Element size: ", test_info.element_size())
# Generate random test IDs (length = 2560)
torch.manual_seed(123)
length = 2560
test_id = torch.randint(0, 65536, (length,), dtype=torch.uint16).contiguous()
block_count = (length+255) // 256
# print("Test ID: ", test_id)
# Generate test data based on element size and hidden layer count
element_size = test_info.element_size()
hidden_layer_count = test_info.hidden_layer_count()
# Page-aligned host buffers: one block of `element_size` bytes per layer/page.
write_data,write_data_mem = alloc_aligned_cache(hidden_layer_count,block_count,element_size)
# print(test_data,test_data_mem)
print('Generate Insert Data')
for layer in write_data:
    for data in layer:
        random_values = torch.randint(0, 256, (element_size,), dtype=torch.uint8)
        data.copy_(random_values)
print('Insert New data')
# Insert raw data
kvc2_ext.raw_insert(kvc2_instance, test_info, test_id.data_ptr(), length, get_tensor_ptr(write_data))
def read_cmp_and_release(kvc2_instance,cache_info,ids,length):
    # Look up `length` tokens and assert the read-back tensors equal the
    # data written above, then release the handle.
    handle = kvc2_ext.lookup(kvc2_instance, cache_info, ids, length)
    if kvc2_ext.is_nullptr(handle):
        print("Handle is nullptr.")
        exit()
    matched_length = kvc2_ext.matched_length(handle)
    matched_data = kvc2_ext.handle_data(handle)
    print('Matched length: ', matched_length)
    if matched_length >0:
        print(f'First layer address {[hex(x) for x in matched_data[0]]}')
        read_data = get_tensor_from_data_ptr(matched_data,element_size)
        for layer_w,layer_r in zip(write_data,read_data):
            for data_w,data_r in zip(layer_w,layer_r):
                # print(data_w,data_r)
                assert torch.equal(data_w,data_r)
        print("Lookup read check ok.")
    kvc2_ext.release(handle)
# Probe every 128-token prefix length up to the full sequence.
l = 128
while l<=length:
    read_cmp_and_release(kvc2_instance,test_info,test_id.data_ptr(),l)
    l+=128
dealloc_aligned_cache(write_data_mem)
# Persist the index so the lookup-only script can reload it.
kvc2_ext.save(kvc2_instance)
kvc2_ext.destroy_kvc2(kvc2_instance)
print("Test completed successfully.")
import sys
sys.path.append('./build')
sys.path.append('./src')
import torch
import kvc2_ext
from kvc2_utils import alloc_aligned_cache,dealloc_aligned_cache,get_tensor_ptr,get_tensor_from_data_ptr
# Create a kvc2 instance backed by on-disk storage at `path`.
path = "/mnt/data/kvc2"
kvc2_instance = kvc2_ext.create_kvc2(path,int(10e9)) # 10 G memory pool
# Start IO thread
print("Start IO thread")
kvc2_ext.start_io_thread(kvc2_instance)
print("IO thread started")
# Create CacheInfoInput: DeepseekV2 model, key cache, float32 quantization.
test_info = kvc2_ext.CacheInfoInput()
test_info.model_type = kvc2_ext.ModelType.MT_DeepseekV2
test_info.cache_type = kvc2_ext.CacheType.CT_KeyCache
test_info.quant_type = kvc2_ext.QuantType.QT_F32
print("Element size: ", test_info.element_size())
# Generate random test IDs (length = 2560)
# NOTE(review): unlike the seeded sibling test, no torch.manual_seed is set
# here, so ids/data differ between runs — confirm this is intended.
length = 2560
test_id = torch.randint(0, 65536, (length,), dtype=torch.uint16).contiguous()
block_count = (length+255) // 256  # ceil-divide: presumably 256 tokens per block — TODO confirm
# print("Test ID: ", test_id)
# Generate test data based on element size and hidden layer count
element_size = test_info.element_size()
hidden_layer_count = test_info.hidden_layer_count()
write_data,write_data_mem = alloc_aligned_cache(hidden_layer_count,block_count,element_size)
# print(test_data,test_data_mem)
print('Generate Insert Data')
# Fill every block of every layer with random bytes as the reference payload.
for layer in write_data:
    for data in layer:
        random_values = torch.randint(0, 256, (element_size,), dtype=torch.uint8)
        data.copy_(random_values)
print('Insert New data')
# Insert raw data
kvc2_ext.raw_insert(kvc2_instance, test_info, test_id.data_ptr(), length, get_tensor_ptr(write_data))
# Look up the full id sequence that was just inserted and read it back.
handle = kvc2_ext.lookup(kvc2_instance, test_info, test_id.data_ptr(), length)
matched_length = kvc2_ext.matched_length(handle)
matched_data = kvc2_ext.handle_data(handle)
print('Matched length: ', matched_length)
print(f'Match data layer {len(matched_data)}')
print(f'Match layer block count {len(matched_data[0])}')
# Wrap the returned raw pointers as tensors and compare with what was written.
read_data = get_tensor_from_data_ptr(matched_data,element_size)
for layer_w,layer_r in zip(write_data,read_data):
    for data_w,data_r in zip(layer_w,layer_r):
        # print(data_w,data_r)
        assert torch.equal(data_w,data_r)
print("Lookup read check ok.")
# NOTE(review): `handle` is never released and destroy_kvc2 is never called
# here, while the sibling test releases its handles — confirm this is intended.
dealloc_aligned_cache(write_data_mem)
kvc2_ext.save(kvc2_instance)
print("Test completed successfully.")
import sys
sys.path.append('./build')
sys.path.append('./src')
import torch
import kvc2_ext
from kvc2_utils import alloc_aligned_cache,dealloc_aligned_cache,get_tensor_ptr
# Create a kvc2 instance backed by on-disk storage at `path`.
path = "/mnt/data/kvc2"
kvc2_instance = kvc2_ext.create_kvc2(path,int(10e9)) # 10 G memory pool
# Start IO thread
print("Start IO thread")
kvc2_ext.start_io_thread(kvc2_instance)
print("IO thread started")
# Create CacheInfoInput: DeepseekV2 model, key cache, float32 quantization.
test_info = kvc2_ext.CacheInfoInput()
test_info.model_type = kvc2_ext.ModelType.MT_DeepseekV2
test_info.cache_type = kvc2_ext.CacheType.CT_KeyCache
test_info.quant_type = kvc2_ext.QuantType.QT_F32
print("Element size: ", test_info.element_size())
# Generate random test IDs (length = 2560)
length = 2560
test_id = torch.randint(0, 65536, (length,), dtype=torch.uint16).contiguous()
block_count = (length+255) // 256  # ceil-divide: presumably 256 tokens per block — TODO confirm
# print("Test ID: ", test_id)
# Generate test data based on element size and hidden layer count
element_size = test_info.element_size()
hidden_layer_count = test_info.hidden_layer_count()
write_data,write_data_mem = alloc_aligned_cache(hidden_layer_count,block_count,element_size)
# print(test_data,test_data_mem)
print('Generate Insert Data')
# Fill every block of every layer with random bytes as the reference payload.
for layer in write_data:
    for data in layer:
        random_values = torch.randint(0, 256, (element_size,), dtype=torch.uint8)
        data.copy_(random_values)
print('Insert New data')
# Insert raw data
kvc2_ext.raw_insert(kvc2_instance, test_info, test_id.data_ptr(), length, get_tensor_ptr(write_data))
# Allocate a second aligned buffer and read the same ids back via raw_read.
read_data,read_data_mem = alloc_aligned_cache(hidden_layer_count,block_count,element_size)
print('Raw read')
matched_length = kvc2_ext.raw_read(kvc2_instance, test_info, test_id.data_ptr(), length,get_tensor_ptr(read_data))
print('Matched length: ', matched_length)
# Verify the read-back bytes match what was inserted.
for layer_w,layer_r in zip(write_data,read_data):
    for data_w,data_r in zip(layer_w,layer_r):
        # print(data_w,data_r)
        assert torch.equal(data_w,data_r)
print("Raw read check ok.")
# Tear down: free both buffers and persist the cache.
dealloc_aligned_cache(write_data_mem)
dealloc_aligned_cache(read_data_mem)
kvc2_ext.save(kvc2_instance)
print("Test completed successfully.")
import ctypes
import torch
def aligned_tensor(size, alignment=4096):
    """Allocate `size` bytes with `alignment`-byte alignment and wrap them
    in a 1-D torch.int8 tensor (zero-copy view of the allocation).

    Args:
        size: number of bytes to allocate; must be positive.
        alignment: requested alignment in bytes; must be a power of two
            (posix_memalign additionally requires a multiple of sizeof(void*)).

    Returns:
        (tensor, mem): `tensor` views the allocation; `mem` is the ctypes
        pointer that owns it. The caller must release it with
        ctypes.CDLL(None).free(mem), and only after `tensor` is no longer
        used — the tensor dangles once the memory is freed.

    Raises:
        ValueError: on non-positive size, invalid alignment, or if the
            returned pointer is (unexpectedly) misaligned.
        MemoryError: if posix_memalign reports failure.
    """
    if size <= 0:
        raise ValueError(f"size must be positive, got {size}")
    if alignment <= 0 or (alignment & (alignment - 1)) != 0:
        raise ValueError(f"alignment must be a power of two, got {alignment}")
    mem = ctypes.c_void_p()
    # posix_memalign(&mem, alignment, size) returns 0 on success, errno otherwise.
    error_code = ctypes.CDLL(None).posix_memalign(
        ctypes.byref(mem), ctypes.c_size_t(alignment), ctypes.c_size_t(size)
    )
    if error_code != 0:
        raise MemoryError(f"posix_memalign failed with error code {error_code}")
    # View the raw allocation as a ctypes array so torch can wrap it in place.
    raw_array = (ctypes.c_int8 * size).from_address(mem.value)
    tensor = torch.frombuffer(raw_array, dtype=torch.int8)
    if tensor.data_ptr() % alignment != 0:
        raise ValueError(f"Tensor data_ptr {tensor.data_ptr()} is not aligned to {alignment} bytes")
    return tensor, mem
# Demo: allocate ~5 MB of 4096-byte-aligned memory and inspect the wrapper.
size = 5124380
tensor, mem_ptr = aligned_tensor(size, alignment=4096)
print(f"Tensor: {tensor}, size: {tensor.size()}, dataptr: {tensor.data_ptr()}")
print(f"Tensor memory alignment: {tensor.data_ptr() % 4096 == 0}")
print(f"Allocated memory address: {mem_ptr.value}")
# Release the backing memory. NOTE(review): `tensor` still points at the
# freed buffer after this call — it must not be used again.
ctypes.CDLL(None).free(mem_ptr)
#include <cuda_runtime.h>
#include <functional>
#include <iostream>
#include <stdexcept>
#include <vector>
// Spreads batched async CUDA memcpys over a fixed pool of streams
// (round-robin) and runs a user callback on the stream once a batch's
// copies have been enqueued ahead of it.
class CudaStreamManager {
 public:
  // Creates `num_streams` CUDA streams; throws std::runtime_error if any
  // stream creation fails (already-created streams are destroyed first).
  CudaStreamManager(int num_streams);
  ~CudaStreamManager();

  // One batched copy request: parallel arrays of host/device pointers and
  // byte sizes, a single copy direction, and a completion callback.
  struct Request {
    std::vector<void*> host_mem_addresses;
    std::vector<void*> device_mem_addresses;
    std::vector<size_t> sizes;
    cudaMemcpyKind direction;
    std::function<void()> callback;
  };

  // Enqueues all copies of `request` on the next stream in round-robin
  // order, then schedules the callback behind them. Throws on CUDA errors.
  // NOTE(review): no locking around the round-robin cursor — presumably
  // single-threaded use; confirm with callers.
  void submitRequest(const Request& request);

 private:
  int num_streams_;
  std::vector<cudaStream_t> streams_;
  int next_stream_index_;  // round-robin cursor for stream selection
};
// Builds the stream pool one stream at a time. If any creation fails,
// every stream created so far is destroyed before throwing, so the
// half-constructed object does not leak CUDA resources.
CudaStreamManager::CudaStreamManager(int num_streams) : num_streams_(num_streams), next_stream_index_(0) {
  streams_.reserve(num_streams_);
  for (int i = 0; i < num_streams_; ++i) {
    cudaStream_t stream;
    cudaError_t status = cudaStreamCreate(&stream);
    if (status != cudaSuccess) {
      std::cerr << "Failed to create CUDA stream: " << cudaGetErrorString(status) << std::endl;
      for (cudaStream_t created : streams_) {
        cudaStreamDestroy(created);
      }
      throw std::runtime_error("Failed to create CUDA stream");
    }
    streams_.push_back(stream);
  }
}
// Destroys every stream created by the constructor.
CudaStreamManager::~CudaStreamManager() {
  for (cudaStream_t stream : streams_) {
    cudaStreamDestroy(stream);
  }
}
// Enqueues all copies of `request` on the next round-robin stream, then
// schedules the request's callback behind them via cudaLaunchHostFunc.
// Throws std::runtime_error on any CUDA error.
// NOTE(review): next_stream_index_ is updated without synchronization —
// assumes single-threaded submission; confirm with callers.
void CudaStreamManager::submitRequest(const Request& request) {
  int stream_index = next_stream_index_;
  cudaStream_t stream = streams_[stream_index];
  next_stream_index_ = (next_stream_index_ + 1) % num_streams_;

  // Enqueue one async copy per page; all share the same stream and direction.
  size_t num_transfers = request.host_mem_addresses.size();
  for (size_t i = 0; i < num_transfers; ++i) {
    cudaError_t err = cudaMemcpyAsync(request.device_mem_addresses[i], request.host_mem_addresses[i], request.sizes[i],
                                      request.direction, stream);
    if (err != cudaSuccess) {
      std::cerr << "cudaMemcpyAsync failed: " << cudaGetErrorString(err) << std::endl;
      throw std::runtime_error("cudaMemcpyAsync failed");
    }
  }

  // The callback must outlive this function, so it is heap-allocated and
  // deleted by the host-func trampoline after it runs.
  struct CallbackData {
    std::function<void()> callback;
  };
  CallbackData* cb_data = new CallbackData{request.callback};
  cudaError_t err = cudaLaunchHostFunc(
      stream,
      [](void* data) {
        CallbackData* cb = static_cast<CallbackData*>(data);
        cb->callback();
        delete cb;
      },
      cb_data);
  if (err != cudaSuccess) {
    // FIX: the trampoline will never run on this path, so reclaim cb_data
    // here — the original code leaked it when cudaLaunchHostFunc failed.
    delete cb_data;
    std::cerr << "cudaLaunchHostFunc failed: " << cudaGetErrorString(err) << std::endl;
    throw std::runtime_error("cudaLaunchHostFunc failed");
  }
}
// Example usage
// Single-device smoke test: copy ten 4KB host pages to the device through
// CudaStreamManager and report completion via the request callback.
int main() {
  try {
    CudaStreamManager stream_manager(4);  // Create a manager with 4 streams

    // Prepare host and device memory
    const size_t num_pages = 10;
    std::vector<void*> host_mem_addresses(num_pages);
    std::vector<void*> device_mem_addresses(num_pages);
    std::vector<size_t> sizes(num_pages, 4096);  // 4KB pages

    // Allocate host memory
    for (size_t i = 0; i < num_pages; ++i) {
      host_mem_addresses[i] = malloc(4096);
      if (!host_mem_addresses[i]) {
        throw std::runtime_error("Failed to allocate host memory");
      }
      // Initialize data if necessary
    }

    // Allocate device memory
    for (size_t i = 0; i < num_pages; ++i) {
      cudaError_t err = cudaMalloc(&device_mem_addresses[i], 4096);
      if (err != cudaSuccess) {
        std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
        throw std::runtime_error("cudaMalloc failed");
      }
    }

    // Create a request: copy every page host -> device in one batch.
    CudaStreamManager::Request request;
    request.host_mem_addresses = host_mem_addresses;
    request.device_mem_addresses = device_mem_addresses;
    request.sizes = sizes;
    request.direction = cudaMemcpyHostToDevice;
    request.callback = []() { std::cout << "Data transfer completed!" << std::endl; };

    // Submit the request
    stream_manager.submitRequest(request);

    // Wait for all streams to complete
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
      std::cerr << "cudaDeviceSynchronize failed: " << cudaGetErrorString(err) << std::endl;
      throw std::runtime_error("cudaDeviceSynchronize failed");
    }

    // Clean up
    // NOTE(review): allocations made before a throw are leaked on the
    // exception path; tolerable for a test binary since the process exits.
    for (size_t i = 0; i < num_pages; ++i) {
      free(host_mem_addresses[i]);
      cudaFree(device_mem_addresses[i]);
    }
  } catch (const std::exception& e) {
    std::cerr << "Exception: " << e.what() << std::endl;
    return 1;
  }
  return 0;
}
#include "cuda_stream_manager.hh"

#include <cuda_runtime.h>

#include <chrono>
#include <functional>
#include <iostream>
#include <stdexcept>
#include <thread>
#include <vector>
// Multi-device smoke test: broadcasts the same ten host pages to every CUDA
// device through CudaStreamManager and waits for per-device callbacks.
int main() {
  try {
    // Discover the available CUDA devices.
    int num_devices = 0;
    cudaError_t err = cudaGetDeviceCount(&num_devices);
    if (err != cudaSuccess) {
      std::cerr << "cudaGetDeviceCount failed: " << cudaGetErrorString(err) << std::endl;
      return 1;
    }
    if (num_devices < 1) {
      std::cerr << "未找到 CUDA 设备。" << std::endl;
      return 1;
    }
    std::vector<size_t> device_ids;
    for (int i = 0; i < num_devices; ++i) {
      device_ids.push_back(i);
    }
    const size_t num_pages = 10;
    const size_t page_size = 4096;  // 4KB per page
    // Create a CudaStreamManager instance managing all devices.
    CudaStreamManager stream_manager(device_ids, 4);
    // Host/device memory bookkeeping: one pointer list per device.
    std::vector<std::vector<void*>> host_mem_addresses(num_devices);
    std::vector<std::vector<void*>> device_mem_addresses(num_devices);
    // Allocate host memory.
    for (size_t i = 0; i < num_pages; ++i) {
      void* host_ptr = malloc(page_size);
      if (!host_ptr) {
        throw std::runtime_error("Failed to allocate host memory");
      }
      // Initialize data here if needed.
      // Register the SAME host page in every device's list (shared source).
      for (int device_id = 0; device_id < num_devices; ++device_id) {
        host_mem_addresses[device_id].push_back(host_ptr);
      }
    }
    // Allocate device memory on each device.
    for (int device_id = 0; device_id < num_devices; ++device_id) {
      err = cudaSetDevice(device_id);
      if (err != cudaSuccess) {
        std::cerr << "cudaSetDevice failed: " << cudaGetErrorString(err) << std::endl;
        throw std::runtime_error("cudaSetDevice failed");
      }
      for (size_t i = 0; i < num_pages; ++i) {
        void* device_ptr;
        err = cudaMalloc(&device_ptr, page_size);
        if (err != cudaSuccess) {
          std::cerr << "cudaMalloc failed on device " << device_id << ": " << cudaGetErrorString(err) << std::endl;
          throw std::runtime_error("cudaMalloc failed");
        }
        device_mem_addresses[device_id].push_back(device_ptr);
      }
    }
    // Build and submit one host-to-device request per device.
    for (int device_id = 0; device_id < num_devices; ++device_id) {
      auto request = std::shared_ptr<CudaStreamManager::Request>(new CudaStreamManager::Request);
      request->device_id = device_id;
      request->host_mem_addresses = host_mem_addresses[device_id];
      request->device_mem_addresses = device_mem_addresses[device_id];
      request->sizes = std::vector<size_t>(num_pages, page_size);
      request->direction = cudaMemcpyHostToDevice;
      request->callback = [device_id]() {
        std::cout << "Device " << device_id << " data transfer completed!" << std::endl;
      };
      stream_manager.submitRequest(request);
    }
    // Sleep long enough for all requests to be processed.
    // NOTE(review): time-based waiting is racy; a real application should
    // use a proper synchronization mechanism instead.
    std::this_thread::sleep_for(std::chrono::seconds(5));
    // Clean up host memory.
    for (size_t i = 0; i < num_pages; ++i) {
      free(host_mem_addresses[0][i]);  // all devices share the same host pages; free once
    }
    // Clean up device memory.
    for (int device_id = 0; device_id < num_devices; ++device_id) {
      err = cudaSetDevice(device_id);
      if (err != cudaSuccess) {
        std::cerr << "cudaSetDevice failed during cleanup: " << cudaGetErrorString(err) << std::endl;
        continue;
      }
      for (void* ptr : device_mem_addresses[device_id]) {
        cudaFree(ptr);
      }
    }
  } catch (const std::exception& e) {
    std::cerr << "异常: " << e.what() << std::endl;
    return 1;
  }
  return 0;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment