OpenDAS / ktransformers · Commits

Commit 877aec85 (unverified), authored Apr 09, 2025 by Yuhao Tsui, committed by GitHub on Apr 09, 2025

    Merge branch 'kvcache-ai:main' into main

Parents: 84164f58, 9037bf30
Changes: 251 · Showing 20 changed files with 3473 additions and 0 deletions (+3473, -0)
csrc/balance_serve/kvc2/src/cache_entry.hh                 +182   -0
csrc/balance_serve/kvc2/src/common.h                       +0     -0
csrc/balance_serve/kvc2/src/cuda_stream_manager.cpp        +135   -0
csrc/balance_serve/kvc2/src/cuda_stream_manager.hh         +54    -0
csrc/balance_serve/kvc2/src/defs.h                         +35    -0
csrc/balance_serve/kvc2/src/gpu_cache.cpp                  +282   -0
csrc/balance_serve/kvc2/src/gpu_cache.hh                   +74    -0
csrc/balance_serve/kvc2/src/hasher.hpp                     +40    -0
csrc/balance_serve/kvc2/src/io_helper.hpp                  +155   -0
csrc/balance_serve/kvc2/src/kvc2.h                         +138   -0
csrc/balance_serve/kvc2/src/kvc2_utils.py                  +64    -0
csrc/balance_serve/kvc2/src/metrics.cpp                    +141   -0
csrc/balance_serve/kvc2/src/metrics.h                      +77    -0
csrc/balance_serve/kvc2/src/model_config.h                 +119   -0
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.cpp   +125   -0
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.h     +54    -0
csrc/balance_serve/kvc2/src/prefix.cpp                     +1744  -0
csrc/balance_serve/kvc2/src/utils/all.hpp                  +3     -0
csrc/balance_serve/kvc2/src/utils/arithmetic.hpp           +14    -0
csrc/balance_serve/kvc2/src/utils/easy_format.hpp          +37    -0
csrc/balance_serve/kvc2/src/cache_entry.hh
0 → 100644
#ifndef __CACHE_ENTRY_HH_
#define __CACHE_ENTRY_HH_

#include "async_store.hh"
#include "cuda_stream_manager.hh"
#include "defs.h"
#include "hasher.hpp"
#include "io_helper.hpp"
#include "page_aligned_memory_pool.h"
#include "utils/periodic_task.hpp"

#include <atomic>
#include <list>
#include <memory>

#include "utils/mutex_extend.hpp"

namespace kvc2 {

using CacheBlockKey = TokensHash;

class CacheEntryManager;
struct DoubleVerticalBlocksHandle;
class GPUPageCache;

struct ConcurrentControlUnit {
  std::atomic_size_t ref_count = 0;
  std::atomic_bool dirty = false;
  TransferControl<std::mutex> tc;

  bool can_desert();
  void debug();
};

enum IOOption {
  IO_ForceRead,
  IO_ForceWrite,
  IO_Read,
  IO_Write,
};

inline std::string to_string(IOOption op) {
  switch (op) {
    case IO_ForceRead:
      return "IO_ForceRead";
    case IO_ForceWrite:
      return "IO_ForceWrite";
    case IO_Read:
      return "IO_Read";
    case IO_Write:
      return "IO_Write";
    default:
      return "Unknown";
  }
}

struct CacheBlockEntry {
  friend CacheEntryManager;
  using MutexT = non_recursive_mutex;
  // using MutexT = std::mutex;
  MutexT lock;

  // for cache
  bool with_key = true;
  CacheBlockKey hash = 0;
  CacheBlockKey hash_check = 0;
  CacheInfo cache_info;
  CacheEntryManager* manager = nullptr;

  // for memory pool
  void* data = nullptr;
  size_t size = 0;
  ConcurrentControlUnit cpu_cc;

  // for disk
  size_t layer = -1;
  size_t idx = -1;

  // for gpu
  std::optional<size_t> gpu_block_idx = std::nullopt;
  ConcurrentControlUnit gpu_cc;

  CacheBlockEntry() = default;
  CacheBlockEntry(const CacheBlockEntry& other) = delete;
  CacheBlockEntry& operator=(const CacheBlockEntry& other) = delete;
  CacheBlockEntry(CacheBlockEntry&& other) = delete;
  CacheBlockEntry& operator=(CacheBlockEntry&& other) = delete;
  ~CacheBlockEntry();

 private:
  bool alloc_on_cpu();

 public:
  void free_on_cpu();
  bool alloc_on_cpu_no_lock();
  bool inc_ref_or_alloc_on_cpu();
  void set_key(TokensHash key, std::shared_ptr<CacheBlockEntry> me);

  std::unique_lock<MutexT> try_lock();
  std::lock_guard<MutexT> lock_guard();

  // will not get lock
  void io_with(async_store::IODealer* dealer, IO_Helper<CacheBlockEntry>& io_helper, async_store::ArrayStore* store,
               size_t layer, size_t index, IOOption option);
  void flush_back_async(IO_Helper<CacheBlockEntry>& helper, std::vector<std::atomic_bool*>& dirty_flags);

  void debug();
};

struct CacheBlockEntryCollector {
  std::vector<CacheBlockEntry*> entries;
  std::function<void(CacheBlockEntry*)> exit_fn;

  CacheBlockEntryCollector(std::function<void(CacheBlockEntry*)> exit_fn);
  ~CacheBlockEntryCollector();
  CacheBlockEntryCollector(const CacheBlockEntryCollector& other) = delete;
  CacheBlockEntryCollector(CacheBlockEntryCollector&& other) = delete;
  CacheBlockEntryCollector& operator=(const CacheBlockEntryCollector& other) = delete;
  CacheBlockEntryCollector& operator=(CacheBlockEntryCollector&& other) = delete;
};

struct KVC2;

struct CacheEntryManagerConfig {
  size_t evict_count = 100;
  KVC2* kvc2_top = nullptr;
};

class CacheEntryManager {
 public:
  using Key = CacheBlockKey;
  using BlockPtr = std::shared_ptr<CacheBlockEntry>;

 private:
  friend CacheBlockEntry;
  CacheEntryManagerConfig config;

  std::mutex lock;
  std::list<BlockPtr> usage_list;
  std::unordered_map<Key, std::list<BlockPtr>::iterator> key_entry_map;

  void insert(BlockPtr entry);
  BlockPtr access(const Key& key);
  // void remove(const Key& key);
  void evict(std::function<bool(const BlockPtr&)> filter, std::function<bool()> stop_condition);

 public:
  std::unique_ptr<periodic::PeriodicTask> background_flush_back = nullptr;
  std::shared_ptr<PageAlignedMemoryPool> pool;
  std::shared_ptr<GPUPageCache> gpu_cache;

  CacheEntryManager(CacheEntryManagerConfig config);
  // disable all move and copy
  CacheEntryManager(const CacheEntryManager& other) = delete;
  CacheEntryManager& operator=(const CacheEntryManager& other) = delete;
  CacheEntryManager(CacheEntryManager&& other) = delete;
  CacheEntryManager& operator=(CacheEntryManager&& other) = delete;

  void cpu_background_flush();
  void evict_for_cpu_cache();

  // just get block pointers, not allocate them, will not return nullptr
  BlockPtr get(bool& is_new, size_t size, std::optional<Key> key = std::nullopt);

  void debug();
};

}  // namespace kvc2
#endif
csrc/balance_serve/kvc2/src/common.h
0 → 100644
csrc/balance_serve/kvc2/src/cuda_stream_manager.cpp
0 → 100644
#include "cuda_stream_manager.hh"
#include <cuda_runtime.h>
#include <functional>
#include <iostream>
#include <stdexcept>
#include <vector>

#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO
// #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"

CudaStreamManager::CudaStreamManager(const std::vector<size_t>& device_ids, int num_streams_per_device) {
  for (int device_id : device_ids) {
    auto x = std::unique_ptr<DeviceInfo>(new DeviceInfo);
    DeviceInfo& device_info = *x;
    device_info.device_id = device_id;
    device_info.next_stream_index = 0;
    device_info.stop_flag = false;

    // Select the device
    cudaError_t err = cudaSetDevice(device_id);
    if (err != cudaSuccess) {
      SPDLOG_WARN("cudaSetDevice failed on device {}: {}", device_id, cudaGetErrorString(err));
      throw std::runtime_error("cudaSetDevice failed");
    }

    // Create the CUDA streams
    device_info.streams.resize(num_streams_per_device);
    for (int i = 0; i < num_streams_per_device; ++i) {
      err = cudaStreamCreate(&device_info.streams[i]);
      if (err != cudaSuccess) {
        SPDLOG_WARN("Failed to create CUDA stream on device {}: {}", device_id, cudaGetErrorString(err));
        throw std::runtime_error("Failed to create CUDA stream");
      }
    }

    // Start the per-device worker thread
    device_info.worker_thread = std::thread(&CudaStreamManager::deviceWorker, this, std::ref(device_info));
    devices_.push_back(std::move(x));
  }
}

CudaStreamManager::~CudaStreamManager() {
  // Tell every device worker thread to stop
  for (auto& device_info : devices_) {
    device_info->stop_flag.store(true);
    auto request = std::shared_ptr<Request>(new Request);
    request->should_exit = true;
    device_info->request_queue.enqueue(std::move(request));
  }

  // Wait for all threads to finish
  for (auto& device_info : devices_) {
    if (device_info->worker_thread.joinable()) {
      device_info->worker_thread.join();
    }
    // Destroy the CUDA streams
    cudaSetDevice(device_info->device_id);
    for (auto& stream : device_info->streams) {
      cudaStreamDestroy(stream);
    }
  }
}

void CudaStreamManager::submitRequest(std::shared_ptr<Request> request) {
  // Find the matching device
  for (auto& device_info : devices_) {
    if (device_info->device_id == request->device_id) {
      device_info->request_queue.enqueue(request);
      return;
    }
  }
  throw std::runtime_error("Invalid device ID in request");
}

void CudaStreamManager::deviceWorker(DeviceInfo& device_info) {
  // Select the device
  cudaError_t err = cudaSetDevice(device_info.device_id);
  if (err != cudaSuccess) {
    SPDLOG_WARN("cudaSetDevice failed in worker thread for device {}: {}", device_info.device_id,
                cudaGetErrorString(err));
    return;
  }
  while (device_info.stop_flag.load() == false) {
    auto request = device_info.request_queue.dequeue();
    if (request->should_exit) {
      return;
    }
    // Handle the request
    SPDLOG_DEBUG("Getting request on device {}, count {}", device_info.device_id, request->host_mem_addresses.size());
    int stream_index = device_info.next_stream_index;
    cudaStream_t stream = device_info.streams[stream_index];
    device_info.next_stream_index = (device_info.next_stream_index + 1) % device_info.streams.size();

    size_t num_transfers = request->host_mem_addresses.size();
    for (size_t i = 0; i < num_transfers; ++i) {
      void* dst = request->device_mem_addresses[i];
      void* src = request->host_mem_addresses[i];
      if (request->direction == cudaMemcpyDeviceToHost) {
        std::swap(dst, src);
      }
      cudaError_t err = cudaMemcpyAsync(dst, src, request->sizes[i], request->direction, stream);
      if (err != cudaSuccess) {
        SPDLOG_WARN("cudaMemcpyAsync failed on device {}: {}", device_info.device_id, cudaGetErrorString(err));
        // Errors could be handled here as needed; for now simply continue
        continue;
      }
    }

    // Register the completion callback; since the copies are asynchronous, it has to be wrapped
    struct CallbackData {
      std::function<void()> callback;
    };
    CallbackData* cb_data = new CallbackData{request->callback};
    err = cudaLaunchHostFunc(
        stream,
        [](void* data) {
          // SPDLOG_DEBUG("Callback function called");
          CallbackData* cb_data = static_cast<CallbackData*>(data);
          cb_data->callback();
          delete cb_data;
        },
        cb_data);
    if (err != cudaSuccess) {
      SPDLOG_WARN("cudaLaunchHostFunc failed on device {}: {}", device_info.device_id, cudaGetErrorString(err));
      // Handle the error as needed
    }
  }
}
csrc/balance_serve/kvc2/src/cuda_stream_manager.hh
0 → 100644
/*
 * @Author: Xie Weiyu ervinxie@qq.com
 * @Date: 2024-11-19 09:24:47
 * @LastEditors: Xie Weiyu ervinxie@qq.com
 * @LastEditTime: 2024-11-20 02:55:49
 * @FilePath: /kvc2/src/cuda_stream_manager.hh
 * @Description: This is the default file header; set `customMade` and open koroFileHeader to adjust the configuration: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
 */
#pragma once
#include <cuda_runtime.h>

#include <atomic>
#include <functional>
#include <memory>
#include <thread>
#include <vector>

#include "utils/mpsc.hpp"

class CudaStreamManager {
 public:
  // Constructor: takes the list of device IDs to use and the number of streams per device
  CudaStreamManager(const std::vector<size_t>& device_ids, int num_streams_per_device);
  ~CudaStreamManager();

  // Request struct
  struct Request {
    bool should_exit = false;
    int device_id;
    std::vector<void*> host_mem_addresses;
    std::vector<void*> device_mem_addresses;
    std::vector<size_t> sizes;
    cudaMemcpyKind direction;
    std::function<void()> callback;
  };

  void submitRequest(std::shared_ptr<Request> request);

 private:
  // Per-device information
  struct DeviceInfo {
    int device_id;
    std::thread worker_thread;
    std::vector<cudaStream_t> streams;
    int next_stream_index;
    MPSCQueueConsumerLock<std::shared_ptr<Request>> request_queue;
    std::atomic_bool stop_flag;
  };

  // Mapping from device ID to DeviceInfo
  std::vector<std::unique_ptr<DeviceInfo>> devices_;

  // Private methods
  void deviceWorker(DeviceInfo& device_info);
};
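A minimal illustrative sketch of driving one copy through this interface, assuming a single CUDA device with ID 0, a pinned host buffer, and that "utils/mpsc.hpp" is on the include path; the callback fires from cudaLaunchHostFunc once the async copies on the chosen stream complete:

#include "cuda_stream_manager.hh"
#include <cuda_runtime.h>
#include <future>

int main() {
  // One worker thread and two streams on device 0.
  CudaStreamManager manager({0}, /*num_streams_per_device=*/2);

  const size_t bytes = 1 << 20;
  void* host = nullptr;
  void* device = nullptr;
  cudaMallocHost(&host, bytes);  // pinned host buffer, required for true async copies
  cudaMalloc(&device, bytes);

  auto request = std::make_shared<CudaStreamManager::Request>();
  request->device_id = 0;
  request->direction = cudaMemcpyHostToDevice;
  request->host_mem_addresses = {host};
  request->device_mem_addresses = {device};
  request->sizes = {bytes};

  std::promise<void> done;
  request->callback = [&] { done.set_value(); };  // invoked via cudaLaunchHostFunc

  manager.submitRequest(request);
  done.get_future().wait();  // block until the copy and callback have run

  cudaFree(device);
  cudaFreeHost(host);
  return 0;
}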
csrc/balance_serve/kvc2/src/defs.h
0 → 100644
#ifndef __DEFS_H_
#define __DEFS_H_

#include <cstdint>
#include <optional>
#include <vector>

#include "model_config.h"

namespace kvc2 {
using kvc2_ptr = void*;
// using data_block_ptr = std::intptr_t;
using data_block_ptr = void*;
using layer_data = std::vector<data_block_ptr>;
using kvc2_handle = void*;

using Token = uint32_t;
using Tokens = std::vector<Token>;
using TokenPtr = std::intptr_t;
using TokenLength = size_t;
using BlockLength = size_t;

struct CacheInfo {
  ModelName model_name;
  bool is_key_cache;
  QuantType quant_type;

  size_t hidden_layer_count();
  std::filesystem::path path(std::optional<size_t> which_layer = std::nullopt);
  bool operator==(const CacheInfo& other) const;
  size_t element_size(size_t block_length);
  size_t hash_value() const;
};

};  // namespace kvc2
#endif
csrc/balance_serve/kvc2/src/gpu_cache.cpp
0 → 100644
#include "gpu_cache.hh"
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"

#include "cache_entry.hh"
#include "utils/arithmetic.hpp"

namespace kvc2 {

GPUPageCache::GPUPageCache(GPUPageCacheConfig& config) : config(config) {
  if (torch::cuda::is_available()) {
    size_t gpu_count = torch::cuda::device_count();
    SPDLOG_INFO("Number of available GPUs: {}, want {}", gpu_count, config.gpu_devices_id.size());
    if (gpu_count < config.gpu_devices_id.size()) {
      SPDLOG_ERROR("Not enough GPUs available.");
      exit(0);
    }
    for (auto x : config.gpu_devices_id) {
      gpu_devices.push_back(torch::Device(torch::kCUDA, x));
    }
  } else {
    SPDLOG_ERROR("CUDA is not available on this system.");
    exit(0);
  }

  SPDLOG_WARN("Creating GPU Cache");
  shape.push_back(config.layer_count);
  shape.push_back(config.total_kvcache_pages);
  shape.push_back(config.num_token_per_page);
  if (config.full_kv_cache_on_each_gpu) {
    if (config.gpu_devices_id.size() > 1) {
      SPDLOG_WARN("Replicated KVCache on multiple gpu");
    }
    shape.push_back(config.num_k_heads);
  } else {
    shape.push_back(config.num_k_heads / config.gpu_devices_id.size());
  }
  shape.push_back(config.k_head_dim);
  tensor_size = torch::elementSize(config.tensor_type);
  for (auto& s : shape) {
    tensor_size *= s;
  }
  SPDLOG_INFO("Creating KV Page Cache, Shape ({},{},{},{},{}), Size {} MiB", shape[0], shape[1], shape[2], shape[3],
              shape[4], tensor_size / (1 << 20));

  if (config.k_cache_on) {
    for (size_t i = 0; i < config.gpu_devices_id.size(); i++) {
      auto k = torch::zeros(shape, torch::TensorOptions().dtype(config.tensor_type));
      k = k.to(gpu_devices[i]);
      k_cache.push_back(k);
      SPDLOG_INFO("K Page Cache of GPU {} is created", config.gpu_devices_id[i]);
    }
    occupations.resize(config.layer_count);
  } else {
    SPDLOG_WARN("Disable K Cache");
    assert(config.gpu_only);
  }

  if (config.v_cache_on) {
    for (size_t i = 0; i < config.gpu_devices_id.size(); i++) {
      auto v = torch::zeros(shape, torch::TensorOptions().dtype(config.tensor_type));
      v = v.to(gpu_devices[i]);
      v_cache.push_back(v);
      SPDLOG_INFO("V Page Cache of GPU {} is created", config.gpu_devices_id[i]);
    }
    v_occupations.resize(config.layer_count);
  } else {
    SPDLOG_WARN("Disable V Cache");
    // assert(config.gpu_only); // should not assert
  }

  if (config.gpu_only) {
    gpu_only_occupations.resize(config.total_kvcache_pages, false);
  }

  num_free_pages = config.total_kvcache_pages;
  for (size_t i = 0; i < config.layer_count; i++) {
    if (config.k_cache_on)
      occupations[i].resize(config.total_kvcache_pages, nullptr);
    if (config.v_cache_on)
      v_occupations[i].resize(config.total_kvcache_pages, nullptr);
  }

  tp_size.resize(config.gpu_devices_id.size(), shape[2] * shape[3] * shape[4] * c10::elementSize(config.tensor_type));
  tp_offset.resize(config.gpu_devices_id.size(), 0);
  for (size_t i = 1; i < tp_offset.size(); i++) {
    tp_offset[i] = tp_offset[i - 1] + tp_size[i - 1];
  }

  stream_manager =
      std::unique_ptr<CudaStreamManager>(new CudaStreamManager(config.gpu_devices_id, config.num_streams_per_device));
}

bool GPUPageCache::alloc_col(std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_entries,
                             std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_entries, size_t at) {
  std::lock_guard<std::mutex> lg(lock);
  auto idx = next_empty_col();
  if (idx.has_value()) {
    // must have entry lock
    auto& k0_entry = k_entries[0][at];
    k0_entry->gpu_block_idx = idx;
    for (size_t l = 0; l < config.layer_count; l++) {
      if (config.k_cache_on) {
        assert(k_entries[l][at]->data != nullptr);
        occupations[l][idx.value()] = k_entries[l][at];
      }
      if (config.v_cache_on) {
        assert(v_entries[l][at]->data != nullptr);
        v_occupations[l][idx.value()] = v_entries[l][at];
      }
    }
    return true;
  } else {
    return false;
  }
}

std::vector<size_t> GPUPageCache::gpu_only_alloc_col(size_t count) {
  assert(config.gpu_only);
  std::lock_guard<std::mutex> lg(lock);
  std::vector<size_t> re;
  for (size_t i = 0; i < config.total_kvcache_pages; i++) {
    if (gpu_only_occupations[i] == false) {
      re.push_back(i);
      if (re.size() == count) {
        break;
      }
    }
  }
  if (re.size() == count) {
    for (auto at : re) {
      gpu_only_occupations[at] = true;
    }
  } else {
    SPDLOG_WARN("GPU ONLY: Cannot allocate {} cols", count);
    re.clear();
  }
  return re;
}

void GPUPageCache::gpu_only_free_cols(std::vector<size_t> cols) {
  assert(config.gpu_only);
  std::lock_guard<std::mutex> lg(lock);
  for (auto at : cols) {
    assert(gpu_only_occupations[at]);
    gpu_only_occupations[at] = false;
  }
}

std::optional<size_t> GPUPageCache::next_empty_col() {
  if (num_free_pages == 0) {
    evict_cols();
    if (num_free_pages == 0) {
      return std::nullopt;
    }
  }
  while (occupations[0][_col_idx] != nullptr) {
    _col_idx = (_col_idx + 1) % config.total_kvcache_pages;
  }
  num_free_pages -= 1;
  return _col_idx;
}

void GPUPageCache::evict_cols() {
  auto evicted_count = 0;
  for (size_t i = 0; i < config.total_kvcache_pages; i++) {
    auto& h = occupations[0][i];
    if (h == nullptr) {
      continue;
    }
    auto lg = h->lock_guard();
    if (h->gpu_cc.can_desert()) {
      h->gpu_cc.tc.reset();
      h = nullptr;
      num_free_pages += 1;
      evicted_count += 1;
    }
  }
  if (evicted_count > 0)
    SPDLOG_INFO("GPU: Evicted {} GPU pages", evicted_count);
}

std::vector<std::unique_lock<CacheBlockEntry::MutexT>> GPUPageCache::try_lock_col(size_t at) {
  std::vector<std::unique_lock<CacheBlockEntry::MutexT>> re;
  if (config.k_cache_on) {
    for (size_t l = 0; l < config.layer_count; l++) {
      if (occupations[l][at] == nullptr) {
        return {};
      }
      auto ul = occupations[l][at]->try_lock();
      if (ul.owns_lock()) {
        re.push_back(std::move(ul));
      } else {
        return {};
      }
    }
  }
  if (config.v_cache_on) {
    for (size_t l = 0; l < config.layer_count; l++) {
      if (v_occupations[l][at] == nullptr) {
        return {};
      }
      auto ul = v_occupations[l][at]->try_lock();
      if (ul.owns_lock()) {
        re.push_back(std::move(ul));
      } else {
        return {};
      }
    }
  }
  return re;
}

std::vector<std::shared_ptr<CudaStreamManager::Request>> GPUPageCache::basic_request(cudaMemcpyKind direction,
                                                                                     std::function<void()> callback) {
  std::vector<std::shared_ptr<CudaStreamManager::Request>> re;
  re.resize(config.gpu_devices_id.size(), nullptr);
  for (size_t i = 0; i < re.size(); i++) {
    re[i] = std::shared_ptr<CudaStreamManager::Request>(new CudaStreamManager::Request);
    re[i]->direction = direction;
    re[i]->device_id = config.gpu_devices_id[i];
    re[i]->callback = callback;
  }
  return re;
}

void GPUPageCache::submit_requests(std::vector<std::shared_ptr<CudaStreamManager::Request>> reqs) {
  for (auto& r : reqs) {
    stream_manager->submitRequest(r);
  }
}

void GPUPageCache::append_col_to_request(std::vector<std::shared_ptr<CudaStreamManager::Request>>& reqs,
                                         std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_handles,
                                         std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_handles,
                                         size_t at) {
  if (config.k_cache_on == false && config.v_cache_on == false) {
    return;
  }
  auto gpu_block_idx = k_handles[0][at]->gpu_block_idx.value();
  for (size_t layer = 0; layer < config.layer_count; layer++) {
    for (size_t which_gpu = 0; which_gpu < config.gpu_devices_id.size(); which_gpu++) {
      if (config.k_cache_on) {
        assert(k_handles[layer][at]->data != nullptr);
        reqs[which_gpu]->sizes.push_back(tp_size[which_gpu]);
        reqs[which_gpu]->host_mem_addresses.push_back(
            offset_by_bytes(k_handles[layer][at]->data, tp_offset[which_gpu]));
        reqs[which_gpu]->device_mem_addresses.push_back(k_cache[which_gpu][layer][gpu_block_idx].data_ptr());
      }
      if (config.v_cache_on) {
        assert(v_handles[layer][at]->data != nullptr);
        reqs[which_gpu]->sizes.push_back(tp_size[which_gpu]);
        reqs[which_gpu]->host_mem_addresses.push_back(
            offset_by_bytes(v_handles[layer][at]->data, tp_offset[which_gpu]));
        reqs[which_gpu]->device_mem_addresses.push_back(v_cache[which_gpu][layer][gpu_block_idx].data_ptr());
      }
    }
  }
  // SPDLOG_DEBUG("GPU: Appended Vertical Handle to Request, count {}", reqs[0]->sizes.size());
}

void GPUPageCache::debug() {
  size_t count = 0;
  for (size_t i = 0; i < config.total_kvcache_pages; i++) {
    if (occupations[0][i] == nullptr) {
      count += 1;
    } else {
      // occupations[0][i]->gpu_cc.debug();
    }
  }
  SPDLOG_DEBUG("Free Page: {}/{}", count, config.total_kvcache_pages);
}

}  // namespace kvc2
csrc/balance_serve/kvc2/src/gpu_cache.hh
0 → 100644
#ifndef __GPU_CACHE_HH_
#define __GPU_CACHE_HH_

#include <torch/torch.h>
#include "cache_entry.hh"
#include "cuda_stream_manager.hh"
#include "defs.h"
#include "kvc2.h"
#include "metrics.h"
#include "utils/periodic_task.hpp"

namespace kvc2 {

class GPUPageCache {
  std::vector<torch::Device> gpu_devices;
  std::vector<int64_t> shape;
  size_t tensor_size;
  std::vector<size_t> tp_offset;
  std::vector<size_t> tp_size;

  // met
  std::shared_ptr<Metrics> met;

  // states
  std::mutex lock;
  size_t num_free_pages;
  std::vector<bool> gpu_only_occupations;
  std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>> occupations, v_occupations;
  size_t _col_idx = 0;

  // cuda stream manager
  std::optional<size_t> next_empty_col();

 public:
  GPUPageCacheConfig config;
  std::unique_ptr<CudaStreamManager> stream_manager;
  std::vector<torch::Tensor> k_cache;
  std::vector<torch::Tensor> v_cache;
  std::unique_ptr<periodic::PeriodicTask> background_flush_back = nullptr;

  GPUPageCache(GPUPageCacheConfig& config);

  std::vector<size_t> gpu_only_alloc_col(size_t count);
  void gpu_only_free_cols(std::vector<size_t> cols);

  void gpu_background_flush();

  bool alloc_col(std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_entries,
                 std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_entries, size_t at);
  void evict_cols();
  void flush_col(size_t at);
  std::vector<std::unique_lock<CacheBlockEntry::MutexT>> try_lock_col(size_t at);
  void free_col(size_t at);

  std::vector<std::shared_ptr<CudaStreamManager::Request>> basic_request(cudaMemcpyKind direction,
                                                                         std::function<void()> callback);
  void submit_requests(std::vector<std::shared_ptr<CudaStreamManager::Request>> reqs);
  void append_col_to_request(std::vector<std::shared_ptr<CudaStreamManager::Request>>& reqs,
                             std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_handles,
                             std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_handles, size_t at);

  void debug();
};

}  // namespace kvc2
#endif
csrc/balance_serve/kvc2/src/hasher.hpp
0 → 100644
#ifndef __HASHER_HPP_
#define __HASHER_HPP_

#include "defs.h"
#include "xxhash.h"

namespace kvc2 {
const uint64_t hash_seed = 4123512;
const uint64_t check_hash_seed = 1025753;

using TokensHash = XXH64_hash_t;

struct TokensHasher {
  XXH64_state_t* state;
  TokensHasher() {
    state = XXH64_createState();
    reset();
  }
  ~TokensHasher() { XXH64_freeState(state); }

  TokensHasher(TokensHasher& other) = delete;
  TokensHasher& operator=(TokensHasher& other) = delete;
  TokensHasher(TokensHasher&& other) = delete;
  TokensHasher& operator=(TokensHasher&& other) = delete;

  TokensHash get() { return XXH64_digest(state); }

  void reset(size_t seed = hash_seed) { XXH64_reset(state, seed); }

  TokensHash update(Token* data, TokenLength length) {
    XXH64_update(state, data, length * sizeof(Token));
    return get();
  }

  TokensHash update_raw(void* data, size_t size) {
    XXH64_update(state, data, size);
    return get();
  }

  static TokensHash hash(Token* data, TokenLength length) { return XXH64(data, length * sizeof(Token), hash_seed); }
};

}  // namespace kvc2
#endif
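A small illustrative sketch of the hasher: xxhash's streaming update() yields the same digest as the one-shot static hash() over the same bytes, which is what allows prefix hashes to be extended block by block.

#include "hasher.hpp"
#include <cassert>
#include <vector>

int main() {
  using namespace kvc2;
  std::vector<Token> tokens = {1, 2, 3, 4, 5, 6, 7, 8};

  // One-shot hash of the whole token array.
  TokensHash whole = TokensHasher::hash(tokens.data(), tokens.size());

  // Incremental hashing: feed the array in two blocks; update() returns the
  // digest of everything fed so far, i.e. a running prefix hash.
  TokensHasher hasher;
  hasher.update(tokens.data(), 4);                               // hash of tokens[0..3]
  TokensHash incremental = hasher.update(tokens.data() + 4, 4);  // hash of tokens[0..7]

  assert(whole == incremental);
  return 0;
}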
csrc/balance_serve/kvc2/src/io_helper.hpp
0 → 100644
/**
 * @Description  :
 * @Author       : Xie Weiyu
 * @Date         : 2024-12-11 06:35:31
 * @Version      : 1.0.0
 * @LastEditors  : Xie Weiyu
 * @LastEditTime : 2024-12-11 06:50:55
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#pragma once
#include <atomic>
#include <future>
#include <iostream>
#include <mutex>
#include <optional>
#include <string>
#include <vector>

struct BatchPromise {
  std::promise<void> promise;
  std::shared_future<void> fut;
  std::atomic_size_t count;

  inline BatchPromise(size_t count) : count(count) { fut = promise.get_future().share(); }

  inline void inc(size_t count = 1) { this->count.fetch_add(count, std::memory_order_seq_cst); }

  inline void set() {
    if (count.fetch_sub(1, std::memory_order_seq_cst) == 1) {
      promise.set_value();
    }
  }

  inline std::shared_future<void> get_shared_fut() { return fut; }
};

template <typename Lock>
struct TransferControl {
  Lock lock;
  std::optional<std::shared_future<void>> transfer_ok = std::nullopt;
  bool has_data = false;

  TransferControl() {}

  /*
   true,  std::nullopt  : Already has data
   false, shared_future : Transfer already started, should wait for the future
   false, std::nullopt  : should transfer by you
   true,  shared_future : Should not appear
  */
  std::pair<bool, std::optional<std::shared_future<void>>> has_data_or_transfer(std::shared_future<void> shared_fut) {
    std::lock_guard<Lock> lg(lock);
    if (has_data) {
      return {true, std::nullopt};
    } else {
      if (transfer_ok.has_value()) {
        return {false, transfer_ok};
      } else {
        transfer_ok = shared_fut;
        return {false, std::nullopt};
      }
    }
  }

  void set_has_data() {
    std::lock_guard<Lock> lg(lock);
    has_data = true;
    transfer_ok = std::nullopt;
  }

  bool get_has_data() {
    std::lock_guard<Lock> lg(lock);
    if (has_data) {
      return true;
    } else {
      return false;
    }
  }

  void reset() {
    std::lock_guard<Lock> lg(lock);
    transfer_ok = std::nullopt;
    has_data = false;
  }

  std::string debug() {
    std::lock_guard<Lock> lg(lock);
    return std::string("") + (has_data ? "has data" : "no data") + " " +
           (transfer_ok.has_value() ? "transfer " : "no transfer");
  }
};

struct ConcurrentController {
  std::atomic_bool dirty = false;
  std::atomic_size_t ref_count = 0;
  TransferControl<std::mutex> tc;
};

template <typename Unit>
struct IO_Helper {
  BatchPromise batch_promise;
  std::function<void(Unit*)> call_back_on_unit = nullptr;
  std::function<void()> call_back = nullptr;
  std::vector<std::shared_future<void>> futs;
  std::vector<Unit*> units_by_myself;

  IO_Helper(std::function<void(Unit*)> call_back_on_unit, std::function<void()> call_back = nullptr)
      : batch_promise(1), call_back_on_unit(call_back_on_unit), call_back(call_back) {}

  IO_Helper(const IO_Helper& other) = delete;
  IO_Helper& operator=(const IO_Helper& other) = delete;
  IO_Helper(IO_Helper&& other) = delete;
  IO_Helper& operator=(IO_Helper&& other) = delete;

  ~IO_Helper() {
    // std::cout << "Destroy IO helper" << std::endl;
  }

  size_t total_task_count = 0;

  void new_task(size_t count = 1) {
    total_task_count += 1;
    batch_promise.inc(count);
  }

  void finish_add_taks() { batch_promise.set(); }

  bool absorb_tc(Unit* unit, TransferControl<std::mutex>& tc) {
    auto [ok, fut] = tc.has_data_or_transfer(batch_promise.get_shared_fut());
    if (ok) {
      return false;
    } else {
      if (fut.has_value()) {
        futs.push_back(fut.value());
        // printf("Transfer started\n");
        return false;
      } else {
        units_by_myself.push_back(unit);
        // printf("Not Transfer\n");
        return true;
      }
    }
  }

  void wait() {
    for (auto& fut : futs) {
      fut.wait();
    }
    batch_promise.get_shared_fut().wait();
    for (auto& b : units_by_myself) {
      call_back_on_unit(b);
    }
    if (call_back)
      call_back();
  }
};
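An illustrative sketch of how TransferControl and IO_Helper are meant to cooperate, under the assumption that whoever owns a transfer signals batch_promise once per task it created: absorb_tc() returns true only for units this helper must transfer itself; units that already hold data are skipped, and units being transferred by another helper are waited on through the recorded shared future. The Block type below is a stand-in for CacheBlockEntry.

#include "io_helper.hpp"
#include <cstdio>

struct Block {
  int id = 0;
  TransferControl<std::mutex> tc;
};

int main() {
  Block a, b;
  b.id = 1;
  a.tc.set_has_data();  // pretend block 0 is already resident

  // Units this helper ends up owning are marked "has data" once the batch finishes.
  IO_Helper<Block> helper([](Block* blk) { blk->tc.set_has_data(); });

  helper.new_task();                          // one pending I/O task in this batch
  bool need_a = helper.absorb_tc(&a, a.tc);   // false: data already present
  bool need_b = helper.absorb_tc(&b, b.tc);   // true: this helper must transfer it
  std::printf("transfer a: %d, transfer b: %d\n", need_a, need_b);

  // ... the actual I/O for the units absorb_tc() returned true for would run here ...
  helper.batch_promise.set();   // each completed task signals the batch promise once
  helper.finish_add_taks();     // drop the constructor's initial count
  helper.wait();                // waits for foreign transfers, then marks owned units
  return 0;
}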
csrc/balance_serve/kvc2/src/kvc2.h
0 → 100644
#pragma once
#include <torch/torch.h>
#include <cstdint>
#include <optional>
#include <vector>
#include "defs.h"
#include "model_config.h"

namespace kvc2 {

struct GPUPageCacheConfig {
  bool gpu_only;
  std::vector<size_t> gpu_devices_id;
  size_t layer_count;
  size_t total_kvcache_pages;
  size_t num_token_per_page;
  size_t num_k_heads;
  size_t k_head_dim;
  bool full_kv_cache_on_each_gpu = false;
  bool k_cache_on = true;
  bool v_cache_on = true;
  torch::ScalarType tensor_type;

  // for cuda stream manager
  size_t num_streams_per_device = 4;
};

struct KVC2Config {
  bool k_cache_on = true;
  bool v_cache_on = true;
  bool gpu_only = false;
  bool load_from_disk = true;
  bool save_to_disk = true;
  std::string path;
  std::string config_path;
  TokenLength num_token_per_page = 256;
  size_t memory_pool_size = 10e9;
  size_t evict_count = 20;
  std::optional<GPUPageCacheConfig> gpu_cache_config = std::nullopt;
  size_t metrics_port;
  double recompute_ratio = 0.2;
};

class DoubleCacheHandleInterface;

class KVC2Interface {
 public:
  virtual ~KVC2Interface() = default;

  virtual void load() = 0;
  virtual void save() = 0;

  /*
  Raw Insert
  Insert kvcache from kvcache_data to disk.
    info: cache info
    id: start pointer of token array
    length: length of token array
    kvcache_data: data of kvcache
  This will first match the ID array against the existing kvcache, and then insert the unmatched kvcache to disk.
  */
  virtual void raw_insert(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                          const std::vector<layer_data>& k_cache, const std::vector<layer_data>& v_cache) = 0;

  /*
  Raw Read
  Read kvcache from disk to user specified pointers.
    info: cache info
    id: start pointer of token array
    length: length of token array
    kvcache_data: data of kvcache
    Return: matched length of prefix, in tokens
  This does not read from the memory pool; it reads directly from disk.
  */
  virtual TokenLength raw_read(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                               const std::vector<layer_data>& k_cache, const std::vector<layer_data>& v_cache) = 0;

  /*
  Lookup
  Lookup kvcache and load it from disk to memory pool if needed.
    info: cache info
    id: start pointer of token array
    length: length of token array
    Return: kvc2_handle, holds kvcache until being released.
      if not found, matched_length will return 0.
      if memory pool is full, return nullptr
  */
  virtual std::shared_ptr<DoubleCacheHandleInterface> lookup(ModelName model_name, QuantType quant_type, Token* id,
                                                             TokenLength length, TokenLength estimated_length) = 0;

  /*
  Lookup and allocate to gpu
  info.is_k_cache does not matter here
  */
  virtual std::shared_ptr<DoubleCacheHandleInterface> lookup_to_gpu(ModelName model_name, QuantType quant_type,
                                                                    Token* id, TokenLength length,
                                                                    TokenLength estimated_length) = 0;

  virtual void lookup_to_gpu_async(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                                   TokenLength estimated_length,
                                   std::function<void(std::shared_ptr<DoubleCacheHandleInterface>)> call_back) = 0;

  virtual std::pair<std::vector<torch::Tensor>, std::vector<torch::Tensor>> get_kvcache() = 0;

  virtual void debug() = 0;
};

std::shared_ptr<KVC2Interface> create_kvc2(KVC2Config config);

enum MatchStatus {
  Exact,
  Partial,
  NotMatchExact,
  NotMatchPartial,
};

class DoubleCacheHandleInterface {
 public:
  virtual ~DoubleCacheHandleInterface() = default;
  virtual TokenLength matched_length() = 0;
  virtual std::vector<MatchStatus> matched_status() = 0;
  virtual std::vector<layer_data> handle_data(bool is_key_cache) = 0;
  virtual bool to_gpu() = 0;
  virtual void to_gpu_async(std::function<void(bool)> call_back) = 0;
  virtual std::vector<size_t> get_gpu_block_idx() = 0;
  virtual std::vector<size_t> get_gpu_attached_block_idx() = 0;
  virtual void append_tokens(Token* tokens, TokenLength length) = 0;  // update generated tokens

  virtual void debug() = 0;
};

};  // namespace kvc2
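A hedged usage sketch of the lookup path through this interface. The store path, config path, port, and the "some-model" / "fp16" names are placeholders; a real setup needs them to match the model and quant config maps loaded from config_path, and needs gpu_cache_config set if GPU pages are wanted.

#include "kvc2.h"
#include <vector>

int main() {
  using namespace kvc2;

  KVC2Config config;
  config.path = "/tmp/kvc2_store";         // placeholder on-disk location
  config.config_path = "/tmp/kvc2_configs";  // placeholder; holds model/quant JSON
  config.metrics_port = 8080;              // no default, so set explicitly

  auto kvc2 = create_kvc2(config);
  kvc2->load();  // restore persisted cache metadata, if any

  // Look up a prompt's KV cache; estimated_length reserves room for generation.
  std::vector<Token> prompt = {1, 2, 3, 4};
  auto handle = kvc2->lookup("some-model", "fp16", prompt.data(), prompt.size(),
                             /*estimated_length=*/prompt.size() + 128);
  if (handle) {
    TokenLength reused = handle->matched_length();  // cached prefix length, in tokens
    (void)reused;
    // The handle pins the matched blocks in the memory pool until it is released.
  }

  kvc2->save();
  return 0;
}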
csrc/balance_serve/kvc2/src/kvc2_utils.py
0 → 100644
import torch
import ctypes


def aligned_tensor(size, alignment=4096):
    num_bytes = size
    mem = ctypes.c_void_p()

    error_code = ctypes.CDLL(None).posix_memalign(
        ctypes.byref(mem), ctypes.c_size_t(alignment), ctypes.c_size_t(num_bytes)
    )

    if error_code != 0:
        raise MemoryError(f"posix_memalign failed with error code {error_code}")

    array_type = ctypes.c_int8 * size
    raw_array = array_type.from_address(mem.value)

    tensor = torch.frombuffer(raw_array, dtype=torch.int8)

    if tensor.data_ptr() % alignment != 0:
        raise ValueError(f"Tensor data_ptr {tensor.data_ptr()} is not aligned to {alignment} bytes")

    return tensor, mem


def alloc_aligned_cache(layer_count, block_count, element_size):
    cache = []
    cache_mem = []
    for i in range(layer_count):
        layer_data = []
        layer_mem = []
        for j in range(block_count):
            tensor, mem_ptr = aligned_tensor(element_size, alignment=4096)
            layer_data.append(tensor)
            layer_mem.append(mem_ptr)
        cache.append(layer_data)
        cache_mem.append(layer_mem)
    return cache, cache_mem


def dealloc_aligned_cache(cache_mem):
    for layer_mem in cache_mem:
        for mem_ptr in layer_mem:
            ctypes.CDLL(None).free(mem_ptr)


def get_tensor_ptr(tensors):
    tensor_ptr = []
    for layer in tensors:
        layer_ptr = []
        for data in layer:
            layer_ptr.append(data.data_ptr())
        tensor_ptr.append(layer_ptr)
    return tensor_ptr


def get_tensor_from_data_ptr(matched_data, element_size):
    re = []
    for layer in matched_data:
        re_layer = []
        for data_ptr in layer:
            array_type = ctypes.c_int8 * element_size
            raw_array = array_type.from_address(data_ptr)
            tensor = torch.frombuffer(raw_array, dtype=torch.int8)
            re_layer.append(tensor)
        re.append(re_layer)
    return re


if __name__ == "__main__":
    pass
csrc/balance_serve/kvc2/src/metrics.cpp
0 → 100644
#include "metrics.h"

namespace kvc2 {

Metrics::Metrics(const MetricsConfig& config)
    : registry_(std::make_shared<prometheus::Registry>()), exposer_(config.endpoint) {
  // Register the prefix_nodes Counter
  auto& prefix_nodes_family = prometheus::BuildCounter()
                                  .Name(std::string(METRIC_PREFIX) + "_prefix_nodes")
                                  .Help("Number of prefix nodes")
                                  .Register(*registry_);
  prefix_nodes = &prefix_nodes_family.Add({});

  // Register the prefix_block_count Counter
  auto& prefix_block_count_family = prometheus::BuildCounter()
                                        .Name(std::string(METRIC_PREFIX) + "_prefix_block_count")
                                        .Help("Number of prefix blocks")
                                        .Register(*registry_);
  prefix_block_count = &prefix_block_count_family.Add({});

  // Shared bucket layout, capped at 10000 ms (10 s)
  std::vector<double> common_buckets = {1.0, 5.0, 10.0, 50.0, 100.0, 500.0, 1000.0, 5000.0, 10000.0};

  // Register the raw_insert_time_ms Histogram
  auto& raw_insert_time_ms_family = prometheus::BuildHistogram()
                                        .Name(std::string(METRIC_PREFIX) + "_raw_insert_time_ms")
                                        .Help("function raw insert's time in milliseconds")
                                        .Register(*registry_);
  raw_insert_time_ms = &raw_insert_time_ms_family.Add({}, common_buckets);

  // Register the lookup_time_ms Histogram
  auto& lookup_time_ms_family = prometheus::BuildHistogram()
                                    .Name(std::string(METRIC_PREFIX) + "_lookup_time_ms")
                                    .Help("function lookup's time in milliseconds")
                                    .Register(*registry_);
  lookup_time_ms = &lookup_time_ms_family.Add({}, common_buckets);

  // Register the lookup_prefixmatch_length Histogram
  auto& lookup_prefixmatch_length_family = prometheus::BuildHistogram()
                                               .Name(std::string(METRIC_PREFIX) + "_lookup_prefixmatch_length")
                                               .Help("function lookup's prefix match length")
                                               .Register(*registry_);
  lookup_prefixmatch_length = &lookup_prefixmatch_length_family.Add({}, common_buckets);

  // Register the matched_length_percentage Histogram
  auto& matched_length_percentage_family = prometheus::BuildHistogram()
                                               .Name(std::string(METRIC_PREFIX) + "_matched_length_percentage")
                                               .Help("function matched length percentage")
                                               .Register(*registry_);
  matched_length_percentage = &matched_length_percentage_family.Add({}, common_buckets);

  // Register the disk_usage Gauge
  auto& disk_usage_family =
      prometheus::BuildGauge().Name(std::string(METRIC_PREFIX) + "_disk_usage").Help("disk usage").Register(*registry_);
  disk_usage = &disk_usage_family.Add({});

  // Register the memory_pool_size Gauge
  memory_pool_size_family_ = &prometheus::BuildGauge()
                                  .Name(std::string(METRIC_PREFIX) + "_memory_pool_size")
                                  .Help("memory pool size")
                                  .Register(*registry_);

  // Register the memory_pool_node_count Gauge
  memory_pool_node_count_family_ = &prometheus::BuildGauge()
                                        .Name(std::string(METRIC_PREFIX) + "_memory_pool_node_count")
                                        .Help("memory pool node count")
                                        .Register(*registry_);

  // Register the lru_entry_count Gauge
  lru_entry_count_family_ = &prometheus::BuildGauge()
                                 .Name(std::string(METRIC_PREFIX) + "_lru_entry_count")
                                 .Help("lru entry count")
                                 .Register(*registry_);

  // Register the gpu_page_count Gauge
  gpu_page_count_family_ = &prometheus::BuildGauge()
                                .Name(std::string(METRIC_PREFIX) + "_gpu_page_count")
                                .Help("gpu page count")
                                .Register(*registry_);

  // Register the append_tokens_time_ms Histogram
  auto& append_tokens_time_ms_family = prometheus::BuildHistogram()
                                           .Name(std::string(METRIC_PREFIX) + "_append_tokens_time_ms")
                                           .Help("append tokens time in milliseconds")
                                           .Register(*registry_);
  append_tokens_time_ms = &append_tokens_time_ms_family.Add({}, common_buckets);

  // Register the gpu_flush_back_time_ms Histogram
  auto& gpu_flush_back_time_ms_family = prometheus::BuildHistogram()
                                            .Name(std::string(METRIC_PREFIX) + "_gpu_flush_back_time_ms")
                                            .Help("gpu flush back time in milliseconds")
                                            .Register(*registry_);
  gpu_flush_back_time_ms = &gpu_flush_back_time_ms_family.Add({}, common_buckets);

  // Register the cpu_flush_back_time_ms Histogram
  auto& cpu_flush_back_time_ms_family = prometheus::BuildHistogram()
                                            .Name(std::string(METRIC_PREFIX) + "_cpu_flush_back_time_ms")
                                            .Help("cpu flush back time in milliseconds")
                                            .Register(*registry_);
  cpu_flush_back_time_ms = &cpu_flush_back_time_ms_family.Add({}, common_buckets);

  exposer_.RegisterCollectable(registry_);
}

// Destructor
Metrics::~Metrics() {
  // Stop exposing metrics
  // exposer_.Stop();
}

// Get the memory_pool_size gauge
prometheus::Gauge* Metrics::memory_pool_size(const std::string& type) {
  return &memory_pool_size_family_->Add({{"type", type}});
}

// Get the memory_pool_node_count gauge
prometheus::Gauge* Metrics::memory_pool_node_count(const std::string& type) {
  return &memory_pool_node_count_family_->Add({{"type", type}});
}

// Get the lru_entry_count gauge
prometheus::Gauge* Metrics::lru_entry_count(const std::string& type) {
  return &lru_entry_count_family_->Add({{"type", type}});
}

// Get the gpu_page_count gauge
prometheus::Gauge* Metrics::gpu_page_count(std::string type) {
  return &gpu_page_count_family_->Add({{"type", type}});
}

TimeObserver::TimeObserver(prometheus::Histogram* h) {
  histogram_ = h;
  timer_.start();
}

TimeObserver::~TimeObserver() {
  timer_.stop();
  histogram_->Observe(timer_.elapsedNs() / 1e6);  // ns -> ms
}

}  // namespace kvc2
csrc/balance_serve/kvc2/src/metrics.h
0 → 100644
#pragma once

#include <atomic>
#include <chrono>
#include <memory>
#include <string>
#include <thread>
#include <vector>

#include "prometheus/counter.h"
#include "prometheus/exposer.h"
#include "prometheus/gauge.h"
#include "prometheus/histogram.h"
#include "prometheus/registry.h"

#include "utils/timer.hpp"

namespace kvc2 {

// Metric name prefix macro
#define METRIC_PREFIX "kvc2"

struct MetricsConfig {
  std::string endpoint;  // listen endpoint, e.g. "0.0.0.0:8080"
};

class Metrics {
 public:
  // Constructor takes a MetricsConfig
  Metrics(const MetricsConfig& config);
  ~Metrics();

  // Disable copy and assignment
  Metrics(const Metrics&) = delete;
  Metrics& operator=(const Metrics&) = delete;

  // Metric pointers
  prometheus::Counter* prefix_nodes;
  prometheus::Counter* prefix_block_count;
  prometheus::Histogram* raw_insert_time_ms;
  prometheus::Histogram* lookup_time_ms;
  prometheus::Histogram* lookup_prefixmatch_length;
  prometheus::Histogram* matched_length_percentage;
  prometheus::Gauge* disk_usage;
  prometheus::Gauge* memory_pool_size(const std::string& type);
  prometheus::Gauge* memory_pool_node_count(const std::string& type);
  prometheus::Gauge* lru_entry_count(const std::string& type);
  prometheus::Gauge* gpu_page_count(std::string type);
  prometheus::Histogram* append_tokens_time_ms;
  prometheus::Histogram* gpu_flush_back_time_ms;
  prometheus::Histogram* cpu_flush_back_time_ms;

 private:
  std::shared_ptr<prometheus::Registry> registry_;
  prometheus::Exposer exposer_;

  prometheus::Family<prometheus::Gauge>* memory_pool_size_family_;
  prometheus::Family<prometheus::Gauge>* memory_pool_node_count_family_;
  prometheus::Family<prometheus::Gauge>* lru_entry_count_family_;
  prometheus::Family<prometheus::Gauge>* gpu_page_count_family_;
};

class TimeObserver {
 public:
  TimeObserver(prometheus::Histogram* h);
  ~TimeObserver();

 private:
  Timer timer_;
  prometheus::Histogram* histogram_;
};

}  // namespace kvc2
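A small illustrative sketch of the RAII timing pattern TimeObserver enables; the endpoint value and the record_lookup() helper are assumptions for the example, not part of the library.

#include "metrics.h"

using namespace kvc2;

// Hypothetical call site: time one lookup and record its prefix-match length.
void record_lookup(Metrics& met, size_t matched_tokens) {
  TimeObserver t(met.lookup_time_ms);  // observes the elapsed milliseconds on destruction
  // ... the actual lookup work would run here ...
  met.lookup_prefixmatch_length->Observe(static_cast<double>(matched_tokens));
}

int main() {
  MetricsConfig config;
  config.endpoint = "0.0.0.0:8080";  // example Prometheus scrape endpoint
  Metrics met(config);

  record_lookup(met, 128);
  met.prefix_nodes->Increment();  // counters are exposed at http://<endpoint>/metrics
  return 0;
}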
csrc/balance_serve/kvc2/src/model_config.h
0 → 100644
#ifndef __MODEL_CONFIG_HPP_
#define __MODEL_CONFIG_HPP_

#include "nlohmann/json.hpp"

#include <iostream>
#include <filesystem>
#include <fstream>

using DimSize = size_t;
using URL = std::string;
using ModelName = std::string;

// We must ensure this can be loaded from config.json
class ModelConfig {
 public:
  DimSize hidden_size;
  DimSize intermediate_size;
  size_t max_position_embeddings;
  std::string model_type;
  size_t num_attention_heads;
  size_t num_hidden_layers;
  size_t num_key_value_heads;
  size_t vocab_size;

  NLOHMANN_DEFINE_TYPE_INTRUSIVE(ModelConfig, hidden_size, intermediate_size, max_position_embeddings, model_type,
                                 num_attention_heads, num_hidden_layers, num_key_value_heads, vocab_size);

  void load_from(std::filesystem::path path) {
    std::cout << "Load from " << path << std::endl;
    std::ifstream i(path);
    nlohmann::json j;
    i >> j;
    *this = j.get<ModelConfig>();
  }
};

using QuantType = std::string;
static const QuantType NoQuantType = "";

class QuantConfig {
 public:
  QuantType name;

  // For GEMV
  QuantType type_of_dot_vector = NoQuantType;
  inline bool can_be_used_as_matrix() { return type_of_dot_vector != NoQuantType; }

  bool can_be_used_as_vector;
  double bytes_per_element;
  bool has_scale;
  bool has_min;
  size_t block_element_count;
  size_t block_element_size;

  URL reference = "";

  NLOHMANN_DEFINE_TYPE_INTRUSIVE_WITH_DEFAULT(QuantConfig, name, type_of_dot_vector, can_be_used_as_vector,
                                              bytes_per_element, has_scale, has_min, block_element_count,
                                              block_element_size, reference);
};

inline std::map<QuantType, QuantConfig> quant_configs;
inline std::map<ModelName, ModelConfig> model_configs;

inline void load_quant_configs(std::filesystem::path path) {
  nlohmann::json j;
  if (std::filesystem::exists(path)) {
    std::cout << __FUNCTION__ << " from " << path << std::endl;
    std::ifstream i(path);
    i >> j;
    quant_configs = j.get<std::map<QuantType, QuantConfig>>();
    std::cout << "Loaded Quant Configs" << std::endl;
    for (auto& [k, v] : quant_configs) {
      std::cout << " - " << k << std::endl;
    }
  } else {
    std::cout << __FUNCTION__ << " no file at " << path << std::endl;
  }
}

inline void dump_quant_configs(std::filesystem::path path) {
  std::ofstream o(path);
  nlohmann::json j = quant_configs;
  o << j.dump(4);
}

inline void load_model_configs(std::filesystem::path path) {
  nlohmann::json j;
  if (std::filesystem::exists(path)) {
    std::cout << __FUNCTION__ << " from " << path << std::endl;
    std::ifstream i(path);
    i >> j;
    model_configs = j.get<std::map<ModelName, ModelConfig>>();
    std::cout << "Loaded Model Configs" << std::endl;
    for (auto& [k, v] : model_configs) {
      std::cout << " - " << k << std::endl;
    }
  } else {
    std::cout << __FUNCTION__ << " no file at " << path << std::endl;
  }
}

inline void dump_model_configs(std::filesystem::path path) {
  std::ofstream o(path);
  nlohmann::json j = model_configs;
  o << j.dump(4);
}

#endif
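An illustrative sketch of loading and dumping the inline config maps; the JSON file names and the "some-model" key are placeholders, and each file is expected to map names to config objects matching the std::map types used by the loaders above.

#include "model_config.h"

int main() {
  // Hypothetical paths to JSON files containing the name -> config maps.
  load_model_configs("model_configs.json");
  load_quant_configs("quant_configs.json");

  if (model_configs.count("some-model")) {
    const ModelConfig& m = model_configs["some-model"];
    std::cout << "layers: " << m.num_hidden_layers
              << ", kv heads: " << m.num_key_value_heads << std::endl;
  }

  // Writing the maps back out produces pretty-printed JSON (dump(4)).
  dump_model_configs("model_configs_out.json");
  return 0;
}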
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.cpp
0 → 100644
#include "page_aligned_memory_pool.h"
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
#include "utils/arithmetic.hpp"
#include "utils/easy_format.hpp"

/// Constructor
PageAlignedMemoryPool::PageAlignedMemoryPool(size_t size_in_bytes) {
  total_size = (size_in_bytes / PageSize) * PageSize;
  // Aligned allocation. C++17 aligned-new syntax; switch to another approach if the compiler does not support it.
  data = ::operator new[](total_size, std::align_val_t(PageSize));
  total_pages = total_size / PageSize;
  assert(total_pages >= Blocks);
  page_per_block = total_pages / Blocks;
  for (size_t block_index = 0; block_index < Blocks; block_index++) {
    first_page[block_index] = reinterpret_cast<void*>(reinterpret_cast<intptr_t>(data) +
                                                      static_cast<intptr_t>(block_index) * page_per_block * PageSize);
    count_page[block_index] =
        block_index == Blocks - 1 ? (total_pages - page_per_block * (Blocks - 1)) : page_per_block;
    SPDLOG_DEBUG("first_page[{}] = {}, count_page[{}] = {}", block_index,
                 reinterpret_cast<intptr_t>(first_page[block_index]) - reinterpret_cast<intptr_t>(data), block_index,
                 count_page[block_index]);
    bitmap[block_index].resize(count_page[block_index], 0);
  }
  SPDLOG_INFO("PageAlignedMemoryPool with size {} Mbytes, {} pages", total_size / (1 << 20), page_count());
}

/// Destructor
PageAlignedMemoryPool::~PageAlignedMemoryPool() {
  if (data) {
    // Note: must match the alignment used for the allocation
    ::operator delete[](data, std::align_val_t(PageSize));
    data = nullptr;
  }
}

/// Returns the total number of pages
size_t PageAlignedMemoryPool::page_count() {
  return total_size / PageSize;
}

/// Returns the size in bytes rounded up to whole pages
size_t PageAlignedMemoryPool::page_padded_size(size_t size) {
  return div_up(size, PageSize) * PageSize;
}

void* PageAlignedMemoryPool::alloc_in_block(size_t block_index, size_t alloc_size) {
  std::lock_guard<std::mutex> guard(lock[block_index]);
  size_t free_pages = 0;
  for (size_t i = 0; i < count_page[block_index]; i++) {
    if (bitmap[block_index][i] == 0) {
      free_pages++;
      if (free_pages == alloc_size) {
        size_t page_index = i + 1 - free_pages;
        for (size_t page = page_index; page < page_index + alloc_size; page++) {
          bitmap[block_index][page] = 1;
          // SPDLOG_DEBUG("alloc page {} in block {}", page, block_index);
        }
        return reinterpret_cast<void*>(reinterpret_cast<intptr_t>(first_page[block_index]) + page_index * PageSize);
      }
    } else {
      free_pages = 0;
    }
  }
  return nullptr;
}

/// Allocate
void* PageAlignedMemoryPool::alloc(size_t size) {
  size_t alloc_size = div_up(size, PageSize);
  auto cnt = now_block.fetch_add(1, std::memory_order_relaxed);
  for (size_t i = 0; i < Blocks; i++) {
    auto result = alloc_in_block((i + cnt) % Blocks, alloc_size);
    if (result != nullptr) {
      allocated.fetch_add(alloc_size * PageSize, std::memory_order_relaxed);
      alloc_count.fetch_add(1, std::memory_order_relaxed);
      return result;
    }
  }
  return nullptr;
}

/// Free
void PageAlignedMemoryPool::free(void* p, size_t size) {
  auto alloc_size = div_up(size, PageSize);
  size_t block_index = (reinterpret_cast<intptr_t>(p) - reinterpret_cast<intptr_t>(data)) / page_per_block / PageSize;
  size_t page_index = (reinterpret_cast<intptr_t>(p) - reinterpret_cast<intptr_t>(first_page[block_index])) / PageSize;
  std::lock_guard<std::mutex> guard(lock[block_index]);
  for (size_t page = page_index; page < page_index + alloc_size; page++)
    bitmap[block_index][page] = 0;
  allocated.fetch_sub(alloc_size * PageSize, std::memory_order_relaxed);
  free_count.fetch_add(1, std::memory_order_relaxed);
}

// TODO: too slow
std::vector<void*> PageAlignedMemoryPool::alloc_multiple(size_t size, size_t count) {
  std::vector<void*> result;
  for (size_t i = 0; i < count; i++) {
    auto p = alloc(size);
    if (p == nullptr) {
      for (auto ptr : result) {
        free(ptr, size);
      }
      return {};
    }
    result.push_back(p);
  }
  return result;
}

void PageAlignedMemoryPool::defragment() {}

/// Debug printout
std::string PageAlignedMemoryPool::debug() {
  return fmt::format("PageAlignedMemoryPool: total_size: {}MB, allocated: {}, alloc/free count: {}/{}\n",
                     readable_number(total_size), readable_number(size_t(allocated)), size_t(alloc_count),
                     size_t(free_count));
}
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.h
0 → 100644
#pragma once

#include <assert.h>
#include <algorithm>  // std::sort
#include <atomic>
#include <bitset>
#include <cstddef>  // size_t
#include <mutex>    // std::mutex
#include <vector>

constexpr size_t PageSize = 4096;

/// Declaration of the PageAlignedMemoryPool class
struct PageAlignedMemoryPool {
 private:
  constexpr static size_t Blocks = 16;
  void* data = nullptr;
  size_t total_size = 0, total_pages = 0;

  std::atomic_size_t now_block = 0;
  std::atomic_size_t allocated = 0;  // allocated_size
  std::atomic_size_t alloc_count = 0;
  std::atomic_size_t free_count = 0;

  std::mutex lock[Blocks];
  size_t page_per_block = 0;
  void* first_page[Blocks];
  size_t count_page[Blocks];
  std::vector<int8_t> bitmap[Blocks];

  void* alloc_in_block(size_t block_index, size_t alloc_size);

 public:
  /// Constructor and destructor
  explicit PageAlignedMemoryPool(size_t size_in_bytes);
  ~PageAlignedMemoryPool();

  /// Disable copy and move
  PageAlignedMemoryPool(PageAlignedMemoryPool&& other) = delete;
  PageAlignedMemoryPool& operator=(PageAlignedMemoryPool&& other) = delete;
  PageAlignedMemoryPool(const PageAlignedMemoryPool& other) = delete;
  PageAlignedMemoryPool& operator=(const PageAlignedMemoryPool& other) = delete;

  /// Member functions
  size_t page_count();
  size_t page_padded_size(size_t size);
  void* alloc(size_t size);
  std::vector<void*> alloc_multiple(size_t size, size_t count);
  void free(void* data, size_t size);
  void defragment();
  std::string debug();
};
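An illustrative usage sketch of the pool's page-granular alloc/free contract, based on the header above and its implementation in page_aligned_memory_pool.cpp; the 64 MiB size is an arbitrary example value.

#include "page_aligned_memory_pool.h"
#include <cstdio>

int main() {
  // 64 MiB pool; the constructor rounds the size down to whole 4 KiB pages
  // and splits them across 16 internally locked blocks.
  PageAlignedMemoryPool pool(64ull << 20);

  // Allocations are rounded up to whole pages, so 5000 bytes occupies 2 pages.
  void* p = pool.alloc(5000);
  if (p != nullptr) {
    std::printf("%s", pool.debug().c_str());
    pool.free(p, 5000);  // free must be given the same size that was allocated
  }

  // alloc_multiple either returns `count` allocations or rolls back and returns {}.
  auto many = pool.alloc_multiple(PageSize, 8);
  for (auto ptr : many) {
    pool.free(ptr, PageSize);
  }
  return 0;
}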
csrc/balance_serve/kvc2/src/prefix.cpp
0 → 100644
View file @
877aec85
#include <immintrin.h>
#include <tbb/concurrent_hash_map.h>
#include <algorithm>
#include <cstdint>
#include <fstream>
#include <functional>
#include <list>
#include <map>
#include <memory>
#include <mutex>
#include <nlohmann/json.hpp>
#include <optional>
#include <shared_mutex>
#include <unordered_map>
#include <vector>
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
#include "async_store.hh"
#include "cuda_stream_manager.hh"
#include "kvc2.h"
#include "metrics.h"
#include "cache_entry.hh"
#include "gpu_cache.hh"
#include "hasher.hpp"
#include "io_helper.hpp"
#include "page_aligned_memory_pool.h"
#include "utils/arithmetic.hpp"
#include "utils/easy_format.hpp"
#include "utils/periodic_task.hpp"
namespace
kvc2
{
struct
KVC2
;
// will be set when init
TokenLength
NumTokenPerBlock
;
int
EvictCount
;
using
Layer
=
size_t
;
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE
(
CacheInfo
,
model_name
,
is_key_cache
,
quant_type
);
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE
(
KVC2Config
,
gpu_only
,
load_from_disk
,
save_to_disk
,
path
,
config_path
,
num_token_per_page
,
memory_pool_size
,
evict_count
,
metrics_port
,
recompute_ratio
);
size_t
CacheInfo
::
hidden_layer_count
()
{
return
model_configs
.
at
(
model_name
).
num_hidden_layers
;
}
std
::
filesystem
::
path
CacheInfo
::
path
(
std
::
optional
<
size_t
>
which_layer
)
{
auto
folder
=
std
::
filesystem
::
path
(
model_name
)
/
quant_type
/
(
is_key_cache
?
"key"
:
"value"
);
if
(
which_layer
.
has_value
())
{
folder
/=
fmt
::
format
(
"layer-{}.kvc"
,
which_layer
.
value
());
}
return
folder
;
}
bool
CacheInfo
::
operator
==
(
const
CacheInfo
&
other
)
const
{
return
model_name
==
other
.
model_name
&&
is_key_cache
==
other
.
is_key_cache
&&
quant_type
==
other
.
quant_type
;
}
size_t
CacheInfo
::
element_size
(
size_t
block_length
)
{
size_t
count
=
model_configs
[
model_name
].
hidden_size
*
block_length
;
auto
&
q
=
quant_configs
[
quant_type
];
return
count
/
q
.
block_element_count
*
q
.
block_element_size
;
}
size_t
CacheInfo
::
hash_value
()
const
{
size_t
x
=
hash_seed
;
x
=
XXH64
(
model_name
.
data
(),
model_name
.
size
(),
x
);
x
=
XXH64
(
"quant_type"
,
10
,
x
);
x
=
XXH64
(
quant_type
.
data
(),
quant_type
.
size
(),
x
);
if
(
is_key_cache
)
{
x
=
XXH64
(
"key"
,
3
,
x
);
}
else
{
x
=
XXH64
(
"value"
,
5
,
x
);
}
return
x
;
}
}
// namespace kvc2
template
<
>
struct
std
::
hash
<
kvc2
::
CacheInfo
>
{
std
::
size_t
operator
()(
const
kvc2
::
CacheInfo
&
s
)
const
noexcept
{
return
s
.
hash_value
();
}
};
namespace
kvc2
{
struct
Location
{
size_t
start_idx
;
// start block index
size_t
length
;
// length of blocks
NLOHMANN_DEFINE_TYPE_INTRUSIVE
(
Location
,
start_idx
,
length
);
Location
cut_tail
(
size_t
offset_from_tail
)
{
Location
re
;
size_t
offset
=
length
-
offset_from_tail
;
re
.
start_idx
=
start_idx
+
offset
;
re
.
length
=
offset_from_tail
;
length
=
offset
;
return
re
;
}
};
struct SegmentLocations {
  std::vector<std::optional<size_t>> offsets;

  void add_location(size_t start_block, Location location) {
    if (location.length + start_block > offsets.size()) {
      offsets.resize(location.length + start_block, std::nullopt);
    }
    for (size_t i = start_block; i < start_block + location.length; i++) {
      offsets[i] = location.start_idx + i - start_block;
    }
  }

  void set_location(size_t start_block, size_t disk_location) {
    if (start_block >= offsets.size()) {
      offsets.resize(start_block + 1, std::nullopt);
    }
    offsets[start_block] = disk_location;
  }

  std::optional<size_t> get_idx(size_t block_idx) const {
    if (block_idx >= offsets.size()) {
      return std::nullopt;
    } else {
      return offsets[block_idx];
    }
  }

  bool has_location(size_t block_idx, size_t length) {
    for (size_t i = block_idx; i < block_idx + length; i++) {
      if (get_idx(i).has_value() == false) {
        return false;
      }
    }
    return true;
  }

  void debug() {
    for (size_t i = 0; i < offsets.size(); ++i) {
      if (offsets[i].has_value()) {
        SPDLOG_DEBUG("Block {} -> Disk Location {}", i, offsets[i].value());
      } else {
        SPDLOG_DEBUG("Block {} -> No Disk Location", i);
      }
    }
  }
};
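// Worked example: add_location(/*start_block=*/2, Location{.start_idx = 100, .length = 3})
// resizes offsets to 5 entries and fills offsets[2] = 100, offsets[3] = 101, offsets[4] = 102,
// i.e. block i maps to disk index location.start_idx + (i - start_block).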
struct CacheDiskLocations {
  std::unordered_map<CacheInfo, Location> location_map;
  NLOHMANN_DEFINE_TYPE_INTRUSIVE(CacheDiskLocations, location_map);

  std::optional<Location> get_location(CacheInfo cache_info, TokenLength local_ids_length) {
    size_t blocks_length = div_up(local_ids_length, NumTokenPerBlock);
    if (location_map.count(cache_info) == 0) {
      return std::nullopt;
    }
    Location re = location_map[cache_info];
    re.length = blocks_length;
    return re;
  }

  std::optional<size_t> get_location_of_a_block(CacheInfo info, size_t local_at) {
    if (location_map.count(info) == 0) {
      return std::nullopt;
    }
    auto loc = location_map[info];
    if (local_at >= loc.length) {
      return std::nullopt;
    }
    return loc.start_idx + local_at;
  }
};
struct DiskCacheAllocator {
 private:
  // metadata
  std::filesystem::path path;
  CacheInfo info;
  std::mutex lock;
  size_t now_idx;

  // store
  size_t capacity;
  std::vector<async_store::ArrayStore*> stores;

  NLOHMANN_DEFINE_TYPE_INTRUSIVE(DiskCacheAllocator, now_idx);

  void update_capacity() {
    capacity = std::numeric_limits<size_t>::max();
    for (auto& store : stores) {
      capacity = std::min(capacity, async_store::capacity(store));
    }
  }

  void extend(size_t to) {
    for (size_t i = 0; i < info.hidden_layer_count(); i++) {
      async_store::extend(stores[i], to);
    }
    update_capacity();
  }

 public:
  async_store::ArrayStore* get_store(int i) { return stores[i]; }

  Location alloc(size_t block_count) {
    std::lock_guard<std::mutex> lg(lock);
    Location re;
    re.start_idx = now_idx;
    re.length = block_count;
    now_idx += block_count;
    if (now_idx >= capacity) {
      extend(capacity * 2);
    }
    return re;
  }

  DiskCacheAllocator(std::filesystem::path path, CacheInfo info) : path(path), info(info) {
    // SPDLOG_DEBUG("Create DiskCacheAllocator {}", path.c_str());
    auto allocator_path = path / info.path();
    if (std::filesystem::exists(allocator_path) == false) {
      std::filesystem::create_directories(allocator_path);
    }
    // restore metadata later in json load
    now_idx = 0;
    for (size_t i = 0; i < info.hidden_layer_count(); i++) {
      // SPDLOG_DEBUG("Create store {} for {}", (path / info.path(i)).c_str(),i);
      auto store = async_store::create_or_open_store(info.element_size(NumTokenPerBlock), 1000, path / info.path(i));
      stores.push_back(store);
    }
    update_capacity();
  }

  ~DiskCacheAllocator() {
    for (auto store : stores) {
      async_store::close_store(store);
    }
  }
};
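// Layout note: each DiskCacheAllocator owns one ArrayStore per hidden layer, created at
// <path>/<model_name>/<quant_type>/<key|value>/layer-<L>.kvc (see CacheInfo::path above) with an
// element size of CacheInfo::element_size(NumTokenPerBlock). alloc() hands out contiguous block
// ranges and doubles the store capacity once now_idx reaches it.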
struct DiskCacheManager {
  KVC2Config config;
  std::mutex lock;
  std::unordered_map<CacheInfo, std::shared_ptr<DiskCacheAllocator>> allocators;

  friend void to_json(nlohmann::json& nlohmann_json_j, const DiskCacheManager& nlohmann_json_t) {
    nlohmann_json_j["config"] = nlohmann_json_t.config;
    nlohmann_json_j["allocators"] = nlohmann::json::array();
    for (auto& [info, allocator] : nlohmann_json_t.allocators) {
      nlohmann_json_j["allocators"].push_back({{"info", info}, {"allocator", *allocator}});
    }
  }

  friend void from_json(const nlohmann::json& nlohmann_json_j, DiskCacheManager& nlohmann_json_t) {
    // SPDLOG_DEBUG("Load DiskCacheManager Json");
    nlohmann_json_j.at("config").get_to(nlohmann_json_t.config);
    for (const auto& allocator_json : nlohmann_json_j.at("allocators")) {
      // SPDLOG_DEBUG("Make Allocator {}",allocator_json.dump());
      CacheInfo info;
      allocator_json.at("info").get_to(info);
      auto allocator = std::make_shared<DiskCacheAllocator>(nlohmann_json_t.config.path, info);
      allocator_json.at("allocator").get_to(*allocator);
      nlohmann_json_t.allocators[info] = allocator;
    }
  };

  DiskCacheManager(KVC2Config config) : config(config) {
    SPDLOG_INFO("DiskCacheManager root path: {}", config.path.c_str());
    if (!std::filesystem::exists(config.path)) {
      std::filesystem::create_directories(config.path);
    }
  }

  std::shared_ptr<DiskCacheAllocator> get_allocator(CacheInfo info) {
    {
      std::lock_guard<std::mutex> lg(lock);
      if (allocators.count(info) == 0) {
        allocators.emplace(info, std::make_shared<DiskCacheAllocator>(config.path, info));
      }
    }
    return allocators.at(info);
  }

  Location allocate(CacheInfo info, size_t cache_block_count) {
    auto allocator = get_allocator(info);
    return allocator->alloc(cache_block_count);
  }
};
struct Prefix {
  uint64_t prefix_id;  // 0 for nullptr, started from 1
  TokenLength start_length;
  Tokens ids;
  CacheDiskLocations locations;
  Prefix* prev = nullptr;

  // No serialization
  bool prev_set = false;

  friend void to_json(nlohmann::json& nlohmann_json_j, const Prefix& nlohmann_json_t) {
    nlohmann_json_j["prefix_id"] = nlohmann_json_t.prefix_id;
    nlohmann_json_j["start_length"] = nlohmann_json_t.start_length;
    nlohmann_json_j["ids"] = nlohmann_json_t.ids;
    if (nlohmann_json_t.prev) {
      nlohmann_json_j["prev"] = nlohmann_json_t.prev->prefix_id;
    } else {
      nlohmann_json_j["prev"] = 0;
    }
    nlohmann_json_j["locations"] = nlohmann_json_t.locations;
  }

  friend void from_json(const nlohmann::json& nlohmann_json_j, Prefix& nlohmann_json_t) {
    nlohmann_json_j.at("prefix_id").get_to(nlohmann_json_t.prefix_id);
    nlohmann_json_j.at("start_length").get_to(nlohmann_json_t.start_length);
    nlohmann_json_j.at("ids").get_to(nlohmann_json_t.ids);
    nlohmann_json_j.at("locations").get_to(nlohmann_json_t.locations);
    auto prev_id = nlohmann_json_j.at("prev").get<uint64_t>();
    nlohmann_json_t.prev = reinterpret_cast<Prefix*>(prev_id);
    nlohmann_json_t.prev_set = false;
  };

  TokenLength local_length() { return ids.size(); }

  TokenLength length() { return start_length + local_length(); }

  Tokens prefix_to(TokenLength length) {
    TokenLength local_length = length - start_length;
    Tokens re;
    if (prev) {
      re = prev->prefix_to(start_length);
    }
    re.insert(re.end(), ids.begin(), ids.begin() + local_length);
    return re;
  }

  Tokens full() { return prefix_to(length()); }

  void update_location(CacheInfo info, Location location) { locations.location_map[info] = location; }

  Prefix* to_first_prefix_without_disk_locations(CacheInfo k_info /*, CacheInfo v_info*/) {
    // just k_info
    auto now_prefix = this;
    while (now_prefix->prev != nullptr) {
      auto& prev = now_prefix->prev;
      auto k_location = prev->locations.get_location(k_info, prev->local_length());
      // auto v_location = prev->locations.get_location(v_info, prev->local_length());
      if (k_location.has_value()) {
        // assert(v_location.has_value());
        // after now_prefix, we need to insert new kv cache.
        break;
      }
      now_prefix = prev;
    }
    return now_prefix;
  }

  void hash_to_with(TokenLength length, TokensHasher& hasher) {
    TokenLength local_length = length - start_length;
    if (prev) {
      prev->hash_to_with(start_length, hasher);
    }
    hasher.update(ids.data(), local_length);
  }

  void debug() {
    fmt::print("Prefix {}, start_length: {}, local_length: {}, prev: {},\n", prefix_id, start_length, local_length(),
               (void*)prev);
  }
};
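// Sketch of the prefix chain: each Prefix stores only the tokens it adds on top of its parent
// (`prev`), starting at token offset `start_length`. For two linked nodes
//   A{start_length = 0,   ids = 256 tokens, prev = nullptr}
//   B{start_length = 256, ids = 128 tokens, prev = &A}
// B.length() == 384 and B.prefix_to(384) walks prev first, returning A's 256 tokens followed by
// B's 128 tokens. Per-CacheInfo disk locations hang off each node in `locations`.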
struct PrefixMatch {
  Prefix* prefix;
  TokenLength match_length;

  std::vector<TokensHash> matched_hashes(CacheInfo info, Layer layer) {
    std::vector<TokensHash> re;
    if (prefix == nullptr)
      return re;
    TokensHasher hasher;
    hasher.reset(info.hash_value());
    hasher.update_raw(&layer, sizeof(layer));
    auto ids = prefix->prefix_to(match_length);
    for (TokenLength i = 0; i < ids.size(); i += NumTokenPerBlock) {
      TokenLength len = std::min(NumTokenPerBlock, ids.size() - i);
      re.push_back(hasher.update(ids.data() + i, len));
    }
    return re;
  }

  void collect_locations(CacheInfo info, SegmentLocations& seg_locs) {
    auto now_prefix = prefix;
    size_t length = match_length;
    while (now_prefix != nullptr) {
      TokenLength local_length = length - now_prefix->start_length;
      auto loc = now_prefix->locations.get_location(info, local_length);
      if (loc.has_value()) {
        seg_locs.add_location(now_prefix->start_length / NumTokenPerBlock, loc.value());
      }
      length = now_prefix->start_length;
      now_prefix = now_prefix->prev;
    }
  }
};
std::string to_string(const MatchStatus& status) {
  switch (status) {
    case Exact:
      return "Exact";
    case Partial:
      return "Partial";
    case NotMatchExact:
      return "NotMatchExact";
    case NotMatchPartial:
      return "NotMatchPartial";
    default:
      return "Unknown";
  }
}
struct MatchByBlock {
  // prefix, block idx at prefix, status
  std::vector<std::tuple<Prefix*, BlockLength, MatchStatus>> matches;

  bool any_match() {
    for (auto& [p, l, m] : matches) {
      if (p) {
        return true;
      }
    }
    return false;
  }

  size_t partial_count() {
    size_t re = 0;
    for (auto& [p, l, m] : matches) {
      if (m == Partial) {
        re++;
      }
    }
    return re;
  }

  bool has_partial() { return partial_count() > 0; }

  std::vector<std::optional<TokensHash>> matched_hashes(CacheInfo info, Layer layer) {
    // TODO: This function might be slow
    std::vector<std::optional<TokensHash>> re(matches.size(), std::nullopt);
    for (size_t i = 0; i < matches.size(); i++) {
      TokensHasher hasher;
      hasher.reset(info.hash_value());
      hasher.update_raw(&layer, sizeof(layer));
      auto& [p, idx, status] = matches[i];
      if (p) {
        p->hash_to_with((idx + 1) * NumTokenPerBlock, hasher);
        re[i] = hasher.get();
      }
    }
    return re;
  }

  void collect_locations(CacheInfo info, SegmentLocations& seg_locs) {
    for (size_t i = 0; i < matches.size(); i++) {
      auto& [p, idx, status] = matches[i];
      if (p) {
        auto local_at = idx - p->start_length / NumTokenPerBlock;
        seg_locs.set_location(i, p->locations.get_location_of_a_block(info, local_at).value());
      }
    }
  }

  std::string debug_string() {
    std::string re = fmt::format("{} Match: ", matches.size());
    for (auto& [p, idx, status] : matches) {
      switch (status) {
        case Exact:
          re += "E";
          break;
        case Partial:
          re += "P";
          break;
        case NotMatchExact:
          re += "N";
          break;
        case NotMatchPartial:
          re += "n";
          break;
        default:
          assert(0);
      }
    }
    return re;
  }
};
struct PrefixTree {
  std::shared_mutex rw_lock;
  std::atomic_uint64_t prefix_id_counter = 1;
  using MapT = std::unordered_map<TokensHash, std::pair<std::shared_ptr<Prefix>, BlockLength>>;
  // Prefix, start_block_idx
  MapT prefix_map;
  std::shared_ptr<Metrics> met;

  std::vector<std::shared_ptr<Prefix>> prefix_refs = {nullptr};  // 0 is nullptr

  friend void to_json(nlohmann::json& nlohmann_json_j, const PrefixTree& nlohmann_json_t) {
    nlohmann_json_j["prefix_id_counter"] = nlohmann_json_t.prefix_id_counter.load();
    nlohmann_json_j["prefix_refs"] = nlohmann::json::array();
    for (auto prefix : nlohmann_json_t.prefix_refs) {
      if (prefix == nullptr)
        continue;
      nlohmann_json_j["prefix_refs"].push_back(*prefix);
    }
  }

  friend void from_json(const nlohmann::json& nlohmann_json_j, PrefixTree& nlohmann_json_t) {
    nlohmann_json_t.prefix_id_counter = nlohmann_json_j.at("prefix_id_counter").get<uint64_t>();
    nlohmann_json_t.prefix_refs.resize(nlohmann_json_t.prefix_id_counter);
    for (size_t i = 1; i < nlohmann_json_t.prefix_id_counter; ++i) {
      auto prefix = std::make_shared<Prefix>();
      nlohmann_json_j.at("prefix_refs")[i - 1].get_to(*prefix);
      nlohmann_json_t.prefix_refs[i] = prefix;
    }
    nlohmann_json_t.init_prevs();
    nlohmann_json_t.init_map();
  };

  void init_prevs() {
    for (auto p : prefix_refs) {
      if (p) {
        if (p->prev_set == false) {
          p->prev = prefix_refs[reinterpret_cast<uint64_t>(p->prev)].get();
          p->prev_set = true;
        }
      }
    }
  }

  void init_map() {
    assert(prefix_map.empty());
    for (auto p : prefix_refs) {
      if (p == nullptr)
        continue;
      auto ids = p->full();
      for (TokenLength i = p->start_length; i < p->length(); i += NumTokenPerBlock) {
        TokenLength end = std::min(i + NumTokenPerBlock, p->length());
        assert(end % NumTokenPerBlock == 0);
        auto hash = TokensHasher::hash(ids.data(), end);
        prefix_map[hash] = {p, end / NumTokenPerBlock - 1};
      }
    }
  }

  // Look up prefix from the map, return the matched prefix and length.
  // If the prefix is not found, match contains nullptr and 0.
  PrefixMatch look_up(Token* data, TokenLength length, bool need_lock = true) {
    std::shared_lock<std::shared_mutex> sl;
    if (need_lock) {
      sl = std::shared_lock<std::shared_mutex>(rw_lock);
    }
    // TODO: prefix cache
  }

  PrefixMatch look_up_or_insert(Token* data, TokenLength length) {
    std::unique_lock<std::shared_mutex> ul(rw_lock);
    auto match = look_up(data, length, false);
    if (match.match_length == length) {
      return match;
    }
    auto new_prefix = new_prefix_node(match.prefix, match.match_length, data, length, false);
    PrefixMatch re;
    re.prefix = new_prefix.get();
    re.match_length = length;
    return re;
  }

  std::shared_ptr<Prefix> new_prefix_node(Prefix* prev, TokenLength prev_match_length, Token* data, TokenLength length,
                                          bool need_lock = true) {
    std::unique_lock<std::shared_mutex> ul;
    if (need_lock)
      ul = std::unique_lock<std::shared_mutex>(rw_lock);
    auto new_prefix = std::make_shared<Prefix>();
    new_prefix->prefix_id = prefix_id_counter.fetch_add(1);
    new_prefix->start_length = prev_match_length;
    new_prefix->ids = Tokens(data + prev_match_length, data + length);
    new_prefix->prev = prev;
    new_prefix->prev_set = true;
    prefix_refs.push_back(new_prefix);
    met->prefix_nodes->Increment();
    met->prefix_block_count->Increment(div_up(length - prev_match_length, NumTokenPerBlock));
    assert(prefix_refs.size() == prefix_id_counter.load());

    TokensHasher hasher;
    hasher.update(data, prev_match_length);
    for (TokenLength i = prev_match_length; i < length; i += NumTokenPerBlock) {
      TokenLength len = std::min(NumTokenPerBlock, length - i);
      auto hash = hasher.update(data + i, len);
      prefix_map[hash] = {new_prefix, i / NumTokenPerBlock};
    }
    return new_prefix;
  }

  void debug() {
    fmt::print("PrefixTree with {} prefixes, prefix counter: {}\n", prefix_map.size(), prefix_id_counter.load());
    for (auto& [hash, prefix] : prefix_map) {
      fmt::print("Hash: {:016x}, start block {}\n", hash, prefix.second);
      prefix.first->debug();
    }
  }
};
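// Keying note: prefix_map is keyed by the hash of all tokens up to a block boundary (see
// init_map() and new_prefix_node()), with the value pointing at the Prefix node that owns that
// block plus the block's index. A lookup for a token sequence can therefore probe the hash of
// each successively longer NumTokenPerBlock-aligned prefix; look_up() above is still a TODO in
// this commit, so the exact probing strategy is not shown here.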
size_t locations_blocks_count(const std::vector<Location>& locations) {
  auto re = 0;
  for (auto& loc : locations) {
    re += loc.length;
  }
  return re;
}
struct DoubleCacheHandle : public DoubleCacheHandleInterface {
  ModelName model_name;
  QuantType quant_type;
  bool is_k_cache_on;
  bool is_v_cache_on;

  CacheInfo k_info() {
    if (is_k_cache_on == false) {
      SPDLOG_WARN("Get K CacheInfo, but K Cache is off");
    }
    return CacheInfo{
        .model_name = model_name,
        .is_key_cache = true,
        .quant_type = quant_type,
    };
  };
  CacheInfo v_info() {
    if (is_v_cache_on == false) {
      SPDLOG_WARN("Get V CacheInfo, but V Cache is off");
    }
    return CacheInfo{
        .model_name = model_name,
        .is_key_cache = false,
        .quant_type = quant_type,
    };
  };
  Tokens ids;
  TokenLength estimated_length;

  bool enable_alt = false;
  PrefixMatch match;
  // MatchByBlock match_by_blocks;

  std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>> k_cache_handles;
  std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>> v_cache_handles;

  SegmentLocations k_seg_locs;
  SegmentLocations v_seg_locs;

  KVC2* kvc2_top;

  // for Cache Fusion
  std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>> attatched_cache_handles;

  std::unique_ptr<CacheBlockEntryCollector> cpu_releaser = nullptr, gpu_releaser = nullptr;

  std::vector<size_t> gpu_only_block_idx;

  virtual ~DoubleCacheHandle();

  // interface
  TokenLength matched_length() override {
    if (enable_alt) {
      assert(0);
    } else {
      return match.match_length;
    }
  }

  MatchStatus status_at(BlockLength i) {
    assert(i < div_up(estimated_length, NumTokenPerBlock));
    if (enable_alt) {
      assert(false);
      // if (i >= match_by_blocks.matches.size()) {
      //   return match_by_blocks.has_partial() ? MatchStatus::NotMatchPartial : MatchStatus::NotMatchExact;
      // }
      // return std::get<2>(match_by_blocks.matches[i]);
    } else {
      if (i < match.match_length / NumTokenPerBlock) {
        return MatchStatus::Exact;
      } else {
        return MatchStatus::NotMatchExact;
      }
    }
  }

  std::vector<MatchStatus> matched_status() override { assert(false); }

  bool any_match() {
    if (enable_alt) {
      assert(false);
      // return match_by_blocks.any_match();
    } else {
      return match.prefix != nullptr;
    }
  }

  BlockLength match_range_length() {
    if (enable_alt) {
      assert(false);
      // return match_by_blocks.matches.size();
    } else {
      return div_up(match.match_length, NumTokenPerBlock);
    }
  }

  std::vector<layer_data> handle_data(bool is_key_cache) override { return export_raw_pointers(is_key_cache); }

  bool to_gpu() override;
  void to_gpu_async(std::function<void(bool)> call_back) override;
  std::vector<size_t> get_gpu_block_idx() override;

  bool alloc_attached_blocks(BlockLength count);
  std::vector<size_t> get_gpu_attached_block_idx() override;

  void append_tokens(Token* tokens, TokenLength length) override;

  void debug() override {}

  void set_cache_info(ModelName model_name, QuantType quant_type, bool turn_on_k_cache, bool turn_on_v_cache) {
    this->model_name = model_name;
    this->quant_type = quant_type;
    if (turn_on_k_cache) {
      is_k_cache_on = true;
      k_cache_handles.resize(k_info().hidden_layer_count());
    } else {
      is_k_cache_on = false;
      k_cache_handles.clear();
    }
    if (turn_on_v_cache) {
      is_v_cache_on = true;
      v_cache_handles.resize(v_info().hidden_layer_count());
    } else {
      is_v_cache_on = false;
      v_cache_handles.clear();
    }
  }
  void check_before_insert() {
    std::optional<size_t> blocks_count = std::nullopt;
    auto check_single_cache = [&blocks_count](CacheInfo cache_info,
                                              std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& layers,
                                              Tokens& ids) {
      for (size_t i = 0; i < cache_info.hidden_layer_count(); i++) {
        auto& layer = layers[i];
        if (blocks_count.has_value() == false) {
          blocks_count = layer.size();
        } else {
          if (blocks_count.value() != layer.size()) {
            SPDLOG_ERROR("Layer {} has different block count", i);
            throw std::runtime_error("Layer has different block count");
          }
        }
      }
      if (blocks_count.has_value()) {
        if (blocks_count.value() != div_up(ids.size(), NumTokenPerBlock)) {
          SPDLOG_ERROR("Block count not match, ids: {}, blocks: {}", ids.size(), blocks_count.value());
          throw std::runtime_error("Block count not match");
        }
      }
    };
    if (is_k_cache_on)
      check_single_cache(k_info(), k_cache_handles, ids);
    if (is_v_cache_on)
      check_single_cache(v_info(), v_cache_handles, ids);
  }
  template <typename Fn>
  void for_all_cache_block_entry(Fn f) {
    if (is_k_cache_on) {
      for (auto& layer : k_cache_handles) {
        for (auto& block : layer) {
          if (f(block) == false)
            return;
        }
      }
    }
    if (is_v_cache_on) {
      for (auto& layer : v_cache_handles) {
        for (auto& block : layer) {
          if (f(block) == false)
            return;
        }
      }
    }
  }
  // concurrent check ok
  bool alloc_on_cpu() {
    assert(cpu_releaser == nullptr);
    std::unique_ptr<CacheBlockEntryCollector> releaser =
        std::make_unique<CacheBlockEntryCollector>([](CacheBlockEntry* entry) {
          auto lg = entry->lock_guard();
          entry->cpu_cc.ref_count.fetch_sub(1);
        });
    bool ok = true;
    for_all_cache_block_entry([&ok, &releaser](std::shared_ptr<CacheBlockEntry>& block_entry) {
      if (block_entry->inc_ref_or_alloc_on_cpu() == false) {
        ok = false;
        return false;
      } else {
        releaser->entries.push_back(block_entry.get());
      }
      return true;
    });
    if (ok) {
      cpu_releaser = std::move(releaser);
    }
    return ok;
  }
  bool alloc_on_gpu_cols() {
    assert(is_k_cache_on);
    assert(gpu_releaser == nullptr);
    std::unique_ptr<CacheBlockEntryCollector> releaser =
        std::make_unique<CacheBlockEntryCollector>([](CacheBlockEntry* entry) {
          auto lg = entry->lock_guard();
          entry->gpu_cc.ref_count.fetch_sub(1);
        });
    GPUPageCache* gpu_cache = k_cache_handles[0][0]->manager->gpu_cache.get();
    gpu_cache->background_flush_back->wakeUpWait();
    bool ok = true;
    size_t want_count = 0;
    for (size_t i = 0; i < k_cache_handles[0].size(); i++) {
      auto lg = k_cache_handles[0][i]->lock_guard();
      if (k_cache_handles[0][i]->gpu_block_idx.has_value() == false) {
        want_count += 1;
        if (gpu_cache->alloc_col(k_cache_handles, v_cache_handles, i) == false) {
          ok = false;
          break;
        }
      }
      k_cache_handles[0][i]->gpu_cc.ref_count.fetch_add(1);
      releaser->entries.push_back(k_cache_handles[0][i].get());
    }
    if (ok == false) {
      SPDLOG_WARN("Handle cannot allocate {} gpu pages", want_count);
    } else {
      gpu_releaser = std::move(releaser);
    }
    return ok;
  }
  static void segment_io_layer(async_store::IODealer* dealer, IO_Helper<CacheBlockEntry>& io_helper,
                               async_store::ArrayStore* store,
                               std::vector<std::shared_ptr<CacheBlockEntry>>& layer_entries, size_t block_start,
                               size_t length, Layer layer, const SegmentLocations& locations, IOOption option) {
    SPDLOG_TRACE("{} [{}:{}) blocks to/from disk", to_string(option), block_start, block_start + length);
    for (size_t i = block_start; i < block_start + length; i++) {
      if (locations.get_idx(i).has_value()) {
        SPDLOG_TRACE("Location for block {}, {}", i, locations.get_idx(i).value());
        layer_entries[i]->io_with(dealer, io_helper, store, layer, locations.get_idx(i).value(), option);
      }
    }
  }
  std::shared_ptr<IO_Helper<CacheBlockEntry>> segment_io(async_store::IODealer* dealer, DiskCacheManager* manager,
                                                         BlockLength block_start, BlockLength length,
                                                         IOOption option) {
    auto io_helper = std::make_shared<IO_Helper<CacheBlockEntry>>([option](CacheBlockEntry* b) {
      switch (option) {
        case IO_ForceRead:
          break;
        case IO_ForceWrite:
          break;
        case IO_Read: {
          b->cpu_cc.tc.set_has_data();
          break;
        }
        case IO_Write:
          break;
        default:
          assert(0);
      }
    });
    auto single_segment_io = [dealer, manager, block_start, length, option, io_helper](
                                 CacheInfo info, SegmentLocations& seg_locs,
                                 std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& layers) {
      assert(layers[0].size() >= block_start + length);
      auto allocator = manager->get_allocator(info);
      for (size_t l = 0; l < info.hidden_layer_count(); l++) {
        segment_io_layer(dealer, *io_helper, allocator->get_store(l), layers[l], block_start, length, l, seg_locs,
                         option);
      }
    };
    if (is_k_cache_on)
      single_segment_io(k_info(), k_seg_locs, k_cache_handles);
    if (is_v_cache_on)
      single_segment_io(v_info(), v_seg_locs, v_cache_handles);
    io_helper->finish_add_taks();
    SPDLOG_DEBUG("Segment IO Submitted, total task count {}", io_helper->total_task_count);
    return io_helper;
  }
  std::shared_ptr<IO_Helper<CacheBlockEntry>> gpu_io(GPUPageCache* gpu_cache, BlockLength block_start,
                                                     BlockLength length, IOOption option) {
    auto io_helper = std::make_shared<IO_Helper<CacheBlockEntry>>([option](CacheBlockEntry* b) {
      switch (option) {
        case IO_ForceRead:
          break;
        case IO_ForceWrite:
          break;
        case IO_Read: {
          b->gpu_cc.tc.set_has_data();
          break;
        }
        case IO_Write:
          break;
        default:
          assert(0);
      }
    });
    cudaMemcpyKind direction;
    if (option == IO_Read || option == IO_ForceRead) {
      direction = cudaMemcpyHostToDevice;
    }
    if (option == IO_Write || option == IO_ForceWrite) {
      direction = cudaMemcpyDeviceToHost;
    }
    auto reqs = gpu_cache->basic_request(direction, [io_helper]() { io_helper->batch_promise.set(); });
    for (size_t i = block_start; i < length; i++) {
      auto status = status_at(i);
      if (status == NotMatchExact || status == NotMatchPartial) {
        SPDLOG_DEBUG("GPU: Col Handle not match (Skipped by Alt Match)");
        continue;
      }
      auto ptr = k_cache_handles[0][i].get();
      switch (option) {
        case IO_Read: {
          if (io_helper->absorb_tc(ptr, ptr->gpu_cc.tc) == false) {
            // SPDLOG_DEBUG("GPU: Col Handle need me to wait");
            continue;
          }
          break;
        }
        case IO_ForceRead: {
          break;
        }
        case IO_ForceWrite: {
          break;
        }
        case IO_Write: {
          break;
        }
        default: {
          assert(0);
        }
      }
      SPDLOG_DEBUG("GPU: Col Handle needs me to transfer");
      gpu_cache->append_col_to_request(reqs, k_cache_handles, v_cache_handles, i);
    }
    io_helper->new_task(reqs.size());
    gpu_cache->submit_requests(reqs);
    io_helper->finish_add_taks();
    return io_helper;
  }
  // void set_raw_handles(const std::vector<layer_data>& k, const std::vector<layer_data>& v) {
  //   set_raw_handles(true, k);
  //   set_raw_handles(false, v);
  // }

  void set_raw_handles(bool is_key_cache, const std::vector<layer_data>& layer_data) {
    auto single_set_raw_handles = [layer_data](CacheInfo info,
                                               std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& handles) {
      handles.resize(layer_data.size());
      for (size_t i = 0; i < info.hidden_layer_count(); i++) {
        auto& layer = layer_data[i];
        handles[i].clear();
        for (auto& block_data : layer) {
          auto handle = std::make_shared<CacheBlockEntry>();
          handle->data = reinterpret_cast<void*>(block_data);
          handle->size = info.element_size(NumTokenPerBlock);
          handles[i].push_back(handle);
        }
      }
    };
    if (is_key_cache) {
      is_k_cache_on = true;
      single_set_raw_handles(k_info(), k_cache_handles);
    } else {
      is_v_cache_on = true;
      single_set_raw_handles(v_info(), v_cache_handles);
    }
  }
  std::vector<layer_data> export_raw_pointers(bool is_key_cache) {
    std::vector<layer_data> re;
    auto single_export_raw_pointers = [&re](std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& layers) {
      for (auto& layer_handle : layers) {
        layer_data layer;
        for (size_t i = 0; i < layer_handle.size(); i++) {
          auto block = layer_handle.at(i);
          layer.push_back(reinterpret_cast<data_block_ptr>(block->data));
        }
        re.push_back(layer);
      }
    };
    if (is_key_cache) {
      if (is_k_cache_on == false) {
        SPDLOG_WARN("Export K Cache, but K Cache is off");
      }
      single_export_raw_pointers(k_cache_handles);
    } else {
      if (is_v_cache_on == false) {
        SPDLOG_WARN("Export V Cache, but V Cache is off");
      }
      single_export_raw_pointers(v_cache_handles);
    }
    return re;
  }
  void get_handles();
  void get_empty_handles();

  void collect_locations() {
    if (enable_alt) {
      assert(false);
      // match_by_blocks.collect_locations(k_info(), k_seg_locs);
      // match_by_blocks.collect_locations(v_info(), v_seg_locs);
    } else {
      if (is_k_cache_on)
        match.collect_locations(k_info(), k_seg_locs);
      if (is_v_cache_on)
        match.collect_locations(v_info(), v_seg_locs);
    }
    if (is_k_cache_on)
      k_seg_locs.debug();
    // v_seg_locs.debug();
  }
};
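// Layout note: k_cache_handles / v_cache_handles are indexed as [layer][block], one CacheBlockEntry
// per (layer, NumTokenPerBlock-sized block). alloc_on_gpu_cols() and gpu_io() treat a whole column
// (all layers of one block index) as the unit of GPU allocation and transfer, which is why only
// k_cache_handles[0][i] is inspected there.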
struct KVC2 : KVC2Interface {
  KVC2Config config;
  std::shared_ptr<Metrics> met;
  std::filesystem::path root;
  std::unique_ptr<PrefixTree> tree;
  std::unique_ptr<DiskCacheManager> disk_cache;
  std::shared_ptr<PageAlignedMemoryPool> memory_pool;
  std::unique_ptr<CacheEntryManager> cache_manager;
  std::unique_ptr<async_store::IODealer> io_dealer;
  std::shared_ptr<GPUPageCache> gpu_cache;

 public:
  void load() override {
    load_quant_configs(root / "quant_configs.json");
    load_model_configs(root / "model_configs.json");
    {
      auto where = root / "tree.json";
      if (std::filesystem::exists(where)) {
        nlohmann::json j;
        std::ifstream i(where);
        i >> j;
        j.get_to(*tree);
        SPDLOG_WARN("Loaded from {}", where.c_str());
      }
    }
    {
      auto where = root / "disk_cache.json";
      if (std::filesystem::exists(where)) {
        nlohmann::json j;
        std::ifstream i(where);
        i >> j;
        j.get_to(*disk_cache);
        SPDLOG_WARN("Loaded from {}", where.c_str());
      }
    }
    {
      auto where = root / "config.json";
      if (std::filesystem::exists(where)) {
        nlohmann::json j;
        std::ifstream i(where);
        i >> j;
        j.get_to(config);
        SPDLOG_WARN("Loaded from {}", where.c_str());
      }
    }
  }
  void save() override {
    if (config.save_to_disk == false) {
      return;
    }
    flush_back();
    {
      nlohmann::json j;
      j = *tree;
      auto where = root / "tree.json";
      std::ofstream o(where);
      o << j;
      SPDLOG_WARN("Serialized to {}", where.c_str());
    }
    {
      nlohmann::json j;
      j = *disk_cache;
      auto where = root / "disk_cache.json";
      std::ofstream o(where);
      o << j;
      SPDLOG_WARN("Serialized to {}", where.c_str());
    }
    {
      nlohmann::json j;
      j = config;
      auto where = root / "config.json";
      std::ofstream o(where);
      o << j;
      SPDLOG_WARN("Serialized to {}", where.c_str());
    }
    dump_quant_configs(root / "quant_configs.json");
    dump_model_configs(root / "model_configs.json");
  }
  void raw_insert(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                  const std::vector<layer_data>& k_cache, const std::vector<layer_data>& v_cache) override {
    TimeObserver time_observer(met->raw_insert_time_ms);
    SPDLOG_INFO("Raw Insert");
    if (length % NumTokenPerBlock != 0) {
      SPDLOG_WARN(
          "Try to insert tokens with length {}, which is not a multiple of NumTokenPerBlock({}), getting floor",
          length, NumTokenPerBlock);
      length = length / NumTokenPerBlock * NumTokenPerBlock;
    }
    auto h = std::make_shared<DoubleCacheHandle>();
    h->kvc2_top = this;
    h->set_cache_info(model_name, quant_type, config.k_cache_on, config.v_cache_on);
    h->ids = Tokens(id, id + length);
    if (config.k_cache_on)
      h->set_raw_handles(true, k_cache);
    if (config.v_cache_on)
      h->set_raw_handles(false, v_cache);
    h->check_before_insert();

    h->match = tree->look_up_or_insert(id, length);
    auto now_prefix = h->match.prefix;
    assert(config.k_cache_on);
    if (now_prefix->locations.get_location(h->k_info(), length - now_prefix->start_length).has_value()) {
      assert(now_prefix->locations.get_location(h->v_info(), length - now_prefix->start_length).has_value());
      SPDLOG_INFO("KV Cache Already on disk");
      // already on disk
    } else {
      now_prefix = now_prefix->to_first_prefix_without_disk_locations(h->k_info());
      // insert new kv cache locations
      TokenLength new_length = length - now_prefix->start_length;
      SPDLOG_DEBUG("Inserting new kv cache, length: {}", new_length);
      assert(new_length > 0);
      if (config.v_cache_on) {
        // allocate a big space on disk
        auto k_loc = disk_cache->allocate(h->k_info(), div_up(new_length, NumTokenPerBlock));
        auto v_loc = disk_cache->allocate(h->v_info(), div_up(new_length, NumTokenPerBlock));
        h->k_seg_locs.add_location(now_prefix->start_length / NumTokenPerBlock, k_loc);
        h->v_seg_locs.add_location(now_prefix->start_length / NumTokenPerBlock, v_loc);
        // split it to prefix trees
        for (auto tail = h->match.prefix; tail != now_prefix->prev; tail = tail->prev) {
          TokenLength local_ids_length = tail->local_length();
          tail->update_location(h->k_info(), k_loc.cut_tail(div_up(local_ids_length, NumTokenPerBlock)));
          tail->update_location(h->v_info(), v_loc.cut_tail(div_up(local_ids_length, NumTokenPerBlock)));
        }
        assert(k_loc.length == 0);
        assert(v_loc.length == 0);
      } else {
        // allocate a big space on disk
        auto k_loc = disk_cache->allocate(h->k_info(), div_up(new_length, NumTokenPerBlock));
        h->k_seg_locs.add_location(now_prefix->start_length / NumTokenPerBlock, k_loc);
        // split it to prefix trees
        for (auto tail = h->match.prefix; tail != now_prefix->prev; tail = tail->prev) {
          TokenLength local_ids_length = tail->local_length();
          tail->update_location(h->k_info(), k_loc.cut_tail(div_up(local_ids_length, NumTokenPerBlock)));
        }
        assert(k_loc.length == 0);
      }
      // write new kv cache
      auto disk_io_helper =
          h->segment_io(io_dealer.get(), disk_cache.get(), now_prefix->start_length / NumTokenPerBlock,
                        div_up(new_length, NumTokenPerBlock), IO_ForceWrite);
      disk_io_helper->wait();
    }
  }
  TokenLength raw_read(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                       const std::vector<layer_data>& k_cache, const std::vector<layer_data>& v_cache) override {
    SPDLOG_INFO("Raw Read");
    auto h = std::make_shared<DoubleCacheHandle>();
    h->kvc2_top = this;
    h->set_cache_info(model_name, quant_type, config.k_cache_on, config.v_cache_on);
    h->ids = Tokens(id, id + length);
    if (config.k_cache_on)
      h->set_raw_handles(true, k_cache);
    if (config.v_cache_on)
      h->set_raw_handles(false, v_cache);
    h->match = tree->look_up(id, length);
    if (h->match.prefix == nullptr) {
      SPDLOG_INFO("Not Found");
      return 0;
    }
    SPDLOG_DEBUG("Found {}", h->match.match_length);
    h->collect_locations();
    auto disk_io_helper = h->segment_io(io_dealer.get(), disk_cache.get(), 0,
                                        div_up(h->match.match_length, NumTokenPerBlock), IO_ForceRead);
    disk_io_helper->wait();
    return h->match.match_length;
  }
  std::shared_ptr<DoubleCacheHandleInterface> lookup(ModelName model_name, QuantType quant_type, Token* id,
                                                     TokenLength length, TokenLength estimated_length) override {
    TimeObserver time_observer(met->lookup_time_ms);
    auto re = std::make_shared<DoubleCacheHandle>();
    re->set_cache_info(model_name, quant_type, config.k_cache_on, config.v_cache_on);
    re->ids = Tokens(id, id + length);
    re->estimated_length = estimated_length;
    re->kvc2_top = this;
    SPDLOG_DEBUG("Lookup TokenLength {}", length);
    if (config.gpu_only == false) {
      // TODO:
    }
    return re;
  };
  std::shared_ptr<DoubleCacheHandleInterface> lookup_to_gpu(ModelName model_name, QuantType quant_type, Token* id,
                                                            size_t length, size_t estimated_length) override {
    std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
    lookup_to_gpu_async(model_name, quant_type, id, length, estimated_length, [&p](auto re) { p.set_value(re); });
    return p.get_future().get();
  }
  void lookup_to_gpu_async(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                           TokenLength estimated_length,
                           std::function<void(std::shared_ptr<DoubleCacheHandleInterface>)> call_back) override {
    auto re = lookup(model_name, quant_type, id, length, estimated_length);
    if (re == nullptr) {
      call_back(nullptr);
      return;
    }
    auto h = static_cast<DoubleCacheHandle*>(re.get());
    if (config.gpu_only) {
      auto total_block_count = div_up(estimated_length, NumTokenPerBlock);
      h->gpu_only_block_idx = gpu_cache->gpu_only_alloc_col(total_block_count);
      if (h->gpu_only_block_idx.empty()) {
        call_back(nullptr);
      } else {
        call_back(re);
      }
    } else {
      if (h->k_info().hidden_layer_count() != gpu_cache->config.layer_count) {
        SPDLOG_ERROR("GPU Cache Layer Count not match");
        assert(false);
      }
      if (h->alloc_on_gpu_cols() == false) {
        call_back(nullptr);
        return;
      }
      h->to_gpu_async([call_back, re](bool ok) {
        if (ok) {
          call_back(re);
        } else {
          call_back(nullptr);
        }
      });
    }
  }
  std::pair<std::vector<torch::Tensor>, std::vector<torch::Tensor>> get_kvcache() override {
    return {gpu_cache->k_cache, gpu_cache->v_cache};
  }
  void flush_back() {
    gpu_cache->background_flush_back->wakeUpWait();
    cache_manager->background_flush_back->wakeUpWait();
  }
  void debug() override {
    cache_manager->debug();
    tree->debug();
  }

  virtual ~KVC2() { flush_back(); };
  KVC2(KVC2Config config) : config(config) {
    SPDLOG_INFO("Creating KVC2 using these config");
    SPDLOG_INFO(" GPU Only: {}", config.gpu_only);
    SPDLOG_INFO(" Load: {}, Save: {}", config.load_from_disk, config.save_to_disk);
    SPDLOG_INFO(" Path: {}", config.path);
    SPDLOG_INFO(" Config Path: {}", config.config_path);
    SPDLOG_INFO(" Num Token/Page: {}, Memory Pool Size: {}", config.num_token_per_page,
                readable_number(config.memory_pool_size));
    SPDLOG_INFO(" Evict Count: {}, Metrics Port: {}", config.evict_count, config.metrics_port);
    SPDLOG_INFO(" Recompute Ratio: {:.2f}", config.recompute_ratio);
    if (config.gpu_cache_config) {
      const auto& gpu_config = *config.gpu_cache_config;
      SPDLOG_INFO(" GPU Devices: {}", format_vector(gpu_config.gpu_devices_id));
      SPDLOG_INFO(" Layer Count: {}, Total KVCache Pages: {}", gpu_config.layer_count,
                  gpu_config.total_kvcache_pages);
      SPDLOG_INFO(" Num Token/Page: {}, Num K Heads: {}", gpu_config.num_token_per_page, gpu_config.num_k_heads);
      SPDLOG_INFO(" K Head Dim: {}, Tensor Type: {}", gpu_config.k_head_dim,
                  static_cast<int>(gpu_config.tensor_type));
      SPDLOG_INFO(" MemcpyCudaStreams/Device: {}", gpu_config.num_streams_per_device);
    } else {
      SPDLOG_INFO(" GPU Cache Config: None");
    }

    load_model_configs(config.config_path + "/model_configs.json");
    load_quant_configs(config.config_path + "/quant_configs.json");

    // met
    MetricsConfig met_conf;
    met_conf.endpoint = "0.0.0.0:" + std::to_string(config.metrics_port);
    SPDLOG_INFO("Creating kvc2 metrics exporter on {}", met_conf.endpoint);
    met = std::make_shared<Metrics>(met_conf);

    if (config.gpu_only == false) {
      if (config.k_cache_on == false) {
        SPDLOG_ERROR("if k_cache_on is false, gpu_only must be true");
        assert(false);
      }
      root = config.path;
      tree = std::make_unique<PrefixTree>();
      disk_cache = std::make_unique<DiskCacheManager>(config);
      memory_pool = std::make_shared<PageAlignedMemoryPool>(config.memory_pool_size);
      cache_manager = std::unique_ptr<CacheEntryManager>(
          new CacheEntryManager(CacheEntryManagerConfig{.evict_count = config.evict_count, .kvc2_top = this}));
      cache_manager->pool = memory_pool;
      io_dealer = std::make_unique<async_store::IODealer>();
      io_dealer->start_io_thread().detach();
      tree->met = met;
      if (config.gpu_cache_config.has_value()) {
        gpu_cache = std::make_shared<GPUPageCache>(config.gpu_cache_config.value());
        cache_manager->gpu_cache = gpu_cache;
      }
      cache_manager->cpu_background_flush();
      gpu_cache->gpu_background_flush();
    } else {
      SPDLOG_CRITICAL("GPU ONLY MODE, NO PREFIX CACHE");
      gpu_cache = std::make_shared<GPUPageCache>(config.gpu_cache_config.value());
    }
  }
};
std::shared_ptr<KVC2Interface> create_kvc2(KVC2Config config) {
  NumTokenPerBlock = config.num_token_per_page;
  EvictCount = config.evict_count;
  // SPDLOG_WARN("Sizeof KVC2Config {} here", sizeof(KVC2Config));
  return std::make_shared<KVC2>(config);
}
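// Minimal usage sketch (field values below are illustrative; the KVC2Config members used are the
// ones serialized and logged above):
//   KVC2Config cfg;
//   cfg.path = "/data/kvc2";                  // disk cache root
//   cfg.config_path = "/data/kvc2_configs";   // model/quant config jsons
//   cfg.num_token_per_page = 256;
//   cfg.memory_pool_size = 16ull << 30;
//   cfg.evict_count = 16;
//   cfg.metrics_port = 8080;
//   auto kvc2 = kvc2::create_kvc2(cfg);
//   kvc2->load();
//   auto handle = kvc2->lookup_to_gpu(model_name, quant_type, tokens.data(), tokens.size(),
//                                     /*estimated_length=*/tokens.size() + max_new_tokens);
//   // ... run inference, then handle->append_tokens(...) as new tokens are generated ...
//   kvc2->save();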
DoubleCacheHandle::~DoubleCacheHandle() {
  if (kvc2_top->config.gpu_only) {
    kvc2_top->gpu_cache->gpu_only_free_cols(gpu_only_block_idx);
  } else {
    for_all_cache_block_entry([](std::shared_ptr<CacheBlockEntry>& block_entry) {
      block_entry->lock_guard();
      if (block_entry->with_key == false && block_entry->data != nullptr) {
        block_entry->free_on_cpu();
      }
      return true;
    });
  }
};
void DoubleCacheHandle::get_handles() {
  size_t new_count = 0, total_count = 0;
  auto get_info_handles = [this, &new_count, &total_count](
                              CacheInfo info, std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& layers) {
    auto total_block_count = div_up(estimated_length, NumTokenPerBlock);
    for (size_t l = 0; l < info.hidden_layer_count(); l++) {
      auto hashes = match.matched_hashes(info, l);
      layers[l].resize(total_block_count, nullptr);
      for (size_t i = 0; i < total_block_count; i++) {
        std::optional<CacheEntryManager::Key> key = std::nullopt;
        if (i < hashes.size())
          key = hashes[i];
        bool is_new;
        total_count += 1;
        layers[l][i] = this->kvc2_top->cache_manager->get(is_new, info.element_size(NumTokenPerBlock), key);
        if (is_new)
          new_count += 1;
        layers[l][i]->cache_info = info;
        layers[l][i]->layer = l;
      }
    }
  };
  if (kvc2_top->config.k_cache_on)
    get_info_handles(k_info(), k_cache_handles);
  if (kvc2_top->config.v_cache_on)
    get_info_handles(v_info(), v_cache_handles);
  SPDLOG_INFO("New Handles: {}/{}", new_count, total_count);
}
bool DoubleCacheHandle::to_gpu() {
  std::promise<bool> p;
  to_gpu_async([&p](bool ok) { p.set_value(ok); });
  return p.get_future().get();
}
void DoubleCacheHandle::to_gpu_async(std::function<void(bool)> call_back) {
  if (enable_alt) {
    assert(false);
    // size_t page_size = kvc2_top->config.num_token_per_page;
    // BlockLength count =
    //     div_up(TokenLength(std::ceil(match_by_blocks.partial_count() * page_size *
    //                                  kvc2_top->config.recompute_ratio)),
    //            page_size);
    // if (alloc_attached_blocks(count) == false) {
    //   SPDLOG_WARN("Cannot allocate attached GPU block");
    //   call_back(false);
    //   return;
    // } else {
    //   SPDLOG_INFO("Allocated {} attached GPU blocks", count);
    // }
  }
  // don't wait here
  if (any_match() == false) {
    SPDLOG_INFO("No match, No need to load to gpu");
    call_back(true);
    return;
  }
  auto gpu_io_helper = gpu_io(kvc2_top->gpu_cache.get(), 0, match_range_length(), IO_Read);
  gpu_io_helper->call_back = [call_back]() { call_back(true); };
  // Ok this is very stupid, but I have to do this for now
  std::thread([gpu_io_helper]() { gpu_io_helper->wait(); }).detach();
}
bool DoubleCacheHandle::alloc_attached_blocks(BlockLength count) {
  // attached_vertical_handles.resize(count);
  // for (size_t i = 0; i < count; i++) {
  //   attached_vertical_handles[i] = std::shared_ptr<DoubleVerticalBlocksHandle>(new DoubleVerticalBlocksHandle);
  //   attached_vertical_handles[i]->gpu_only = true;
  // }
  // return kvc2_top->gpu_cache->alloc_pages(attached_vertical_handles);
  return true;
}
std::vector<size_t> DoubleCacheHandle::get_gpu_attached_block_idx() {
  std::vector<size_t> re;
  // for (auto& h : attached_vertical_handles) {
  //   re.push_back(h->gpu_block_idx.value());
  // }
  return re;
}
void CacheBlockEntry::set_key(TokensHash key, std::shared_ptr<CacheBlockEntry> me) {
  assert(with_key == false);
  with_key = true;
  hash = key;
  // SPDLOG_DEBUG("Insert New Gen KVCache, key {}", key);
  std::lock_guard<std::mutex> manager_lg(manager->lock);
  if (manager->key_entry_map.contains(me->hash)) {
    SPDLOG_WARN("Duplicate key {}", me->hash);
  } else {
    manager->insert(me);
  }
}
std::vector<size_t> DoubleCacheHandle::get_gpu_block_idx() {
  if (kvc2_top->config.gpu_only) {
    return gpu_only_block_idx;
  } else {
    std::vector<size_t> re;
    for (auto& handle : k_cache_handles[0]) {
      re.push_back(handle->gpu_block_idx.value());
    }
    return re;
  }
}
/*
length : total length of tokens (including matched tokens)
1. update key, insert CacheBlock hash to lru
2. set dirty flag
3. update prefix tree, allocate new disk location
 */
void DoubleCacheHandle::append_tokens(Token* all_tokens, TokenLength length) {
  if (kvc2_top->config.gpu_only) {
    return;
  }
  TimeObserver time_observer(kvc2_top->met->append_tokens_time_ms);
  if (enable_alt) {
    SPDLOG_WARN("Append Tokens Not Implemented for Alternative Path");
    return;
  }
  if (length > estimated_length) {
    SPDLOG_ERROR("Length {} exceed estimated length {}", length, estimated_length);
    assert(false);
  }
  size_t match_length = matched_length();
  if (length < match_length) {
    SPDLOG_WARN("Length {} less than match length {}", length, match_length);
    assert(false);
  }
  if (length > ids.size()) {
    ids.insert(ids.end(), all_tokens + ids.size(), all_tokens + length);
  }
  static const auto num_token_per_page = kvc2_top->config.num_token_per_page;
  if (match_length % num_token_per_page != 0) {
    SPDLOG_ERROR("Match length {} is not multiple of num_token_per_page {}", match_length, num_token_per_page);
    assert(false);
  }
  if (match_length + num_token_per_page > length) {
    // SPDLOG_DEBUG("append_tokens No need to update");
    return;
  }
  SPDLOG_DEBUG("Append Tokens to {}", length);

  auto pre_match_length = match_length;
  // set gpu dirty flag
  size_t new_added_block_count = 0;
  while (match_length + num_token_per_page <= length) {
    match_length += num_token_per_page;
    new_added_block_count += 1;
  }

  // update prefix tree
  match.prefix = kvc2_top->tree->new_prefix_node(match.prefix, pre_match_length, ids.data(), match_length).get();
  match.match_length = match_length;

  // alloc disk location for new added prefix
  auto disk_cache = kvc2_top->disk_cache.get();
  Location k_loc{0, 0}, v_loc{0, 0};
  if (is_k_cache_on) {
    k_loc = disk_cache->allocate(k_info(), new_added_block_count);
    k_seg_locs.add_location(match.prefix->start_length / NumTokenPerBlock, k_loc);
    match.prefix->update_location(k_info(), k_loc);
  }
  if (is_v_cache_on) {
    v_loc = disk_cache->allocate(v_info(), new_added_block_count);
    v_seg_locs.add_location(match.prefix->start_length / NumTokenPerBlock, v_loc);
    match.prefix->update_location(v_info(), v_loc);
  }

  // update cache handles
  auto update_cache_handles = [this, pre_match_length, length](
                                  CacheInfo info, std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& layers,
                                  Location loc) {
    TokensHasher hasher;
    for (Layer l = 0; l < info.hidden_layer_count(); l++) {
      hasher.reset(info.hash_value());
      hasher.update_raw(&l, sizeof(l));
      hasher.update(ids.data(), pre_match_length);
      auto page_count_start = pre_match_length / num_token_per_page;
      for (size_t i = pre_match_length; i + num_token_per_page <= length; i += num_token_per_page) {
        auto page_count = i / num_token_per_page;
        hasher.update(ids.data() + i, num_token_per_page);
        auto block = layers[l][page_count];
        {
          auto lg = block->lock_guard();
          block->idx = loc.start_idx + page_count - page_count_start;
          block->set_key(hasher.get(), block);
          if (l == 0 && info.is_key_cache) {
            block->gpu_cc.tc.set_has_data();
          }
          block->gpu_cc.dirty.store(true);
        }
      }
    }
  };
  if (is_k_cache_on) {
    update_cache_handles(k_info(), k_cache_handles, k_loc);
  }
  if (is_v_cache_on) {
    update_cache_handles(v_info(), v_cache_handles, v_loc);
  }
  // kvc2_top->block_cache->debug();
}
void CacheBlockEntry::flush_back_async(IO_Helper<CacheBlockEntry>& helper,
                                       std::vector<std::atomic_bool*>& dirty_flags) {
  auto kvc2_top = manager->config.kvc2_top;
  auto allocator = kvc2_top->disk_cache->get_allocator(cache_info);
  // if (layer == 0) {
  //   SPDLOG_DEBUG("Flush {} to {}", fmt::ptr(this), idx);
  // }
  io_with(kvc2_top->io_dealer.get(), helper, allocator->get_store(layer), layer, idx, IOOption::IO_Write);
  dirty_flags.push_back(&cpu_cc.dirty);
}
void CacheEntryManager::cpu_background_flush() {
  if (background_flush_back.get() == nullptr) {
    SPDLOG_INFO("Starting CPU Background flush");
    background_flush_back = std::unique_ptr<periodic::PeriodicTask>(new periodic::PeriodicTask([this]() {
      // Timer t("CPU Flush");
      std::vector<std::atomic_bool*> dirty_cpus;
      std::vector<std::unique_lock<CacheBlockEntry::MutexT>> entry_uls;
      IO_Helper<CacheBlockEntry> io_helper(nullptr, [&dirty_cpus]() {
        for (auto& flag : dirty_cpus) {
          flag->store(false);
        }
        if (dirty_cpus.size() > 0)
          SPDLOG_DEBUG("{} dirty CPU pages flushed.", dirty_cpus.size());
      });
      {
        std::lock_guard<std::mutex> ul(lock);
        for (auto& e : usage_list) {
          auto ul = e->try_lock();
          if (ul.owns_lock()) {
            if (e->cpu_cc.dirty.load()) {
              entry_uls.push_back(std::move(ul));
              e->flush_back_async(io_helper, dirty_cpus);
            }
          }
          // if (dirty_cpus.size() == 100) {
          //   break;
          // }
        }
      }
      io_helper.finish_add_taks();
      io_helper.wait();
    }));
  } else {
    SPDLOG_ERROR("Flush Thread Already Started");
  }
}
void GPUPageCache::gpu_background_flush() {
  if (background_flush_back.get() == nullptr) {
    SPDLOG_INFO("Starting GPU Background flush");
    background_flush_back = std::unique_ptr<periodic::PeriodicTask>(new periodic::PeriodicTask([this]() {
      // Timer t("GPU Flush");
      std::vector<size_t> dirty_cols;
      std::vector<CacheBlockEntry*> entries;
      std::vector<std::unique_lock<CacheBlockEntry::MutexT>> uls;
      BatchPromise promise(config.gpu_devices_id.size());
      auto reqs = basic_request(cudaMemcpyDeviceToHost, [&promise]() { promise.set(); });
      for (size_t i = 0; i < config.total_kvcache_pages; i++) {
        std::lock_guard<std::mutex> lg(this->lock);
        auto col_uls = try_lock_col(i);
        if (col_uls.empty())
          continue;
        for (size_t l = 0; l < config.layer_count; l++) {
          if (config.k_cache_on &&
              (occupations[l][i]->gpu_cc.dirty.load() == false || occupations[l][i]->cpu_cc.dirty.load()))
            goto next_gpu_page;
          if (config.v_cache_on &&
              (v_occupations[l][i]->gpu_cc.dirty.load() == false || v_occupations[l][i]->cpu_cc.dirty.load()))
            goto next_gpu_page;
        }
        dirty_cols.push_back(i);
        for (size_t l = 0; l < config.layer_count; l++) {
          // occupations[l][i]->alloc_on_cpu_no_lock();
          if (config.k_cache_on)
            entries.push_back(occupations[l][i].get());
          if (config.v_cache_on)
            entries.push_back(v_occupations[l][i].get());
        }
        append_col_to_request(reqs, occupations, v_occupations, i);
        for (auto& ul : col_uls) {
          uls.push_back(std::move(ul));
        }
      next_gpu_page:
        continue;
      }
      submit_requests(reqs);
      promise.get_shared_fut().wait();
      if (dirty_cols.empty() == false)
        SPDLOG_INFO("GPU Flushed Back {} cols", dirty_cols.size());
      for (auto& entry : entries) {
        entry->cpu_cc.tc.set_has_data();
        // we have locks here
        entry->cpu_cc.dirty.store(true);
      }
      for (auto& col : dirty_cols) {
        for (size_t l = 0; l < config.layer_count; l++) {
          if (config.k_cache_on)
            occupations[l][col]->gpu_cc.dirty.store(false);
          if (config.v_cache_on)
            v_occupations[l][col]->gpu_cc.dirty.store(false);
        }
      }
      if (dirty_cols.empty() == false) {
        debug();
      }
    }));
  } else {
    SPDLOG_ERROR("Flush Thread Already Started");
  }
}
}  // namespace kvc2
csrc/balance_serve/kvc2/src/utils/all.hpp
0 → 100644
#pragma once
#include "easy_format.hpp"
#include "timer.hpp"
\ No newline at end of file
csrc/balance_serve/kvc2/src/utils/arithmetic.hpp
0 → 100644
#include <memory>
#include <type_traits>
template <typename T, typename U>
T div_up(T x, U by) {
  static_assert(std::is_integral_v<T>);
  static_assert(std::is_integral_v<U>);
  return (x + by - 1) / by;
}
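// Example: div_up(10, 4) == 3 and div_up(8, 4) == 2; both arguments must be integral types
// (enforced by the static_asserts above). kvc2 uses this to turn token counts into block/page
// counts throughout prefix.cpp.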
template <typename T>
T* offset_by_bytes(T* t, size_t n) {
  return reinterpret_cast<T*>(reinterpret_cast<size_t>(t) + n);
}
csrc/balance_serve/kvc2/src/utils/easy_format.hpp
0 → 100644
#ifndef __EASY_FORMAT_HPP_
#define __EASY_FORMAT_HPP_
#include <array>
#include <iomanip>
#include <sstream>
#include <string>
#include <vector>
template <typename T>
inline std::string format_vector(const std::vector<T>& v) {
  std::ostringstream oss;
  if (v.empty())
    return "[]";
  for (size_t i = 0; i < v.size(); ++i) {
    oss << v[i];
    if (i < v.size() - 1)
      oss << ", ";  // comma separated
  }
  return oss.str();
}
inline std::array<std::string, 7> units = {"", "K", "M", "G", "T", "P", "E"};

inline std::string readable_number(size_t size) {
  size_t unit_index = 0;
  double readable_size = size;
  while (readable_size >= 1000 && unit_index < units.size() - 1) {
    readable_size /= 1000;
    unit_index++;
  }
  std::ostringstream ss;
  ss << std::fixed << std::setprecision(2) << readable_size;
  std::string str = ss.str();
  return str + "" + units[unit_index];
}
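// Example: readable_number(123456789) returns "123.46M". Note the units are decimal
// (1000-based), not binary (1024-based).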
#endif
\ No newline at end of file