Merge branch 'kvcache-ai:main' into main

877aec85 · Yuhao Tsui · GitHub · 84164f58 · 9037bf30 · 877aec85
Unverified Commit 877aec85 authored Apr 09, 2025 by Yuhao Tsui Committed by GitHub Apr 09, 2025
20 changed files
--- a/.github/workflows/package_wheel_release.yml
+++ b/.github/workflows/package_wheel_release.yml
@@ -163,6 +163,8 @@ jobs:

      - name: build for cuda
        if: matrix.cuda != ''
+        env:
+          USE_BALANCE_SERVE: "1"
        run: |
          git submodule init
          git submodule update

--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,16 @@
 [submodule "third_party/pybind11"]
 	path = third_party/pybind11
 	url = https://github.com/pybind/pybind11.git
+[submodule "third_party/spdlog"]
+	path = third_party/spdlog
+	url = https://github.com/gabime/spdlog.git
+[submodule "third_party/custom_flashinfer"]
+	path = third_party/custom_flashinfer
+	url = https://github.com/kvcache-ai/custom_flashinfer.git
+	branch = fix-precision-mla-merge-main
+[submodule "third_party/xxHash"]
+	path = third_party/xxHash
+	url = https://github.com/Cyan4973/xxHash.git
+[submodule "third_party/prometheus-cpp"]
+	path = third_party/prometheus-cpp
+	url = https://github.com/jupp0r/prometheus-cpp
--- a/Dockerfile
+++ b/Dockerfile
-FROM node:20.16.0 as web_compile
-WORKDIR /home
-RUN <<EOF
-git clone https://github.com/kvcache-ai/ktransformers.git &&
-cd ktransformers/ktransformers/website/ &&
-npm install @vue/cli &&
-npm run build &&
-rm -rf node_modules
-EOF
-
+FROM pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel as compile_server


-FROM pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel as compile_server
 ARG CPU_INSTRUCT=NATIVE
+
+# 设置工作目录和 CUDA 路径
 WORKDIR /workspace
-ENV CUDA_HOME /usr/local/cuda
-COPY --from=web_compile /home/ktransformers /workspace/ktransformers
-RUN <<EOF
-apt update -y &&  apt install -y  --no-install-recommends \
+ENV CUDA_HOME=/usr/local/cuda
+
+
+
+# 安装依赖
+RUN apt update -y
+RUN apt install -y --no-install-recommends \
+    libtbb-dev \
+    libssl-dev \
+    libcurl4-openssl-dev \
+    libaio1 \
+    libaio-dev \
+    libfmt-dev \
+    libgflags-dev \
+    zlib1g-dev \
+    patchelf \
    git \
    wget \
    vim \
    gcc \
    g++ \
-    cmake && 
-rm -rf /var/lib/apt/lists/* &&
-cd ktransformers &&
-git submodule init &&
-git submodule update &&
-pip install --upgrade pip &&
-pip install ninja pyproject numpy cpufeature &&
-pip install flash-attn &&
-CPU_INSTRUCT=${CPU_INSTRUCT}  KTRANSFORMERS_FORCE_BUILD=TRUE TORCH_CUDA_ARCH_LIST="8.0;8.6;8.7;8.9;9.0+PTX" pip install . --no-build-isolation --verbose &&
-pip cache purge &&
-cp /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/lib/
-EOF
-
-ENTRYPOINT ["tail", "-f", "/dev/null"]
\ No newline at end of file
+    cmake
+# 拷贝代码
+RUN git clone https://github.com/kvcache-ai/ktransformers.git 
+# 清理 apt 缓存
+RUN rm -rf /var/lib/apt/lists/*
+
+# 进入项目目录
+WORKDIR /workspace/ktransformers
+# 初始化子模块
+RUN git submodule update --init --recursive
+
+# 升级 pip
+RUN pip install --upgrade pip
+
+# 安装构建依赖
+RUN pip install ninja pyproject numpy cpufeature aiohttp zmq openai
+
+# 安装 flash-attn（提前装可以避免后续某些编译依赖出错）
+RUN pip install flash-attn
+
+# 安装 ktransformers 本体（含编译）
+RUN CPU_INSTRUCT=${CPU_INSTRUCT} \
+    USE_BALANCE_SERVE=1 \
+    KTRANSFORMERS_FORCE_BUILD=TRUE \
+    TORCH_CUDA_ARCH_LIST="8.0;8.6;8.7;8.9;9.0+PTX" \
+    pip install . --no-build-isolation --verbose
+
+RUN pip install third_party/custom_flashinfer/
+# 清理 pip 缓存
+RUN pip cache purge
+
+# 拷贝 C++ 运行时库
+RUN cp /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/lib/
+
+# 保持容器运行（调试用）
+ENTRYPOINT ["tail", "-f", "/dev/null"]
--- a/MANIFEST.in
+++ b/MANIFEST.in
 graft third_party
 graft ktransformers
 graft local_chat.py
+graft csrc
 include LICENSE README.md
 prune ktransformers/website
 prune ktransformers/logs
@@ -9,3 +10,4 @@ prune third_party/llama.cpp/models
 graft ktransformers/website/dist
 global-exclude __pycache__
 include KTransformersOps.*.so
+include cpuinfer_ext.*.so
--- a/Makefile
+++ b/Makefile
@@ -29,4 +29,4 @@ clean:
 install_numa:
 	USE_NUMA=1 make dev_install
 install_no_numa:
-	env -u USE_NUMA make dev_install
+	env -u USE_NUMA make dev_install
\ No newline at end of file
--- a/README.md
+++ b/README.md
@@ -23,17 +23,23 @@ Our vision for KTransformers is to serve as a flexible platform for experimentin

 <h2 id="Updates">🔥 Updates</h2>

+* **Apr 2, 2025**: Support Multi-concurrency. ([Tutorial](./doc/en/balance-serve.md)).
+
+https://github.com/user-attachments/assets/faa3bda2-928b-45a7-b44f-21e12ec84b8a
+
 * **Mar 15, 2025**: Support ROCm on AMD GPU ([Tutorial](./doc/en/ROCm.md)).
 * **Mar 5, 2025**: Support unsloth 1.58/2.51 bits weights and [IQ1_S/FP8 hybrid](./doc/en/fp8_kernel.md) weights. Support 139K [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022--v023-longer-context--fp8-kernel) for DeepSeek-V3 and R1 in 24GB VRAM.
 * **Feb 25, 2025**: Support [FP8 GPU kernel](./doc/en/fp8_kernel.md) for DeepSeek-V3 and R1; [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context).
 * **Feb 15, 2025**: Longer Context (from 4K to 8K for 24GB VRAM) & Slightly Faster Speed （+15%, up to 16 Tokens/s), update [docs](./doc/en/DeepseekR1_V3_tutorial.md) and [online books](https://kvcache-ai.github.io/ktransformers/).
 * **Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup. For detailed show case and reproduction tutorial, see [here](./doc/en/DeepseekR1_V3_tutorial.md).
 * **Aug 28, 2024**: Decrease DeepseekV2's required VRAM from 21G to 11G.
-* **Aug 15, 2024**: Update detailed [tutorial](doc/en/injection_tutorial.md) for injection and multi-GPU. 
-* **Aug 14, 2024**: Support llamfile as linear backend. 
+* **Aug 15, 2024**: Update detailed [tutorial](doc/en/injection_tutorial.md) for injection and multi-GPU.
+* **Aug 14, 2024**: Support llamafile as linear backend.
 * **Aug 12, 2024**: Support multiple GPU; Support new model: mixtral 8\*7B  and 8\*22B; Support q2k, q3k, q5k dequant on gpu.
 * **Aug 9, 2024**: Support windows native.
+
 <!-- * **Aug 28, 2024**: Support 1M context under the InternLM2.5-7B-Chat-1M model, utilizing 24GB of VRAM and 150GB of DRAM. The detailed tutorial is [here](./doc/en/long_context_tutorial.md). -->
+
 <h2 id="show-cases">🌟 Show Cases</h2>

 <div>
@@ -45,16 +51,16 @@ https://github.com/user-attachments/assets/ebd70bfa-b2c1-4abb-ae3b-296ed38aa285
 </p>

 - **[NEW!!!] Local 671B DeepSeek-Coder-V3/R1:** Running its Q4_K_M version using only 14GB VRAM and 382GB DRAM([Tutorial](./doc/en/DeepseekR1_V3_tutorial.md)).
-	- Prefill Speed (tokens/s): 
- 		- KTransformers: 54.21 (32 cores) → 74.362 (dual-socket, 2×32 cores) → 255.26 (optimized AMX-based MoE kernel, V0.3 only) → 286.55 (selectively using 6 experts, V0.3 only)  
- 		- Compared to 10.31 tokens/s in llama.cpp with 2×32 cores, achieving up to **27.79× speedup**.  
- 	- Decode Speed (tokens/s):  
- 		- KTransformers: 8.73 (32 cores) → 11.26 (dual-socket, 2×32 cores) → 13.69 (selectively using 6 experts, V0.3 only)  
- 		- Compared to 4.51 tokens/s in llama.cpp with 2×32 cores, achieving up to **3.03× speedup**.  
-	- Upcoming Open Source Release:
-		- AMX optimizations and selective expert activation will be open-sourced in V0.3.  
-		- Currently available only in preview binary distribution, which can be downloaded [here](./doc/en/DeepseekR1_V3_tutorial.md).  

+  - Prefill Speed (tokens/s):
+    - KTransformers: 54.21 (32 cores) → 74.362 (dual-socket, 2×32 cores) → 255.26 (optimized AMX-based MoE kernel, V0.3 only) → 286.55 (selectively using 6 experts, V0.3 only)
+    - Compared to 10.31 tokens/s in llama.cpp with 2×32 cores, achieving up to **27.79× speedup**.
+  - Decode Speed (tokens/s):
+    - KTransformers: 8.73 (32 cores) → 11.26 (dual-socket, 2×32 cores) → 13.69 (selectively using 6 experts, V0.3 only)
+    - Compared to 4.51 tokens/s in llama.cpp with 2×32 cores, achieving up to **3.03× speedup**.
+  - Upcoming Open Source Release:
+    - AMX optimizations and selective expert activation will be open-sourced in V0.3.
+    - Currently available only in preview binary distribution, which can be downloaded [here](./doc/en/DeepseekR1_V3_tutorial.md).
 - **Local 236B DeepSeek-Coder-V2:** Running its Q4_K_M version using only 21GB VRAM and 136GB DRAM, attainable on a local desktop machine, which scores even better than GPT4-0613 in [BigCodeBench](https://huggingface.co/blog/leaderboard-bigcodebench).

 <p align="center">
@@ -96,19 +102,16 @@ https://github.com/user-attachments/assets/a865e5e4-bca3-401e-94b8-af3c080e6c12
 * **Flexible Sparse Attention Framework**: Offers a flexible block sparse attention framework for CPU offloaded decoding. Compatible with SnapKV, Quest, and InfLLm. Further information is available [here](./doc/en/long_context_introduction.md).
 -->

-
 <strong>More advanced features are coming soon, so stay tuned!</strong>

 <h2 id="quick-start">🚀 Quick Start</h2>

-
 Getting started with KTransformers is simple! Follow the steps below to set up and start using it.

 ### 📥 Installation

 To install KTransformers, follow the official [Installation Guide](https://kvcache-ai.github.io/ktransformers/en/install.html).

-
 <h2 id="tutorial">📃 Brief Injection Tutorial</h2>
 At the heart of KTransformers is a user-friendly, template-based injection framework. 
 This allows researchers to easily replace original torch modules with optimized variants. It also simplifies the process of combining multiple optimizations, allowing the exploration of their synergistic effects.
@@ -167,7 +170,6 @@ The development of KTransformers is based on the flexible and versatile framewor

 KTransformers is actively maintained and developed by contributors from the <a href="https://madsys.cs.tsinghua.edu.cn/">MADSys group</a> at Tsinghua University and members from <a href="http://approaching.ai/">Approaching.AI</a>. We welcome new contributors to join us in making KTransformers faster and easier to use.

-
 <h2 id="ack">Discussion</h2>

 If you have any questions, feel free to open an issue. Alternatively, you can join our WeChat group for further discussion. QR Code: [WeChat Group](WeChatGroup.png)

--- a/csrc/balance_serve/CMakeLists.txt
+++ b/csrc/balance_serve/CMakeLists.txt
+
+cmake_minimum_required(VERSION 3.21)
+find_program(GCC_COMPILER NAMES g++-13 g++-12 g++-11 REQUIRED)
+set(CMAKE_CXX_COMPILER ${GCC_COMPILER})
+
+# 显示选定的编译器
+message(STATUS "Using compiler: ${CMAKE_CXX_COMPILER}")
+
+
+project(balance_serve VERSION 0.1.0)
+
+set(CMAKE_CXX_STANDARD 20)
+# set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -g -fPIC")
+# set(CMAKE_BUILD_TYPE "Debug")
+set(CMAKE_CXX_FLAGS "-O3 -march=native -Wall -Wextra -fPIC")
+add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI})
+set(CMAKE_BUILD_TYPE "Release")
+
+file(GLOB_RECURSE FMT_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.hpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.h")
+
+add_custom_target(
+    format
+    COMMAND clang-format
+    -i
+    -style=file
+    ${FMT_SOURCES}
+    COMMENT "Running clang-format on all source files"
+)
+
+set(BUILD_SHARED_LIBS ON)
+set(ENABLE_PUSH OFF)
+set(ENABLE_COMPRESSION OFF)
+
+# set(CMAKE_BUILD_TYPE "Release")
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+set(THIRD_PARTY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party)
+set(THIRD_PARTY_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/third_party)
+add_subdirectory(${THIRD_PARTY_DIR}/prometheus-cpp ${THIRD_PARTY_BUILD_DIR}/prometheus-cpp EXCLUDE_FROM_ALL)
+add_subdirectory(${THIRD_PARTY_DIR}/xxHash/cmake_unofficial ${THIRD_PARTY_BUILD_DIR}/xxHash EXCLUDE_FROM_ALL)
+
+# add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third_party/prometheus-cpp ${CMAKE_CURRENT_BINARY_DIR}/third_party/prometheus-cpp)
+set(SPDLOG_DIR ${THIRD_PARTY_DIR}/spdlog)
+set(FMT_DIR ${THIRD_PARTY_DIR}/fmt)
+
+set(KVC2_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/kvc2/src)
+
+include_directories(${THIRD_PARTY_DIR})
+
+add_subdirectory(${THIRD_PARTY_DIR}/pybind11 ${THIRD_PARTY_BUILD_DIR}/pybind11)
+
+# Ask the python3 on PATH where the torch package lives; the Torch CMake
+# package and libtorch_python are then looked up relative to that directory.
+execute_process(
+    COMMAND python3 -c "import torch; print(torch.__path__[0])"
+    OUTPUT_VARIABLE TORCH_INSTALL_PREFIX
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    RESULT_VARIABLE TORCH_QUERY_RESULT
+)
+
+# Fail here with a clear message instead of a confusing find_package() error
+# further down when torch cannot be imported.
+if(NOT TORCH_QUERY_RESULT EQUAL 0 OR "${TORCH_INSTALL_PREFIX}" STREQUAL "")
+    message(FATAL_ERROR "python3 could not import torch (status: ${TORCH_QUERY_RESULT}); cannot locate the Torch CMake package")
+endif()
+
+message(STATUS "Found PyTorch at: ${TORCH_INSTALL_PREFIX}")
+
+# set(TORCH_INSTALL_PREFIX "/home/xwy/.conda/envs/kvc/lib/python3.12/site-packages/torch")
+# PATHS (not PATH) is the find_library() keyword for extra search directories.
+find_library(TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib")
+find_package(Torch REQUIRED PATHS "${TORCH_INSTALL_PREFIX}/share/cmake/Torch" NO_DEFAULT_PATH)
+
+add_subdirectory(kvc2)
+add_subdirectory(sched)
+
+# add_subdirectory(test)
--- a/csrc/balance_serve/kvc2/.clang-format
+++ b/csrc/balance_serve/kvc2/.clang-format
+Language:        Cpp
+# 格式化风格，可以是LLVM, Google, Chromium, Mozilla, WebKit等，或者自定义
+BasedOnStyle:  Google
+
+# 缩进设置
+IndentWidth:        2
+TabWidth:           2
+UseTab:             Never
+
+# 换行相关设置
+BreakBeforeBraces: Attach
+AllowShortIfStatementsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Inline
+AllowShortLoopsOnASingleLine: false
+
+# 类与结构体
+DerivePointerAlignment: false
+PointerAlignment: Left
+
+# 包含文件的排序和分组
+IncludeBlocks:   Preserve
+SortIncludes:    true
+
+# 控制最大行宽
+ColumnLimit:     120
--- a/csrc/balance_serve/kvc2/CMakeLists.txt
+++ b/csrc/balance_serve/kvc2/CMakeLists.txt
+cmake_minimum_required(VERSION 3.21)
+
+find_program(GCC_COMPILER NAMES g++-13 g++-12 g++-11 REQUIRED)
+set(CMAKE_CXX_COMPILER ${GCC_COMPILER})
+
+project(kvcache-manager VERSION 0.1.0)
+
+set(CMAKE_CXX_STANDARD 20)
+
+# set(CMAKE_CXX_FLAGS "-fPIC -O3 -ffast-math -march=native -Wall -Wextra -Wpedantic  -fvisibility=hidden -s")
+# set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -Wpedantic -g -fsanitize=address")
+# set(CMAKE_CXX_FLAGS "-march=native -Wall -Wextra -Wpedantic -g")
+# set(CMAKE_CXX_FLAGS "-fPIC -O3 -ffast-math -march=native -Wall -Wextra -g")
+set(CMAKE_BUILD_TYPE "Release")
+add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI})
+# set(CMAKE_BUILD_TYPE "Debug")
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(BUILD_TEST OFF)
+set(BUILD_PYTHON_EXT OFF)
+
+# set(USE_IO_URING ON)
+if(USE_IO_URING)
+    message(STATUS "Using io_uring")
+    add_compile_definitions(USE_IO_URING)
+else()
+    message(STATUS "Using aio")
+endif()
+
+file(GLOB_RECURSE ALL_SOURCE_FILES src/*.cpp src/*.h test/*.cpp test/*.h test/*.hpp)
+
+# 添加一个自定义目标来格式化所有代码
+if(NOT TARGET format)
+    add_custom_target(
+        format
+        COMMAND clang-format
+        -i
+        -style=file
+        ${ALL_SOURCE_FILES}
+        COMMENT "Running clang-format on all source files"
+    )
+endif()
+
+# Ask the python3 on PATH where the torch package lives; the Torch CMake
+# package and libtorch_python are then looked up relative to that directory.
+execute_process(
+    COMMAND python3 -c "import torch; print(torch.__path__[0])"
+    OUTPUT_VARIABLE TORCH_INSTALL_PREFIX
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    RESULT_VARIABLE TORCH_QUERY_RESULT
+)
+
+# Fail here with a clear message instead of a confusing find_package() error
+# further down when torch cannot be imported.
+if(NOT TORCH_QUERY_RESULT EQUAL 0 OR "${TORCH_INSTALL_PREFIX}" STREQUAL "")
+    message(FATAL_ERROR "python3 could not import torch (status: ${TORCH_QUERY_RESULT}); cannot locate the Torch CMake package")
+endif()
+
+message(STATUS "Found PyTorch at: ${TORCH_INSTALL_PREFIX}")
+
+# set(TORCH_INSTALL_PREFIX "/home/xwy/.conda/envs/kvc/lib/python3.12/site-packages/torch")
+# PATHS (not PATH) is the find_library() keyword for extra search directories.
+find_library(TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib")
+find_package(Torch REQUIRED PATHS "${TORCH_INSTALL_PREFIX}/share/cmake/Torch" NO_DEFAULT_PATH)
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party)
+
+find_package(TBB REQUIRED)
+find_package(CUDA REQUIRED)
+
+# find_package(prometheus-cpp CONFIG REQUIRED)
+if(NOT TARGET prometheus-cpp::pull)
+    message(FATAL_ERROR "prometheus-cpp::pull not found")
+else()
+    message(STATUS "prometheus Found!")
+endif()
+
+if(CUDA_FOUND)
+    message(STATUS "CUDA Found!")
+    message(STATUS "CUDA Version: ${CUDA_VERSION_STRING}")
+    message(STATUS "CUDA Toolkit Root: ${CUDA_TOOLKIT_ROOT_DIR}")
+else()
+    message(FATAL_ERROR "CUDA not found!")
+endif()
+
+add_subdirectory(src)
+
+if(BUILD_TEST)
+    add_subdirectory(test)
+endif()
+
+message(STATUS "BUILD_PYTHON_EXT: ${BUILD_PYTHON_EXT}")
+
+if(BUILD_PYTHON_EXT)
+    if(NOT TARGET pybind11::pybind11)
+        add_subdirectory(${THIRD_PARTY_DIR}/pybind11 ${THIRD_PARTY_BUILD_DIR}/pybind11)
+    endif()
+
+    pybind11_add_module(kvc2_ext src/bind.cpp)
+
+    # EXAMPLE_VERSION_INFO is defined by setup.py and passed into the C++ code as a
+    # define (VERSION_INFO) here.
+    target_compile_definitions(kvc2_ext PRIVATE VERSION_INFO=${EXAMPLE_VERSION_INFO})
+    message(STATUS "CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}")
+    target_include_directories(kvc2_ext PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/spdlog/include)
+
+    target_link_libraries(kvc2_ext PUBLIC kvc2 async_store)
+
+    install(TARGETS kvc2_ext LIBRARY
+        DESTINATION ${CMAKE_BINARY_DIR}/output)
+    install(FILES src/kvc2_utils.py
+        DESTINATION ${CMAKE_BINARY_DIR}/output)
+endif()
+
--- a/csrc/balance_serve/kvc2/README.md
+++ b/csrc/balance_serve/kvc2/README.md
+# KVC2
+
+# Build
+运行以下命令编译kvc2，注意可能需要 sudo 权限安装一些依赖
+```shell
+git clone https://github.com/kvcache-ai/kvc2
+cd kvc2
+./install_deps.sh
+mkdir build
+cd build
+cmake ..
+make -j && make install
+```
+编译完成后会生成`build/output`，包含`kvc2_ext.cpython-312-x86_64-linux-gnu.so`和`kvc2_utils.py`方便调用。
+
+<!-- # Test
+运行以下命令测试kvc2，需要指定一个 disk path 作为测试目录。
+```shell
+./unit_test.sh ${DISK_PATH}
+```
+或者运行 python 的测试文件
+```shell
+python test/pytest_mem_read.py 
+``` -->
+
+# Troubleshooting
+在 Python 环境运行时，可能需要在 conda 中安装相关的依赖。
+```shell
+conda install -c conda-forge gcc_linux-64 gxx_linux-64
+```
+
+也可以尝试设置一下环境变量，然后再运行。
+```shell
+export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
+export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libffi.so.7 
+```
+
+
--- a/csrc/balance_serve/kvc2/config/model_configs.json
+++ b/csrc/balance_serve/kvc2/config/model_configs.json
+{
+    "DeepSeek-Coder-V2-Instruct": {
+        "hidden_size": 5120,
+        "intermediate_size": 12288,
+        "max_position_embeddings": 163840,
+        "model_type": "deepseek_v2",
+        "num_attention_heads": 128,
+        "num_hidden_layers": 60,
+        "num_key_value_heads": 128,
+        "vocab_size": 102400
+    },
+    "LLaMA-2-7B-32K": {
+        "hidden_size": 4096,
+        "intermediate_size": 11008,
+        "max_position_embeddings": 32768,
+        "model_type": "llama",
+        "num_attention_heads": 32,
+        "num_hidden_layers": 32,
+        "num_key_value_heads": 32,
+        "vocab_size": 32000
+    },
+    "Qwen2.5-7B-Instruct": {
+        "hidden_size": 3584,
+        "intermediate_size": 18944,
+        "max_position_embeddings": 32768,
+        "model_type": "qwen2",
+        "num_attention_heads": 28,
+        "num_hidden_layers": 28,
+        "num_key_value_heads": 4,
+        "vocab_size": 152064
+    },
+    "qwen2-72b-instruct": {
+        "hidden_size": 8192,
+        "intermediate_size": 29568,
+        "max_position_embeddings": 32768,
+        "model_type": "qwen2",
+        "num_attention_heads": 64,
+        "num_hidden_layers": 80,
+        "num_key_value_heads": 8,
+        "vocab_size": 152064
+    }
+}
\ No newline at end of file
--- a/csrc/balance_serve/kvc2/config/quant_configs.json
+++ b/csrc/balance_serve/kvc2/config/quant_configs.json
+{
+    "BF16": {
+        "block_element_count": 1,
+        "block_element_size": 2,
+        "bytes_per_element": 2.0,
+        "can_be_used_as_vector": true,
+        "has_min": false,
+        "has_scale": false,
+        "name": "BF16",
+        "reference": "",
+        "type_of_dot_vector": "BF16"
+    },
+    "FP16": {
+        "block_element_count": 1,
+        "block_element_size": 2,
+        "bytes_per_element": 2.0,
+        "can_be_used_as_vector": true,
+        "has_min": false,
+        "has_scale": false,
+        "name": "FP16",
+        "reference": "",
+        "type_of_dot_vector": "FP16"
+    },
+    "FP32": {
+        "block_element_count": 1,
+        "block_element_size": 4,
+        "bytes_per_element": 4.0,
+        "can_be_used_as_vector": true,
+        "has_min": false,
+        "has_scale": false,
+        "name": "FP32",
+        "reference": "",
+        "type_of_dot_vector": "FP32"
+    },
+    "Q4_0": {
+        "block_element_count": 32,
+        "block_element_size": 18,
+        "bytes_per_element": 0.5625,
+        "can_be_used_as_vector": false,
+        "has_min": false,
+        "has_scale": true,
+        "name": "Q4_0",
+        "reference": "https://huggingface.co/docs/hub/gguf",
+        "type_of_dot_vector": "Q8_0"
+    },
+    "Q8_0": {
+        "block_element_count": 32,
+        "block_element_size": 34,
+        "bytes_per_element": 1.0625,
+        "can_be_used_as_vector": true,
+        "has_min": false,
+        "has_scale": true,
+        "name": "Q8_0",
+        "reference": "https://huggingface.co/docs/hub/gguf",
+        "type_of_dot_vector": "Q8_0"
+    }
+}
\ No newline at end of file
--- a/csrc/balance_serve/kvc2/export_envs_before_run.sh
+++ b/csrc/balance_serve/kvc2/export_envs_before_run.sh
+export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
+export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libffi.so.7 
--- a/csrc/balance_serve/kvc2/install_deps.sh
+++ b/csrc/balance_serve/kvc2/install_deps.sh
+#!/bin/bash
+
+cd "${0%/*}"
+git submodule update --init --recursive
+
+sudo apt update
+sudo apt install libtbb-dev
+sudo apt install libcurl4-openssl-dev
+sudo apt install libaio-dev
+
+cd third_party/xxHash/
+make -j
+sudo make install
+cd ../..
+
--- a/csrc/balance_serve/kvc2/mkfs.sh
+++ b/csrc/balance_serve/kvc2/mkfs.sh
+sudo umount /mnt/xwy 
+sudo mkfs.xfs /dev/nvme0n1 -f
+sudo mount /dev/nvme0n1 /mnt/xwy
+sudo chown -R xwy /mnt/xwy/
\ No newline at end of file
--- a/csrc/balance_serve/kvc2/src/CMakeLists.txt
+++ b/csrc/balance_serve/kvc2/src/CMakeLists.txt
+include_directories(${THIRD_PARTY_DIR}/asyncio/include)
+
+add_library(kvc2_metrics STATIC metrics.cpp)
+target_link_libraries(kvc2_metrics PUBLIC prometheus-cpp::pull)
+
+add_library(page_aligned_memory_pool page_aligned_memory_pool.cpp)
+target_include_directories(page_aligned_memory_pool PRIVATE ${THIRD_PARTY_DIR}/spdlog/include)
+
+# Adds the prometheus-cpp (generated + source headers for `core` and `pull`)
+# and spdlog include directories to TARGET_NAME as PRIVATE includes.
+function(add_third_party_includes TARGET_NAME)
+    set(_third_party_includes "")
+    # Build-tree headers first, then source-tree headers, each for core and pull.
+    foreach(_prometheus_root IN ITEMS "${THIRD_PARTY_BUILD_DIR}" "${THIRD_PARTY_DIR}")
+        foreach(_component IN ITEMS core pull)
+            list(APPEND _third_party_includes ${_prometheus_root}/prometheus-cpp/${_component}/include)
+        endforeach()
+    endforeach()
+    list(APPEND _third_party_includes ${THIRD_PARTY_DIR}/spdlog/include)
+
+    target_include_directories(${TARGET_NAME} PRIVATE ${_third_party_includes})
+endfunction()
+
+
+add_library(cache_entry cache_entry.cpp)
+add_third_party_includes(cache_entry)
+target_link_libraries(cache_entry PUBLIC gpu_cache)
+
+add_library(gpu_cache gpu_cache.cpp)
+add_third_party_includes(gpu_cache)
+target_link_libraries(gpu_cache PUBLIC xxHash::xxhash ${TORCH_LIBRARIES} cuda_stream_manager)
+
+add_library(kvc2 prefix.cpp)
+target_include_directories(kvc2 PRIVATE ${THIRD_PARTY_DIR}/nlohmann/single_include)
+add_third_party_includes(kvc2)
+target_link_libraries(kvc2 PUBLIC TBB::tbb xxHash::xxhash cache_entry cuda_stream_manager page_aligned_memory_pool ${TORCH_LIBRARIES} prometheus-cpp::pull kvc2_metrics)
+
+message(STATUS "CMAKE_SOURCE_DIR: " ${CMAKE_SOURCE_DIR})
+add_library(async_store async_store.cpp)
+target_include_directories(async_store PRIVATE ${THIRD_PARTY_DIR}/nlohmann/single_include)
+target_include_directories(async_store PRIVATE ${THIRD_PARTY_DIR}/spdlog/include)
+target_link_libraries(async_store PUBLIC pthread)
+
+
+
+add_library(cuda_stream_manager cuda_stream_manager.cpp)
+target_include_directories(cuda_stream_manager PUBLIC ${THIRD_PARTY_DIR}/nlohmann/single_include)
+target_include_directories(cuda_stream_manager PUBLIC ${THIRD_PARTY_DIR}/spdlog/include)
+target_include_directories(cuda_stream_manager  PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
+target_link_libraries(cuda_stream_manager PUBLIC CUDA::cudart)
--- a/csrc/balance_serve/kvc2/src/async_store.cpp
+++ b/csrc/balance_serve/kvc2/src/async_store.cpp
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <filesystem>
+#include <future>
+#include <iostream>
+#include <nlohmann/json.hpp>
+#include <optional>
+#include <queue>
+#include <thread>
+#include <unordered_map>
+
+#include "utils/lock_free_queue.hpp"
+
+#include "async_store.hh"
+
+namespace async_store {
+
+/// Bookkeeping for an array of fixed-size elements stored under `data_path`.
+///
+/// The actual file I/O (`read`, `write`, growing the file in `extend`) is still
+/// a stub here; only the size accounting is functional.
+struct ArrayStore {
+  /// Granularity each element is padded to (presumably the device block size in bytes).
+  static const size_t DeviceBlockSize = 512;
+
+  /// Payload size of one element as requested by the caller.
+  const size_t element_size;
+  /// `element_size` rounded up to the next multiple of `DeviceBlockSize`.
+  const size_t element_size_aligned;
+
+  /// Current capacity, counted in elements.
+  size_t size;
+
+  /// Bytes occupied by `size` block-aligned elements.
+  size_t size_in_bytes() { return size * element_size_aligned; }
+
+  std::filesystem::path data_path;
+
+  /// Raises the capacity to `to` elements; a request that does not grow the
+  /// store is ignored.
+  void extend(size_t to) {
+    if (to <= size) {
+      return;
+    }
+    // TODO: extend file
+    size = to;
+    // LOG_INFO("Extend file to `, size `", to, size_in_bytes());
+  }
+
+  /// @param element_size bytes per element before alignment
+  /// @param size         initial capacity in elements
+  /// @param data_path    location of the backing data
+  ArrayStore(size_t element_size, size_t size, std::filesystem::path data_path)
+      : element_size(element_size),
+        // Round up to whole blocks and convert back to bytes; dividing alone
+        // would yield a block count, which size_in_bytes() must not multiply by.
+        element_size_aligned((element_size + DeviceBlockSize - 1) / DeviceBlockSize * DeviceBlockSize),
+        // The member used to be left uninitialised, so capacity()/extend() read garbage.
+        size(size),
+        data_path(data_path) {
+    // TODO: prefix cache
+  }
+
+  void read(size_t index, void* buffer) {
+    // TODO: read from file
+  }
+  void write(size_t index, void* buffer) {
+    // TODO: write to file
+  }
+};
+
+/// Heap-allocates an ArrayStore for the given element size, capacity and path.
+/// The returned pointer is released with close_store().
+ArrayStore* create_or_open_store(size_t element_size, size_t size, std::filesystem::path data_path) {
+  auto* store = new ArrayStore(element_size, size, data_path);
+  return store;
+}
+
+/// Destroys a store obtained from create_or_open_store().
+void close_store(ArrayStore* store) { delete store; }
+
+/// Returns the store's current capacity in elements.
+size_t capacity(ArrayStore* store) {
+  const size_t element_capacity = store->size;
+  return element_capacity;
+}
+
+/// Free-function wrapper that forwards to ArrayStore::extend.
+void extend(ArrayStore* store, size_t to) {
+  ArrayStore& target = *store;
+  target.extend(to);
+}
+
+/// Typed wrapper around ArrayStore whose element size is `sizeof(T)`.
+template <typename T>
+struct ArrayStoreT {
+  ArrayStore store;
+
+  ArrayStoreT(size_t element_count, std::filesystem::path data_path)
+      : store(sizeof(T), element_count, data_path) {}
+
+  /// Forwards to ArrayStore::read with the caller's output buffer.
+  void read(size_t index, void* output) {
+    store.read(index, output);
+  }
+
+  /// Forwards the address of `value` to ArrayStore::write.
+  void write(size_t index, T& value) {
+    void* raw = &value;
+    store.write(index, raw);
+  }
+
+  /// Forwards an untyped buffer to ArrayStore::write.
+  void write(size_t index, void* value) {
+    store.write(index, value);
+  }
+};
+
+/// Formats an I/O request for logging: direction, data pointer, the store's
+/// data path and the element index.
+/// `req` and `req->store` are dereferenced without a null check.
+std::string request_to_string(IORequest* req) {
+  // "Read " is padded to the same width as "Write". The type name in the
+  // message was previously misspelled "IOReqeust".
+  return fmt::format("IORequest {} {} to {}[{}]", req->write ? "Write" : "Read ", req->data,
+                     req->store->data_path.c_str(), req->index);
+}
+
+/// Implementation state behind IODealer: the request queue, counters and
+/// configuration. The worker routines are placeholders in this version.
+struct IODealerImpl {
+  MPSCQueue<IORequest> ioQueue;
+  uint64_t io_cnt = 0;
+  size_t io_amount = 0;
+  bool use_io_uring;
+  int IO_DEPTH;
+
+  // Set by IODealer::stop().
+  // NOTE(review): if io_dealer() ends up polling this from the I/O thread it
+  // likely needs to be std::atomic<bool> — confirm once the loop is implemented.
+  bool stop = false;
+
+  IODealerImpl(bool uring_enabled, int queue_depth) : use_io_uring{uring_enabled}, IO_DEPTH{queue_depth} {}
+
+  void queue_consumer() {
+    // TODO:
+  }
+
+  void io_perf() {
+    // TODO:
+  }
+
+  /// Entry point run by the thread created in IODealer::start_io_thread().
+  void io_dealer() {
+    // TODO:
+  }
+};
+
+/// Allocates the implementation object with the given backend selection and queue depth.
+IODealer::IODealer(bool use_io_uring, int IO_DEPTH) : io_impl(new IODealerImpl(use_io_uring, IO_DEPTH)) {}
+
+/// Sets the stop flag, then frees the implementation object.
+IODealer::~IODealer() {
+  this->stop();
+  delete this->io_impl;
+}
+
+/// Hands `req` to the implementation's request queue.
+void IODealer::enqueue(std::shared_ptr<IORequest> req) {
+  auto& queue = io_impl->ioQueue;
+  queue.enqueue(req);
+}
+
+/// Starts a thread running IODealerImpl::io_dealer() and returns it; the caller
+/// owns the returned std::thread. The lambda captures `this`.
+std::thread IODealer::start_io_thread() {
+  std::thread worker([this]() { io_impl->io_dealer(); });
+  return worker;
+}
+/// Raises the implementation's stop flag; calling it again is a no-op.
+void IODealer::stop() {
+  if (!io_impl->stop) {
+    // LOG_INFO("Stopping IO Dealer");
+    io_impl->stop = true;
+  }
+}
+
+}  // namespace async_store
--- a/csrc/balance_serve/kvc2/src/async_store.hh
+++ b/csrc/balance_serve/kvc2/src/async_store.hh
+#pragma once
+#include <cstddef>
+#include <filesystem>
+
+#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
+#define FMT_HEADER_ONLY
+#include "spdlog/spdlog.h"
+
+#include "io_helper.hpp"
+
+namespace async_store {
+
+struct ArrayStore;
+
+ArrayStore* create_or_open_store(size_t element_size, size_t size, std::filesystem::path data_path);
+void close_store(ArrayStore* store);
+size_t capacity(ArrayStore* store);
+void extend(ArrayStore* store, size_t to);
+
+
+
+/// One read or write of a single element of an ArrayStore.
+///
+/// Every member has a default so that a default-constructed request never
+/// carries indeterminate pointers or indices.
+struct IORequest {
+  ArrayStore* store = nullptr;  ///< Target store.
+  bool write = false;           ///< true: write `data` to the store; false: read into `data`.
+  void* data = nullptr;         ///< Caller-provided buffer.
+  size_t index = 0;             ///< Element index inside the store.
+
+  // for sync
+  bool need_promise = false;
+  BatchPromise* promise = nullptr;  ///< Presumably only consulted when `need_promise` is set — TODO confirm.
+};
+
+std::string request_to_string(IORequest* req);
+
+struct IODealerImpl;
+/// Front end for queuing IORequests. All state lives in an IODealerImpl that
+/// the constructor allocates with `new` and the destructor deletes (see
+/// async_store.cpp).
+struct IODealer {
+  // Raw owning pointer to the out-of-line implementation object.
+  IODealerImpl* io_impl;
+
+  IODealer(bool use_io_uring = false, int IO_DEPTH = 128);
+  ~IODealer();
+  IODealer(const IODealer&) = delete;
+  IODealer& operator=(const IODealer&) = delete;
+  // NOTE(review): the defaulted moves copy `io_impl` without clearing the
+  // source, so after a move both objects' destructors delete the same
+  // IODealerImpl, and move-assignment drops the previous one without freeing
+  // it. Confirm nothing moves an IODealer, or implement a real ownership
+  // transfer together with a null-safe destructor/stop().
+  IODealer(IODealer&&) = default;
+  IODealer& operator=(IODealer&&) = default;
+
+  // Adds a request to the implementation's queue.
+  void enqueue(std::shared_ptr<IORequest> req);
+  // Spawns the thread that runs the implementation's io_dealer() routine.
+  std::thread start_io_thread();
+  // Sets the implementation's stop flag (also called by the destructor).
+  void stop();
+};
+
+}  // namespace async_store
--- a/csrc/balance_serve/kvc2/src/bind.cpp
+++ b/csrc/balance_serve/kvc2/src/bind.cpp
+// #include <pybind11/functional.h>
+// #include <pybind11/pybind11.h>
+// #include <pybind11/stl.h>
+// #include <memory>
+// #include <thread>
+// #include <vector>
+// #include "kvc2.h"
+// #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
+// #define FMT_HEADER_ONLY
+// #include "spdlog/spdlog.h"
+// #include "utils/arithmetic.hpp"
+
+// namespace py = pybind11;
+
+// PYBIND11_MODULE(kvc2_ext, m) {
+//   // Bind KVC2Config struct
+//   py::class_<kvc2::KVC2Config>(m, "KVC2Config")
+//       .def(py::init<>())
+//       .def_readwrite("path", &kvc2::KVC2Config::path)
+//       .def_readwrite("block_length", &kvc2::KVC2Config::num_token_per_page)
+//       .def_readwrite("memory_pool_size", &kvc2::KVC2Config::memory_pool_size)
+//       .def_readwrite("evict_count", &kvc2::KVC2Config::evict_count);
+
+//   // Bind CacheInfo struct
+//   py::class_<kvc2::CacheInfo>(m, "CacheInfo")
+//       .def(py::init<>())
+//       .def_readwrite("model_name", &kvc2::CacheInfo::model_name)
+//       .def_readwrite("is_key_cache", &kvc2::CacheInfo::is_key_cache)
+//       .def_readwrite("quant_type", &kvc2::CacheInfo::quant_type)
+//       .def("hidden_layer_count", &kvc2::CacheInfo::hidden_layer_count)
+//       .def("path", &kvc2::CacheInfo::path, py::arg("which_layer") = std::nullopt)
+//       .def("__eq__", &kvc2::CacheInfo::operator==)
+//       .def("element_size", &kvc2::CacheInfo::element_size)
+//       .def("hash_value", &kvc2::CacheInfo::hash_value);
+
+//   // Bind KVC2HandleInterface class
+//   py::class_<kvc2::KVC2HandleInterface, std::shared_ptr<kvc2::KVC2HandleInterface>>(m, "KVC2HandleInterface")
+//       .def("matched_length", &kvc2::SingleCacheHandleInterface::matched_length)
+//       .def("handle_data", &kvc2::KVC2HandleInterface::handle_data);
+
+//   // Bind KVC2Interface class
+//   py::class_<kvc2::KVC2Interface, std::shared_ptr<kvc2::KVC2Interface>>(m, "KVC2Interface")
+//       .def("start_io_thread", [](kvc2::KVC2Interface& self) { self.start_io_thread(); })
+//       .def("stop_io_thread", &kvc2::KVC2Interface::stop_io_thread)
+//       .def("load", &kvc2::KVC2Interface::load)
+//       .def("save", &kvc2::KVC2Interface::save)
+//       .def("raw_insert", &kvc2::KVC2Interface::raw_insert)
+//       .def("raw_read", &kvc2::KVC2Interface::raw_read)
+//       .def("lookup", &kvc2::KVC2Interface::lookup);
+
+//   // Bind create_kvc2 function
+//   m.def("create_kvc2", &kvc2::create_kvc2, py::arg("config"));
+// }
\ No newline at end of file
--- a/csrc/balance_serve/kvc2/src/cache_entry.cpp
+++ b/csrc/balance_serve/kvc2/src/cache_entry.cpp
+#include "cache_entry.hh"
+#include <mutex>
+
+#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
+#define FMT_HEADER_ONLY
+#include "spdlog/spdlog.h"
+
+#include "gpu_cache.hh"
+
+namespace kvc2 {
+
+/// Returns true only when the reference count is zero and the dirty flag is
+/// clear; in that case `tc` is reset before returning.
+bool ConcurrentControlUnit::can_desert() {
+  const bool in_use = ref_count.load() != 0;
+  if (in_use || dirty.load()) {
+    return false;
+  }
+  tc.reset();
+  return true;
+}
+/// Logs the current reference count, dirty flag and `tc` state at debug level.
+void ConcurrentControlUnit::debug() {
+  SPDLOG_DEBUG("ref count {}, dirty {}, {}", ref_count.load(), dirty.load(), tc.debug());
+}
+
+/// Returns a still-held CPU buffer to the pool, with a warning, when the entry
+/// is destroyed while `data` is set and both manager and pool are available.
+CacheBlockEntry::~CacheBlockEntry() {
+  const bool still_holds_buffer = data != nullptr && manager && manager->pool;
+  if (!still_holds_buffer) {
+    return;
+  }
+  SPDLOG_WARN("Free {} when destruct", data);
+  free_on_cpu();
+}
+
+/// Allocates `size` bytes for this entry from the manager's pool.
+/// If the first attempt fails, asks the manager to evict and retries once.
+/// @return true when `data` points at a buffer, false if both attempts failed.
+bool CacheBlockEntry::alloc_on_cpu() {
+  assert(data == nullptr);
+
+  data = manager->pool->alloc(size);
+  if (data != nullptr) {
+    return true;
+  }
+
+  // Pool exhausted: evict, then try exactly one more time.
+  manager->evict_for_cpu_cache();
+  data = manager->pool->alloc(size);
+  if (data != nullptr) {
+    return true;
+  }
+
+  SPDLOG_ERROR("Not enough memory for Block Cache");
+  return false;
+}
+
+/// Gives the entry's buffer back to the manager's pool and clears `data`.
+void CacheBlockEntry::free_on_cpu() {
+  auto& pool = manager->pool;
+  pool->free(data, size);
+  data = nullptr;
+}
+
+// Ensures the entry has a CPU buffer without taking the entry lock itself.
+// Returns true if a buffer already exists or was just allocated, false if allocation failed.
+bool CacheBlockEntry::alloc_on_cpu_no_lock() {
+  // Short-circuit: alloc_on_cpu() is only called when no buffer exists yet.
+  return data != nullptr || alloc_on_cpu();
+}
+
+// Under the entry lock, makes sure a CPU buffer exists and then bumps the CPU reference count.
+// Returns false (leaving the count untouched) only when a buffer was needed and could not be
+// allocated; returns true otherwise.
+bool CacheBlockEntry::inc_ref_or_alloc_on_cpu() {
+  std::lock_guard<MutexT> guard(lock);
+  const bool needs_buffer = (data == nullptr);
+  if (needs_buffer && !alloc_on_cpu()) {
+    return false;
+  }
+  cpu_cc.ref_count.fetch_add(1);
+  return true;
+}
+
+// Attempts to take the entry lock without blocking.
+// The caller must check owns_lock() on the returned object to learn whether it succeeded.
+std::unique_lock<CacheBlockEntry::MutexT> CacheBlockEntry::try_lock() {
+  std::unique_lock<MutexT> attempt(lock, std::try_to_lock);
+  return attempt;
+}
+
+// Locks the entry and returns the guard; the lock is held for the guard's lifetime.
+// std::lock_guard cannot be moved, so the guard is returned as a prvalue and relies on
+// C++17 guaranteed copy elision.
+std::lock_guard<CacheBlockEntry::MutexT> CacheBlockEntry::lock_guard() {
+  return std::lock_guard<MutexT>{lock};
+}
+
+// Logs a one-line description of this entry at debug level: position (layer, idx), key/hash,
+// buffer pointer and size, and the CPU and GPU reference counts and `tc` debug text.
+void CacheBlockEntry::debug() {
+  // The "in page cache" field reports whether the entry has a manager attached.
+  const bool has_manager = (manager != nullptr);
+  SPDLOG_DEBUG(
+      "CacheBlockEntry: disk[{:4},{:7}], with key {}, hash {:016x}, "
+      "data: {}, ref_count: {}, size: {}, cpu tc: {}, "
+      "in page cache: {}, gpu ref count:{}, gpu tc: {}",
+      layer, idx, with_key, hash,
+      data, cpu_cc.ref_count.load(), size, cpu_cc.tc.debug(),
+      has_manager, gpu_cc.ref_count.load(), gpu_cc.tc.debug());
+}
+
+// Stores the callback that the destructor later invokes for every collected entry.
+CacheBlockEntryCollector::CacheBlockEntryCollector(std::function<void(CacheBlockEntry*)> exit_fn)
+    : exit_fn(exit_fn) {
+}
+
+// Invokes the stored exit callback once for each collected entry, in container order.
+CacheBlockEntryCollector::~CacheBlockEntryCollector() {
+  for (auto cursor = entries.begin(), last = entries.end(); cursor != last; ++cursor) {
+    exit_fn(*cursor);
+  }
+}
+
+// Builds an asynchronous I/O request for this block and hands it to `dealer`.
+//
+// dealer    - queue that receives the request.
+// io_helper - batch bookkeeping: a task is registered via new_task() and the request's
+//             promise points at io_helper.batch_promise, so io_helper must outlive the request.
+// store     - array store the request targets.
+// layer     - recorded on the entry as its layer.
+// index     - recorded on the entry as its idx and used as the request index.
+// option    - IO_Read / IO_ForceRead issue a read, IO_Write / IO_ForceWrite issue a write.
+//
+// For IO_Read the request is skipped (and the entry's layer/idx are left untouched) when
+// io_helper.absorb_tc(this, cpu_cc.tc) returns false.
+// An unrecognized option trips the assert in debug builds; in builds with assertions disabled
+// it is logged and ignored instead of enqueueing a request with an indeterminate direction.
+void CacheBlockEntry::io_with(async_store::IODealer* dealer, IO_Helper<CacheBlockEntry>& io_helper,
+                              async_store::ArrayStore* store, size_t layer, size_t index, IOOption option) {
+  // Always initialized so that no path can read an indeterminate value.
+  bool write = false;
+
+  switch (option) {
+    case IO_Read: {
+      if (!io_helper.absorb_tc(this, cpu_cc.tc)) {
+        // absorb_tc reported that no read is needed.
+        return;
+      }
+      break;
+    }
+    case IO_ForceRead: {
+      break;
+    }
+    case IO_ForceWrite:
+    case IO_Write: {
+      write = true;
+      break;
+    }
+    default: {
+      assert(0);
+      SPDLOG_ERROR("Unknown IOOption {}, request ignored", static_cast<int>(option));
+      return;
+    }
+  }
+
+  io_helper.new_task();
+  this->layer = layer;
+  this->idx = index;
+
+  auto req = std::make_shared<async_store::IORequest>();
+  req->store = store;
+  req->data = data;
+  req->index = index;
+  req->write = write;
+  req->need_promise = true;
+  req->promise = &io_helper.batch_promise;
+
+  SPDLOG_TRACE("Submitting {}", async_store::request_to_string(req.get()));
+  dealer->enqueue(std::move(req));
+}
+
+// Creates a manager that keeps its own copy of the given configuration.
+CacheEntryManager::CacheEntryManager(CacheEntryManagerConfig config)
+    : config(config) {
+}
+
+// Frees CPU buffers by running evict() with a filter that accepts a block only when it has a
+// buffer and both its CPU and GPU control units report can_desert().
+// The stop condition always returns false, so it never ends the scan early.
+void CacheEntryManager::evict_for_cpu_cache() {
+  size_t count = 0;
+
+  auto release_if_idle = [&count](const BlockPtr& block) {
+    // Assumption carried over from the original code: a block that is on the GPU is also on
+    // the CPU. The three checks short-circuit left to right.
+    const bool evictable =
+        block->data != nullptr && block->cpu_cc.can_desert() && block->gpu_cc.can_desert();
+    if (!evictable) {
+      return false;
+    }
+    block->free_on_cpu();
+    count += 1;
+    return true;
+  };
+
+  // A count-based limit (count == config.evict_count) is currently disabled.
+  auto never_stop = []() { return false; };
+
+  evict(release_if_idle, never_stop);
+}
+
+// Puts a keyed block at the front of usage_list and records its list position under its hash.
+// Preconditions (assert only): the block has a key and that key is not yet in key_entry_map.
+void CacheEntryManager::insert(BlockPtr entry) {
+  assert(entry->with_key);
+  const auto& block_key = entry->hash;
+  assert(key_entry_map.count(block_key) == 0);
+
+  usage_list.push_front(entry);
+  key_entry_map[block_key] = usage_list.begin();
+}
+
+// Looks up the block stored under `key`, moves it to the front of usage_list and returns it.
+// The lookup uses at(), so an unknown key is reported by whatever at() does for the map type
+// (presumably std::out_of_range for a standard map — confirm against the header).
+CacheEntryManager::BlockPtr CacheEntryManager::access(const Key& key) {
+  auto& slot = key_entry_map.at(key);
+  BlockPtr block = *slot;
+
+  // Re-link the block at the front and refresh the stored list position.
+  usage_list.erase(slot);
+  usage_list.push_front(block);
+  slot = usage_list.begin();
+
+  return block;
+}
+
+// void CacheEntryManager::remove(const Key& key) {
+//   auto it = key_entry_map[key];
+//   usage_list.erase(it);
+//   key_entry_map.erase(key);
+// }
+
+// Scans usage_list from back to front under the manager lock and removes every block that
+// `filter` accepts.
+//
+// filter         - called with a block whose entry lock was acquired via try_lock(); returning
+//                  true removes the block from usage_list (and from key_entry_map if keyed).
+// stop_condition - checked before each block is examined; returning true ends the scan.
+//
+// Blocks whose entry lock cannot be taken immediately are skipped.
+void CacheEntryManager::evict(std::function<bool(const BlockPtr&)> filter, std::function<bool()> stop_condition) {
+  auto evict_count = 0;
+  auto inspect_count = 0;
+
+  std::lock_guard<std::mutex> lg(lock);
+  for (auto it = usage_list.rbegin(); it != usage_list.rend();) {
+    inspect_count += 1;
+    if (stop_condition())
+      break;
+
+    // Hold our own reference, declared BEFORE the entry lock. Locals are destroyed in reverse
+    // order, so the lock is released first and only then may the block be destroyed. Without
+    // this, erasing the list node could drop the last reference and destroy the block (and its
+    // mutex) while the lock object still refers to it.
+    BlockPtr entry = *it;
+    auto entry_ul = entry->try_lock();
+    if (entry_ul.owns_lock() == false) {
+      ++it;  // Ensure iterator advances when locking fails
+      continue;
+    }
+
+    if (filter(entry)) {
+      evict_count++;
+      if (entry->with_key)
+        key_entry_map.erase(entry->hash);
+      // std::next(it).base() is the forward iterator for *it; erase returns the following
+      // element, which converts back to the next reverse position.
+      it = decltype(it)(usage_list.erase(std::next(it).base()));
+    } else {
+      ++it;  // Ensure iterator advances when filter fails
+    }
+  }
+
+  if (evict_count > 0) {
+    SPDLOG_DEBUG("Map Size {}, List Size {}, Evicted {} blocks, Inspected {}, {}", key_entry_map.size(),
+                 usage_list.size(), evict_count, inspect_count, pool->debug());
+  }
+}
+
+// Returns the block for `key`, creating one when necessary, while holding the manager lock.
+//
+// is_new - set to false when an existing keyed block is returned, true when a block is created.
+// size   - stored on a newly created block; not applied to an existing one.
+// key    - when present, the block is looked up / registered under it; when absent, a fresh
+//          block without a key is created and is not registered with the manager's containers.
+CacheEntryManager::BlockPtr CacheEntryManager::get(bool& is_new, size_t size, std::optional<Key> key) {
+  std::unique_lock<std::mutex> manager_guard(lock);
+  const bool keyed = key.has_value();
+
+  // Hit: an entry with this key already exists; refresh its position and return it.
+  if (keyed && key_entry_map.count(*key)) {
+    is_new = false;
+    return access(*key);
+  }
+
+  // Miss (or no key): build a new block owned by this manager.
+  auto entry = std::make_shared<CacheBlockEntry>();
+  entry->with_key = keyed;
+  if (keyed) {
+    entry->hash = *key;
+  }
+  entry->size = size;
+  entry->manager = this;
+  if (keyed) {
+    insert(entry);
+  }
+  is_new = true;
+  return entry;
+}
+
+// Prints a summary of the manager to stdout: the number of keyed entries, the pool's debug
+// text, and then every block in usage_list whose layer is 0 (each via its own debug()).
+// NOTE(review): no lock is taken here while usage_list is iterated — confirm callers only use
+// this when no other thread is modifying the manager.
+void CacheEntryManager::debug() {
+  fmt::print("Cache Manager: {} entries\n", key_entry_map.size());
+  // pool->debug() yields a formattable value; print it rather than discarding the result.
+  fmt::print("{}\n", pool->debug());
+  fmt::print("Layer 0 Entries in Order\n");
+  for (auto& it : usage_list) {
+    if (it->layer == 0)
+      it->debug();
+  }
+}
+
+};  // namespace kvc2