Unverified commit 877aec85, authored by Yuhao Tsui, committed by GitHub

Merge branch 'kvcache-ai:main' into main

parents 84164f58 9037bf30
......@@ -163,6 +163,8 @@ jobs:
- name: build for cuda
if: matrix.cuda != ''
env:
USE_BALANCE_SERVE: "1"
run: |
git submodule init
git submodule update
......
......@@ -4,3 +4,16 @@
[submodule "third_party/pybind11"]
path = third_party/pybind11
url = https://github.com/pybind/pybind11.git
[submodule "third_party/spdlog"]
path = third_party/spdlog
url = https://github.com/gabime/spdlog.git
[submodule "third_party/custom_flashinfer"]
path = third_party/custom_flashinfer
url = https://github.com/kvcache-ai/custom_flashinfer.git
branch = fix-precision-mla-merge-main
[submodule "third_party/xxHash"]
path = third_party/xxHash
url = https://github.com/Cyan4973/xxHash.git
[submodule "third_party/prometheus-cpp"]
path = third_party/prometheus-cpp
url = https://github.com/jupp0r/prometheus-cpp
FROM node:20.16.0 as web_compile
WORKDIR /home
RUN <<EOF
git clone https://github.com/kvcache-ai/ktransformers.git &&
cd ktransformers/ktransformers/website/ &&
npm install @vue/cli &&
npm run build &&
rm -rf node_modules
EOF
FROM pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel as compile_server
ARG CPU_INSTRUCT=NATIVE
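# Override at build time, e.g.: docker build --build-arg CPU_INSTRUCT=NATIVE .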
# Set the working directory and CUDA path
WORKDIR /workspace
ENV CUDA_HOME=/usr/local/cuda
# Install dependencies
RUN apt update -y
RUN apt install -y --no-install-recommends \
libtbb-dev \
libssl-dev \
libcurl4-openssl-dev \
libaio1 \
libaio-dev \
libfmt-dev \
libgflags-dev \
zlib1g-dev \
patchelf \
git \
wget \
vim \
gcc \
g++ \
cmake
# Clone the source code
RUN git clone https://github.com/kvcache-ai/ktransformers.git
# Clean the apt cache
RUN rm -rf /var/lib/apt/lists/*
# Enter the project directory
WORKDIR /workspace/ktransformers
# Initialize submodules
RUN git submodule update --init --recursive
# Upgrade pip
RUN pip install --upgrade pip
# Install build dependencies
RUN pip install ninja pyproject numpy cpufeature aiohttp zmq openai
# Install flash-attn (installing it early avoids some compile-time dependency errors later)
RUN pip install flash-attn
# Install ktransformers itself (includes compilation)
RUN CPU_INSTRUCT=${CPU_INSTRUCT} \
    USE_BALANCE_SERVE=1 \
    KTRANSFORMERS_FORCE_BUILD=TRUE \
    TORCH_CUDA_ARCH_LIST="8.0;8.6;8.7;8.9;9.0+PTX" \
    pip install . --no-build-isolation --verbose
RUN pip install third_party/custom_flashinfer/
# Clean the pip cache
RUN pip cache purge
# Copy the C++ runtime library into the conda environment
RUN cp /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/lib/
# Keep the container running (for debugging)
ENTRYPOINT ["tail", "-f", "/dev/null"]
graft third_party
graft ktransformers
graft local_chat.py
graft csrc
include LICENSE README.md
prune ktransformers/website
prune ktransformers/logs
......@@ -9,3 +10,4 @@ prune third_party/llama.cpp/models
graft ktransformers/website/dist
global-exclude __pycache__
include KTransformersOps.*.so
include cpuinfer_ext.*.so
......@@ -29,4 +29,4 @@ clean:
install_numa:
USE_NUMA=1 make dev_install
install_no_numa:
	env -u USE_NUMA make dev_install
......@@ -23,17 +23,23 @@ Our vision for KTransformers is to serve as a flexible platform for experimentin
<h2 id="Updates">🔥 Updates</h2>
* **Apr 2, 2025**: Support Multi-concurrency. ([Tutorial](./doc/en/balance-serve.md)).
https://github.com/user-attachments/assets/faa3bda2-928b-45a7-b44f-21e12ec84b8a
* **Mar 15, 2025**: Support ROCm on AMD GPU ([Tutorial](./doc/en/ROCm.md)).
* **Mar 5, 2025**: Support unsloth 1.58/2.51 bits weights and [IQ1_S/FP8 hybrid](./doc/en/fp8_kernel.md) weights. Support 139K [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022--v023-longer-context--fp8-kernel) for DeepSeek-V3 and R1 in 24GB VRAM.
* **Feb 25, 2025**: Support [FP8 GPU kernel](./doc/en/fp8_kernel.md) for DeepSeek-V3 and R1; [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context).
* **Feb 15, 2025**: Longer Context (from 4K to 8K for 24GB VRAM) & Slightly Faster Speed (+15%, up to 16 Tokens/s), update [docs](./doc/en/DeepseekR1_V3_tutorial.md) and [online books](https://kvcache-ai.github.io/ktransformers/).
* **Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi GPU and 382GB DRAM, up to 3~28x speedup. For a detailed showcase and reproduction tutorial, see [here](./doc/en/DeepseekR1_V3_tutorial.md).
* **Aug 28, 2024**: Decrease DeepseekV2's required VRAM from 21G to 11G.
* **Aug 15, 2024**: Update detailed [tutorial](doc/en/injection_tutorial.md) for injection and multi-GPU.
* **Aug 14, 2024**: Support llamafile as linear backend.
* **Aug 12, 2024**: Support multiple GPUs; support new models: Mixtral 8\*7B and 8\*22B; support q2k, q3k, q5k dequantization on GPU.
* **Aug 9, 2024**: Support Windows native.
<!-- * **Aug 28, 2024**: Support 1M context under the InternLM2.5-7B-Chat-1M model, utilizing 24GB of VRAM and 150GB of DRAM. The detailed tutorial is [here](./doc/en/long_context_tutorial.md). -->
<h2 id="show-cases">🌟 Show Cases</h2>
<div>
......@@ -45,16 +51,16 @@ https://github.com/user-attachments/assets/ebd70bfa-b2c1-4abb-ae3b-296ed38aa285
</p>
- **[NEW!!!] Local 671B DeepSeek-Coder-V3/R1:** Running its Q4_K_M version using only 14GB VRAM and 382GB DRAM ([Tutorial](./doc/en/DeepseekR1_V3_tutorial.md)).
- Prefill Speed (tokens/s):
- KTransformers: 54.21 (32 cores) → 74.362 (dual-socket, 2×32 cores) → 255.26 (optimized AMX-based MoE kernel, V0.3 only) → 286.55 (selectively using 6 experts, V0.3 only)
- Compared to 10.31 tokens/s in llama.cpp with 2×32 cores, achieving up to **27.79× speedup**.
- Decode Speed (tokens/s):
- KTransformers: 8.73 (32 cores) → 11.26 (dual-socket, 2×32 cores) → 13.69 (selectively using 6 experts, V0.3 only)
- Compared to 4.51 tokens/s in llama.cpp with 2×32 cores, achieving up to **3.03× speedup**.
- Upcoming Open Source Release:
- AMX optimizations and selective expert activation will be open-sourced in V0.3.
- Currently available only in preview binary distribution, which can be downloaded [here](./doc/en/DeepseekR1_V3_tutorial.md).
- **Local 236B DeepSeek-Coder-V2:** Running its Q4_K_M version using only 21GB VRAM and 136GB DRAM, attainable on a local desktop machine, which scores even better than GPT4-0613 in [BigCodeBench](https://huggingface.co/blog/leaderboard-bigcodebench).
<p align="center">
......@@ -96,19 +102,16 @@ https://github.com/user-attachments/assets/a865e5e4-bca3-401e-94b8-af3c080e6c12
* **Flexible Sparse Attention Framework**: Offers a flexible block sparse attention framework for CPU offloaded decoding. Compatible with SnapKV, Quest, and InfLLm. Further information is available [here](./doc/en/long_context_introduction.md).
-->
<strong>More advanced features are coming soon, so stay tuned!</strong>
<h2 id="quick-start">🚀 Quick Start</h2>
Getting started with KTransformers is simple! Follow the steps below to set up and start using it.
### 📥 Installation
To install KTransformers, follow the official [Installation Guide](https://kvcache-ai.github.io/ktransformers/en/install.html).
<h2 id="tutorial">📃 Brief Injection Tutorial</h2>
At the heart of KTransformers is a user-friendly, template-based injection framework.
This allows researchers to easily replace original torch modules with optimized variants. It also simplifies the process of combining multiple optimizations, allowing the exploration of their synergistic effects.
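As a rough illustration of the pattern (not KTransformers' actual API — the names `inject` and `OptimizedLinear` here are hypothetical), template-based injection boils down to walking the module tree and swapping matched children:

```python
import torch.nn as nn

def inject(module: nn.Module, match, make_replacement):
    """Recursively replace submodules: any child for which `match`
    returns True is swapped for the module built by `make_replacement`."""
    for name, child in module.named_children():
        if match(name, child):
            setattr(module, name, make_replacement(child))
        else:
            inject(child, match, make_replacement)

# e.g. replace every nn.Linear with a (hypothetical) optimized variant:
# inject(model, lambda name, m: isinstance(m, nn.Linear), OptimizedLinear.from_linear)
```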
......@@ -167,7 +170,6 @@ The development of KTransformers is based on the flexible and versatile framewor
KTransformers is actively maintained and developed by contributors from the <a href="https://madsys.cs.tsinghua.edu.cn/">MADSys group</a> at Tsinghua University and members from <a href="http://approaching.ai/">Approaching.AI</a>. We welcome new contributors to join us in making KTransformers faster and easier to use.
<h2 id="ack">Discussion</h2>
If you have any questions, feel free to open an issue. Alternatively, you can join our WeChat group for further discussion. QR Code: [WeChat Group](WeChatGroup.png)
......
cmake_minimum_required(VERSION 3.21)
find_program(GCC_COMPILER NAMES g++-13 g++-12 g++-11 REQUIRED)
set(CMAKE_CXX_COMPILER ${GCC_COMPILER})
# Show the selected compiler
message(STATUS "Using compiler: ${CMAKE_CXX_COMPILER}")
project(balance_serve VERSION 0.1.0)
set(CMAKE_CXX_STANDARD 20)
# set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -g -fPIC")
# set(CMAKE_BUILD_TYPE "Debug")
set(CMAKE_CXX_FLAGS "-O3 -march=native -Wall -Wextra -fPIC")
add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI})
set(CMAKE_BUILD_TYPE "Release")
file(GLOB_RECURSE FMT_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.hpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.h")
add_custom_target(
format
COMMAND clang-format
-i
-style=file
${FMT_SOURCES}
COMMENT "Running clang-format on all source files"
)
set(BUILD_SHARED_LIBS ON)
set(ENABLE_PUSH OFF)
set(ENABLE_COMPRESSION OFF)
# set(CMAKE_BUILD_TYPE "Release")
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(THIRD_PARTY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party)
set(THIRD_PARTY_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/third_party)
add_subdirectory(${THIRD_PARTY_DIR}/prometheus-cpp ${THIRD_PARTY_BUILD_DIR}/prometheus-cpp EXCLUDE_FROM_ALL)
add_subdirectory(${THIRD_PARTY_DIR}/xxHash/cmake_unofficial ${THIRD_PARTY_BUILD_DIR}/xxHash EXCLUDE_FROM_ALL)
# add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third_party/prometheus-cpp ${CMAKE_CURRENT_BINARY_DIR}/third_party/prometheus-cpp)
set(SPDLOG_DIR ${THIRD_PARTY_DIR}/spdlog)
set(FMT_DIR ${THIRD_PARTY_DIR}/fmt)
set(KVC2_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/kvc2/src)
include_directories(${THIRD_PARTY_DIR})
add_subdirectory(${THIRD_PARTY_DIR}/pybind11 ${THIRD_PARTY_BUILD_DIR}/pybind11)
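# Ask the active Python interpreter where torch is installed so that
# find_package(Torch) can locate the CMake config shipped with the package.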
execute_process(
COMMAND python3 -c "import torch; print(torch.__path__[0])"
OUTPUT_VARIABLE TORCH_INSTALL_PREFIX
OUTPUT_STRIP_TRAILING_WHITESPACE
)
message(STATUS "Found PyTorch at: ${TORCH_INSTALL_PREFIX}")
# set(TORCH_INSTALL_PREFIX "/home/xwy/.conda/envs/kvc/lib/python3.12/site-packages/torch")
find_library(TORCH_PYTHON_LIBRARY torch_python PATH "${TORCH_INSTALL_PREFIX}/lib")
find_package(Torch REQUIRED PATHS "${TORCH_INSTALL_PREFIX}/share/cmake/Torch" NO_DEFAULT_PATH)
add_subdirectory(kvc2)
add_subdirectory(sched)
# add_subdirectory(test)
Language: Cpp
# Base style: LLVM, Google, Chromium, Mozilla, WebKit, etc., or a custom style
BasedOnStyle: Google
# Indentation settings
IndentWidth: 2
TabWidth: 2
UseTab: Never
# Line-breaking settings
BreakBeforeBraces: Attach
AllowShortIfStatementsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
# Pointer alignment
DerivePointerAlignment: false
PointerAlignment: Left
# Include sorting and grouping
IncludeBlocks: Preserve
SortIncludes: true
# Maximum line width
ColumnLimit: 120
cmake_minimum_required(VERSION 3.21)
find_program(GCC_COMPILER NAMES g++-13 g++-12 g++-11 REQUIRED)
set(CMAKE_CXX_COMPILER ${GCC_COMPILER})
project(kvcache-manager VERSION 0.1.0)
set(CMAKE_CXX_STANDARD 20)
# set(CMAKE_CXX_FLAGS "-fPIC -O3 -ffast-math -march=native -Wall -Wextra -Wpedantic -fvisibility=hidden -s")
# set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -Wpedantic -g -fsanitize=address")
# set(CMAKE_CXX_FLAGS "-march=native -Wall -Wextra -Wpedantic -g")
# set(CMAKE_CXX_FLAGS "-fPIC -O3 -ffast-math -march=native -Wall -Wextra -g")
set(CMAKE_BUILD_TYPE "Release")
add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI})
# set(CMAKE_BUILD_TYPE "Debug")
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(BUILD_TEST OFF)
set(BUILD_PYTHON_EXT OFF)
# set(USE_IO_URING ON)
if(USE_IO_URING)
message(STATUS "Using io_uring")
add_compile_definitions(USE_IO_URING)
else()
message(STATUS "Using aio")
endif()
file(GLOB_RECURSE ALL_SOURCE_FILES src/*.cpp src/*.h test/*.cpp test/*.h test/*.hpp)
# Add a custom target to format all source files
if(NOT TARGET format)
add_custom_target(
format
COMMAND clang-format
-i
-style=file
${ALL_SOURCE_FILES}
COMMENT "Running clang-format on all source files"
)
endif()
execute_process(
COMMAND python3 -c "import torch; print(torch.__path__[0])"
OUTPUT_VARIABLE TORCH_INSTALL_PREFIX
OUTPUT_STRIP_TRAILING_WHITESPACE
)
message(STATUS "Found PyTorch at: ${TORCH_INSTALL_PREFIX}")
# set(TORCH_INSTALL_PREFIX "/home/xwy/.conda/envs/kvc/lib/python3.12/site-packages/torch")
find_library(TORCH_PYTHON_LIBRARY torch_python PATH "${TORCH_INSTALL_PREFIX}/lib")
find_package(Torch REQUIRED PATHS "${TORCH_INSTALL_PREFIX}/share/cmake/Torch" NO_DEFAULT_PATH)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party)
find_package(TBB REQUIRED)
find_package(CUDA REQUIRED)
# find_package(prometheus-cpp CONFIG REQUIRED)
if(NOT TARGET prometheus-cpp::pull)
message(FATAL_ERROR "prometheus-cpp::pull not found")
else()
message(STATUS "prometheus Found!")
endif()
if(CUDA_FOUND)
message(STATUS "CUDA Found!")
message(STATUS "CUDA Version: ${CUDA_VERSION_STRING}")
message(STATUS "CUDA Toolkit Root: ${CUDA_TOOLKIT_ROOT_DIR}")
else()
message(FATAL_ERROR "CUDA not found!")
endif()
add_subdirectory(src)
if(BUILD_TEST)
add_subdirectory(test)
endif()
message(STATUS "BUILD_PYTHON_EXT: ${BUILD_PYTHON_EXT}")
if(BUILD_PYTHON_EXT)
if(NOT TARGET pybind11::pybind11)
add_subdirectory(${THIRD_PARTY_DIR}/pybind11 ${THIRD_PARTY_BUILD_DIR}/pybind11)
endif()
pybind11_add_module(kvc2_ext src/bind.cpp)
# EXAMPLE_VERSION_INFO is defined by setup.py and passed into the C++ code as a
# define (VERSION_INFO) here.
target_compile_definitions(kvc2_ext PRIVATE VERSION_INFO=${EXAMPLE_VERSION_INFO})
message(STATUS "CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}")
target_include_directories(kvc2_ext PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/spdlog/include)
target_link_libraries(kvc2_ext PUBLIC kvc2 async_store)
install(TARGETS kvc2_ext LIBRARY
DESTINATION ${CMAKE_BINARY_DIR}/output)
install(FILES src/kvc2_utils.py
DESTINATION ${CMAKE_BINARY_DIR}/output)
endif()
# KVC2
# Build
Run the following commands to build kvc2. Note that sudo privileges may be needed to install some dependencies.
```shell
git clone https://github.com/kvcache-ai/kvc2
cd kvc2
./install_deps.sh
mkdir build
cd build
cmake ..
make -j && make install
```
After the build finishes, a `build/output` directory is generated containing `kvc2_ext.cpython-312-x86_64-linux-gnu.so` and `kvc2_utils.py` for easy use.
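As a rough sketch of how the extension might be driven from Python — the names below are taken from the commented-out pybind11 bindings in `src/bind.cpp`, so treat them as illustrative rather than the current API:

```python
import kvc2_ext

# Field names as exposed by the (commented-out) pybind11 bindings.
config = kvc2_ext.KVC2Config()
config.path = "/mnt/data/kvc2"   # example on-disk cache directory
kvc2 = kvc2_ext.create_kvc2(config)

kvc2.start_io_thread()
# ... use kvc2.raw_insert / kvc2.lookup / kvc2.raw_read against the cache ...
kvc2.stop_io_thread()
```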
<!-- # Test
Run the following command to test kvc2; you need to specify a disk path as the test directory.
```shell
./unit_test.sh ${DISK_PATH}
```
Or run the Python test file
```shell
python test/pytest_mem_read.py
``` -->
# Troubleshooting
When running in a Python environment, you may need to install the relevant dependencies via conda.
```shell
conda install -c conda-forge gcc_linux-64 gxx_linux-64
```
You can also try setting the following environment variables before running:
```shell
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libffi.so.7
```
{
"DeepSeek-Coder-V2-Instruct": {
"hidden_size": 5120,
"intermediate_size": 12288,
"max_position_embeddings": 163840,
"model_type": "deepseek_v2",
"num_attention_heads": 128,
"num_hidden_layers": 60,
"num_key_value_heads": 128,
"vocab_size": 102400
},
"LLaMA-2-7B-32K": {
"hidden_size": 4096,
"intermediate_size": 11008,
"max_position_embeddings": 32768,
"model_type": "llama",
"num_attention_heads": 32,
"num_hidden_layers": 32,
"num_key_value_heads": 32,
"vocab_size": 32000
},
"Qwen2.5-7B-Instruct": {
"hidden_size": 3584,
"intermediate_size": 18944,
"max_position_embeddings": 32768,
"model_type": "qwen2",
"num_attention_heads": 28,
"num_hidden_layers": 28,
"num_key_value_heads": 4,
"vocab_size": 152064
},
"qwen2-72b-instruct": {
"hidden_size": 8192,
"intermediate_size": 29568,
"max_position_embeddings": 32768,
"model_type": "qwen2",
"num_attention_heads": 64,
"num_hidden_layers": 80,
"num_key_value_heads": 8,
"vocab_size": 152064
}
}
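These fields are exactly what a KV-cache manager needs to size its buffers. A back-of-the-envelope sketch, assuming standard multi-head/grouped-query attention with `head_dim = hidden_size / num_attention_heads` and an FP16 cache (MLA models such as `deepseek_v2` store a compressed cache, so this overestimates for them):

```python
def kv_bytes_per_token(cfg, bytes_per_element=2):
    """Per-token KV-cache size: K and V for every layer and KV head."""
    head_dim = cfg["hidden_size"] // cfg["num_attention_heads"]
    return 2 * cfg["num_hidden_layers"] * cfg["num_key_value_heads"] * head_dim * bytes_per_element

qwen = {"hidden_size": 3584, "num_attention_heads": 28,
        "num_hidden_layers": 28, "num_key_value_heads": 4}
print(kv_bytes_per_token(qwen))  # 2 * 28 * 4 * 128 * 2 = 57344 bytes/token
```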
{
"BF16": {
"block_element_count": 1,
"block_element_size": 2,
"bytes_per_element": 2.0,
"can_be_used_as_vector": true,
"has_min": false,
"has_scale": false,
"name": "BF16",
"reference": "",
"type_of_dot_vector": "BF16"
},
"FP16": {
"block_element_count": 1,
"block_element_size": 2,
"bytes_per_element": 2.0,
"can_be_used_as_vector": true,
"has_min": false,
"has_scale": false,
"name": "FP16",
"reference": "",
"type_of_dot_vector": "FP16"
},
"FP32": {
"block_element_count": 1,
"block_element_size": 4,
"bytes_per_element": 4.0,
"can_be_used_as_vector": true,
"has_min": false,
"has_scale": false,
"name": "FP32",
"reference": "",
"type_of_dot_vector": "FP32"
},
"Q4_0": {
"block_element_count": 32,
"block_element_size": 18,
"bytes_per_element": 0.5625,
"can_be_used_as_vector": false,
"has_min": false,
"has_scale": true,
"name": "Q4_0",
"reference": "https://huggingface.co/docs/hub/gguf",
"type_of_dot_vector": "Q8_0"
},
"Q8_0": {
"block_element_count": 32,
"block_element_size": 34,
"bytes_per_element": 1.0625,
"can_be_used_as_vector": true,
"has_min": false,
"has_scale": true,
"name": "Q8_0",
"reference": "https://huggingface.co/docs/hub/gguf",
"type_of_dot_vector": "Q8_0"
}
}
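In the table above, `bytes_per_element` is derived rather than free: it is always `block_element_size / block_element_count`. A quick sanity check:

```python
quants = {"Q4_0": (32, 18), "Q8_0": (32, 34), "FP16": (1, 2), "FP32": (1, 4)}
for name, (block_count, block_size) in quants.items():
    print(name, block_size / block_count)
# Q4_0 0.5625, Q8_0 1.0625, FP16 2.0, FP32 4.0 -- matches bytes_per_element
```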
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libffi.so.7
#!/bin/bash
cd "${0%/*}"
git submodule update --init --recursive
sudo apt update
sudo apt install libtbb-dev
sudo apt install libcurl4-openssl-dev
sudo apt install libaio-dev
cd third_party/xxHash/
make -j
sudo make install
cd ../..
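# NOTE: the commands below are machine-specific leftovers (they reformat
# /dev/nvme0n1 and mount it for user "xwy"); run them only if that is really
# the scratch disk you intend to wipe.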
sudo umount /mnt/xwy
sudo mkfs.xfs /dev/nvme0n1 -f
sudo mount /dev/nvme0n1 /mnt/xwy
sudo chown -R xwy /mnt/xwy/
include_directories(${THIRD_PARTY_DIR}/asyncio/include)
add_library(kvc2_metrics STATIC metrics.cpp)
target_link_libraries(kvc2_metrics PUBLIC prometheus-cpp::pull)
add_library(page_aligned_memory_pool page_aligned_memory_pool.cpp)
target_include_directories(page_aligned_memory_pool PRIVATE ${THIRD_PARTY_DIR}/spdlog/include)
function(add_third_party_includes TARGET_NAME)
target_include_directories(${TARGET_NAME} PRIVATE
${THIRD_PARTY_BUILD_DIR}/prometheus-cpp/core/include
${THIRD_PARTY_BUILD_DIR}/prometheus-cpp/pull/include
${THIRD_PARTY_DIR}/prometheus-cpp/core/include
${THIRD_PARTY_DIR}/prometheus-cpp/pull/include
${THIRD_PARTY_DIR}/spdlog/include
)
endfunction()
add_library(cache_entry cache_entry.cpp)
add_third_party_includes(cache_entry)
target_link_libraries(cache_entry PUBLIC gpu_cache)
add_library(gpu_cache gpu_cache.cpp)
add_third_party_includes(gpu_cache)
target_link_libraries(gpu_cache PUBLIC xxHash::xxhash ${TORCH_LIBRARIES} cuda_stream_manager)
add_library(kvc2 prefix.cpp)
target_include_directories(kvc2 PRIVATE ${THIRD_PARTY_DIR}/nlohmann/single_include)
add_third_party_includes(kvc2)
target_link_libraries(kvc2 PUBLIC TBB::tbb xxHash::xxhash cache_entry cuda_stream_manager page_aligned_memory_pool ${TORCH_LIBRARIES} prometheus-cpp::pull kvc2_metrics)
message(STATUS "CMAKE_SOURCE_DIR: " ${CMAKE_SOURCE_DIR})
add_library(async_store async_store.cpp)
target_include_directories(async_store PRIVATE ${THIRD_PARTY_DIR}/nlohmann/single_include)
target_include_directories(async_store PRIVATE ${THIRD_PARTY_DIR}/spdlog/include)
target_link_libraries(async_store PUBLIC pthread)
add_library(cuda_stream_manager cuda_stream_manager.cpp)
target_include_directories(cuda_stream_manager PUBLIC ${THIRD_PARTY_DIR}/nlohmann/single_include)
target_include_directories(cuda_stream_manager PUBLIC ${THIRD_PARTY_DIR}/spdlog/include)
target_include_directories(cuda_stream_manager PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
target_link_libraries(cuda_stream_manager PUBLIC CUDA::cudart)
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <filesystem>
#include <future>
#include <iostream>
#include <nlohmann/json.hpp>
#include <optional>
#include <queue>
#include <thread>
#include <unordered_map>
#include "utils/lock_free_queue.hpp"
#include "async_store.hh"
namespace async_store {
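// Array of fixed-size elements persisted at data_path. In this snapshot the
// actual file I/O (read / write / extend) is still stubbed out with TODOs.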
struct ArrayStore {
static const size_t DeviceBlockSize = 512;
const size_t element_size;
const size_t element_size_aligned;
size_t size;
size_t size_in_bytes() { return size * element_size_aligned; }
std::filesystem::path data_path;
void extend(size_t to) {
if (to <= size) {
return;
}
// TODO: extend file
size = to;
// LOG_INFO("Extend file to `, size `", to, size_in_bytes());
}
ArrayStore(size_t element_size, size_t size, std::filesystem::path data_path)
: element_size(element_size),
// Round element_size up to the next multiple of the device block size.
element_size_aligned((element_size + DeviceBlockSize - 1) / DeviceBlockSize * DeviceBlockSize),
size(size),
data_path(data_path) {
// TODO: prefix cache
}
void read(size_t index, void* buffer) {
// TODO: read from file
}
void write(size_t index, void* buffer) {
// TODO: write to file
}
};
ArrayStore* create_or_open_store(size_t element_size, size_t size, std::filesystem::path data_path) {
return new ArrayStore(element_size, size, data_path);
}
void close_store(ArrayStore* store) {
delete store;
}
size_t capacity(ArrayStore* store) {
return store->size;
}
void extend(ArrayStore* store, size_t to) {
store->extend(to);
}
template <typename T>
struct ArrayStoreT {
ArrayStore store;
ArrayStoreT(size_t element_count, std::filesystem::path data_path) : store(sizeof(T), element_count, data_path) {}
void read(size_t index, void* output) { store.read(index, output); }
void write(size_t index, T& value) { store.write(index, &value); }
void write(size_t index, void* value) { store.write(index, value); }
};
std::string request_to_string(IORequest* req) {
return fmt::format("IORequest {} {} to {}[{}]", req->write ? "Write" : "Read ", req->data,
req->store->data_path.c_str(), req->index);
}
struct IODealerImpl {
MPSCQueue<IORequest> ioQueue;
uint64_t io_cnt = 0;
size_t io_amount = 0;
bool use_io_uring;
int IO_DEPTH;
bool stop = false;
IODealerImpl(bool use_io_uring, int IO_DEPTH) : use_io_uring(use_io_uring), IO_DEPTH(IO_DEPTH) {}
void queue_consumer() {
// TODO:
}
void io_perf() {
// TODO:
}
void io_dealer() {
// TODO:
}
};
IODealer::IODealer(bool use_io_uring, int IO_DEPTH) {
io_impl = new IODealerImpl(use_io_uring, IO_DEPTH);
}
IODealer::~IODealer() {
stop();
delete io_impl;
}
void IODealer::enqueue(std::shared_ptr<IORequest> req) {
io_impl->ioQueue.enqueue(req);
}
std::thread IODealer::start_io_thread() {
return std::thread([this]() { io_impl->io_dealer(); });
}
void IODealer::stop() {
if (io_impl->stop) {
return;
}
// LOG_INFO("Stopping IO Dealer");
io_impl->stop = true;
}
} // namespace async_store
#pragma once
#include <cstddef>
#include <filesystem>
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
#include "io_helper.hpp"
namespace async_store {
struct ArrayStore;
ArrayStore* create_or_open_store(size_t element_size, size_t size, std::filesystem::path data_path);
void close_store(ArrayStore* store);
size_t capacity(ArrayStore* store);
void extend(ArrayStore* store, size_t to);
struct IORequest {
ArrayStore* store;
bool write;
void* data;
size_t index;
// for sync
bool need_promise = false;
BatchPromise* promise;
};
std::string request_to_string(IORequest* req);
struct IODealerImpl;
struct IODealer {
IODealerImpl* io_impl;
IODealer(bool use_io_uring = false, int IO_DEPTH = 128);
~IODealer();
IODealer(const IODealer&) = delete;
IODealer& operator=(const IODealer&) = delete;
// The destructor deletes the raw io_impl pointer; defaulted moves would copy
// that pointer and cause a double delete, so moving is disallowed as well.
IODealer(IODealer&&) = delete;
IODealer& operator=(IODealer&&) = delete;
void enqueue(std::shared_ptr<IORequest> req);
std::thread start_io_thread();
void stop();
};
} // namespace async_store
// #include <pybind11/functional.h>
// #include <pybind11/pybind11.h>
// #include <pybind11/stl.h>
// #include <memory>
// #include <thread>
// #include <vector>
// #include "kvc2.h"
// #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
// #define FMT_HEADER_ONLY
// #include "spdlog/spdlog.h"
// #include "utils/arithmetic.hpp"
// namespace py = pybind11;
// PYBIND11_MODULE(kvc2_ext, m) {
// // Bind KVC2Config struct
// py::class_<kvc2::KVC2Config>(m, "KVC2Config")
// .def(py::init<>())
// .def_readwrite("path", &kvc2::KVC2Config::path)
// .def_readwrite("block_length", &kvc2::KVC2Config::num_token_per_page)
// .def_readwrite("memory_pool_size", &kvc2::KVC2Config::memory_pool_size)
// .def_readwrite("evict_count", &kvc2::KVC2Config::evict_count);
// // Bind CacheInfo struct
// py::class_<kvc2::CacheInfo>(m, "CacheInfo")
// .def(py::init<>())
// .def_readwrite("model_name", &kvc2::CacheInfo::model_name)
// .def_readwrite("is_key_cache", &kvc2::CacheInfo::is_key_cache)
// .def_readwrite("quant_type", &kvc2::CacheInfo::quant_type)
// .def("hidden_layer_count", &kvc2::CacheInfo::hidden_layer_count)
// .def("path", &kvc2::CacheInfo::path, py::arg("which_layer") = std::nullopt)
// .def("__eq__", &kvc2::CacheInfo::operator==)
// .def("element_size", &kvc2::CacheInfo::element_size)
// .def("hash_value", &kvc2::CacheInfo::hash_value);
// // Bind KVC2HandleInterface class
// py::class_<kvc2::KVC2HandleInterface, std::shared_ptr<kvc2::KVC2HandleInterface>>(m, "KVC2HandleInterface")
// .def("matched_length", &kvc2::SingleCacheHandleInterface::matched_length)
// .def("handle_data", &kvc2::KVC2HandleInterface::handle_data);
// // Bind KVC2Interface class
// py::class_<kvc2::KVC2Interface, std::shared_ptr<kvc2::KVC2Interface>>(m, "KVC2Interface")
// .def("start_io_thread", [](kvc2::KVC2Interface& self) { self.start_io_thread(); })
// .def("stop_io_thread", &kvc2::KVC2Interface::stop_io_thread)
// .def("load", &kvc2::KVC2Interface::load)
// .def("save", &kvc2::KVC2Interface::save)
// .def("raw_insert", &kvc2::KVC2Interface::raw_insert)
// .def("raw_read", &kvc2::KVC2Interface::raw_read)
// .def("lookup", &kvc2::KVC2Interface::lookup);
// // Bind create_kvc2 function
// m.def("create_kvc2", &kvc2::create_kvc2, py::arg("config"));
// }
#include "cache_entry.hh"
#include <mutex>
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
#include "gpu_cache.hh"
namespace kvc2 {
bool ConcurrentControlUnit::can_desert() {
if (ref_count.load() == 0 && dirty.load() == false) {
tc.reset();
return true;
} else {
return false;
}
}
void ConcurrentControlUnit::debug() {
SPDLOG_DEBUG("ref count {}, dirty {}, {}", ref_count.load(), dirty.load(), tc.debug());
}
CacheBlockEntry::~CacheBlockEntry() {
if (data != nullptr && manager && manager->pool) {
SPDLOG_WARN("Free {} when destruct", data);
free_on_cpu();
}
}
bool CacheBlockEntry::alloc_on_cpu() {
assert(data == nullptr);
data = manager->pool->alloc(size);
if (data == nullptr) {
manager->evict_for_cpu_cache();
data = manager->pool->alloc(size);
if (data == nullptr) {
SPDLOG_ERROR("Not enough memory for Block Cache");
return false;
}
}
return true;
}
void CacheBlockEntry::free_on_cpu() {
manager->pool->free(data, size);
data = nullptr;
}
bool CacheBlockEntry::alloc_on_cpu_no_lock() {
if (data == nullptr) {
if (alloc_on_cpu() == false) {
return false;
}
}
return true;
}
bool CacheBlockEntry::inc_ref_or_alloc_on_cpu() {
std::lock_guard<CacheBlockEntry::MutexT> lg(lock);
if (data == nullptr) {
if (alloc_on_cpu()) {
cpu_cc.ref_count.fetch_add(1);
return true;
} else {
return false;
}
} else {
cpu_cc.ref_count.fetch_add(1);
return true;
}
}
std::unique_lock<CacheBlockEntry::MutexT> CacheBlockEntry::try_lock() {
return std::unique_lock<CacheBlockEntry::MutexT>(lock, std::try_to_lock);
}
std::lock_guard<CacheBlockEntry::MutexT> CacheBlockEntry::lock_guard() {
return std::lock_guard<CacheBlockEntry::MutexT>(lock);
}
void CacheBlockEntry::debug() {
SPDLOG_DEBUG(
"CacheBlockEntry: disk[{:4},{:7}], with key {}, hash {:016x}, data: {}, ref_count: {}, size: {}, cpu tc: {}, "
"in page cache: {}, gpu ref count:{}, gpu tc: {}",
layer, idx, with_key, hash, data, cpu_cc.ref_count.load(), size, cpu_cc.tc.debug(), manager != nullptr,
gpu_cc.ref_count.load(), gpu_cc.tc.debug());
}
CacheBlockEntryCollector::CacheBlockEntryCollector(std::function<void(CacheBlockEntry*)> exit_fn) : exit_fn(exit_fn) {}
CacheBlockEntryCollector::~CacheBlockEntryCollector() {
// SPDLOG_DEBUG("Collector Destruct");
for (auto& e : entries) {
exit_fn(e);
}
}
void CacheBlockEntry::io_with(async_store::IODealer* dealer, IO_Helper<CacheBlockEntry>& io_helper,
async_store::ArrayStore* store, size_t layer, size_t index, IOOption option) {
bool write;
auto& batch_promise = io_helper.batch_promise;
switch (option) {
case IO_Read: {
write = false;
if (io_helper.absorb_tc(this, cpu_cc.tc)) {
// need read
} else {
return;
}
break;
}
case IO_ForceRead: {
// Not change
write = false;
break;
}
case IO_ForceWrite: {
// Not change
write = true;
break;
}
case IO_Write: {
write = true;
break;
}
default: {
assert(0);
}
}
io_helper.new_task();
this->layer = layer;
this->idx = index;
auto req = std::make_shared<async_store::IORequest>();
req->store = store;
req->data = data;
req->index = index;
req->write = write;
req->need_promise = true;
req->promise = &batch_promise;
SPDLOG_TRACE("Submitting {}", async_store::request_to_string(req.get()));
dealer->enqueue(std::move(req));
}
CacheEntryManager::CacheEntryManager(CacheEntryManagerConfig config) : config(config) {}
void CacheEntryManager::evict_for_cpu_cache() {
size_t count = 0;
evict(
[&count](const BlockPtr& block) {
// here we assume each with gpu must resides on cpu
if (block->data != nullptr && block->cpu_cc.can_desert() &&
block->gpu_cc.can_desert() /*For now If A Cache Entry Block is on GPU, it must on cpu. */) {
block->free_on_cpu();
count += 1;
return true;
} else {
return false;
}
},
[&count, this]() {
return false;
// return count == this->config.evict_count;
});
}
void CacheEntryManager::insert(BlockPtr entry) {
assert(entry->with_key);
assert(key_entry_map.count(entry->hash) == 0);
usage_list.push_front(entry);
key_entry_map[entry->hash] = usage_list.begin();
}
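// LRU touch: move the accessed entry to the front of usage_list and refresh
// its iterator stored in key_entry_map.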
CacheEntryManager::BlockPtr CacheEntryManager::access(const Key& key) {
auto it = key_entry_map.at(key);
auto entry = *it;
usage_list.erase(it);
usage_list.push_front(entry);
key_entry_map[key] = usage_list.begin();
return entry;
}
// void CacheEntryManager::remove(const Key& key) {
// auto it = key_entry_map[key];
// usage_list.erase(it);
// key_entry_map.erase(key);
// }
void CacheEntryManager::evict(std::function<bool(const BlockPtr&)> filter, std::function<bool()> stop_condition) {
auto evict_count = 0;
auto inspect_count = 0;
std::lock_guard<std::mutex> lg(lock);
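// Walk from the LRU end with reverse iterators. Erasing needs a forward
// iterator: erase(std::next(it).base()) removes the element *it refers to,
// and the result is converted back into a reverse iterator.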
for (auto it = usage_list.rbegin(); it != usage_list.rend();) {
inspect_count += 1;
// SPDLOG_DEBUG("Map Size {}, List Size {}, Evicted {} blocks, Inspected {}, {}", key_entry_map.size(),
// usage_list.size(), evict_count, inspect_count, pool->debug());
// (*it)->debug();
if (stop_condition())
break;
auto entry_ul = (*it)->try_lock();
if (entry_ul.owns_lock() == false) {
++it; // Ensure iterator advances when locking fails
continue;
}
if (filter(*it)) {
// SPDLOG_DEBUG("Evicting {}", fmt::ptr(it->get()));
evict_count++;
if ((*it)->with_key)
key_entry_map.erase((*it)->hash);
it = decltype(it)(usage_list.erase(std::next(it).base())); // Use base() to adjust for reverse iterator
} else {
++it; // Ensure iterator advances when filter fails
}
}
if (evict_count > 0) {
SPDLOG_DEBUG("Map Size {}, List Size {}, Evicted {} blocks, Inspected {}, {}", key_entry_map.size(),
usage_list.size(), evict_count, inspect_count, pool->debug());
}
}
CacheEntryManager::BlockPtr CacheEntryManager::get(bool& is_new, size_t size, std::optional<Key> key) {
std::unique_lock<std::mutex> ul(lock);
if (key.has_value()) {
if (key_entry_map.count(key.value())) {
is_new = false;
return access(key.value());
} else {
auto entry = std::make_shared<CacheBlockEntry>();
entry->with_key = true;
entry->hash = key.value();
entry->size = size;
entry->manager = this;
insert(entry);
is_new = true;
return entry;
}
} else {
auto entry = std::make_shared<CacheBlockEntry>();
entry->with_key = false;
entry->size = size;
entry->manager = this;
is_new = true;
return entry;
}
}
void CacheEntryManager::debug() {
fmt::print("Cache Manager: {} entries\n", key_entry_map.size());
pool->debug();
fmt::print("Layer 0 Entries in Order\n", key_entry_map.size());
for (auto& it : usage_list) {
if (it->layer == 0)
it->debug();
}
}
}; // namespace kvc2