Commit c009512a authored by Azure-Tang's avatar Azure-Tang
Browse files

Merge branch 'main' into hip

parents c1f13a69 4f22d726
FROM pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel as compile_server
WORKDIR /workspace
ENV CUDA_HOME /usr/local/cuda
RUN <<EOF
apt update -y && apt install -y --no-install-recommends \
git \
wget \
vim \
gcc \
g++ \
cmake &&
rm -rf /var/lib/apt/lists/* &&
pip install --upgrade pip &&
pip install ninja pyproject numpy cpufeature &&
pip install flash-attn &&
cp /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/lib/
EOF
# Set the default shell to bash
CMD ["/bin/bash"]
\ No newline at end of file
{
"name": "Ktrans Dev Container",
"privileged": true,
"build": {
"dockerfile": "Dockerfile",
"context": "..",
"args": {
"http_proxy": "${env:http_proxy}",
"https_proxy": "${env:https_proxy}",
}
},
"runArgs": [
"--network=host",
"--gpus",
"all"
// "--gpu all"
],
"workspaceFolder": "/workspace",
"workspaceMount": "source=${localWorkspaceFolder},target=/workspace,type=bind,consistency=cached",
"mounts": [
"source=/mnt/data,target=/mnt/incontainer,type=bind,consistency=cached"
],
"customizations": {
"vscode": {
"extensions": [
],
"settings": {
"terminal.integrated.shell.linux": "/bin/bash",
"cmake.configureOnOpen": true,
"cmake.generator": "Ninja"
}
}
}
}
\ No newline at end of file
name: 🐞 Bug report
description: Create a report to help us reproduce and fix the bug
title: "[Bug] "
labels: ['Bug']
body:
- type: checkboxes
attributes:
label: Checklist
options:
- label: 1. I have searched related issues but cannot get the expected help.
- label: 2. The bug has not been fixed in the latest version.
- label: 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback.
- label: 4. If the issue you raised is not a bug but a question, please raise a discussion at https://github.com/kvcache-ai/ktransformers/discussions. Otherwise, it will be closed.
- label: 5. To help the community, I will use Chinese/English or attach an Chinese/English translation if using another language. Non-Chinese/English content without translation may be closed.
- type: textarea
attributes:
label: Describe the bug
description: A clear and concise description of what the bug is.
validations:
required: true
- type: textarea
attributes:
label: Reproduction
description: |
What command or script did you run? Which **model** are you using?
placeholder: |
A placeholder for the command.
validations:
required: true
- type: textarea
attributes:
label: Environment
description: |
Please provide necessary environment information here (e.g. OS/GPU/CPU). Otherwise the issue will be close.
placeholder: Environment here.
validations:
required: true
\ No newline at end of file
name: 🐞 BUG报告
description: 创建报告以帮助我们复现并修复BUG
title: "[Bug] "
labels: ['Bug']
body:
- type: checkboxes
attributes:
label: 检查清单
options:
- label: 1. 我已经搜索过相关问题,但未能获得预期的帮助
- label: 2. 该问题在最新版本中尚未修复
- label: 3. 请注意,如果您提交的BUG相关 issue 缺少对应环境信息和最小可复现示例,我们将难以复现和定位问题,降低获得反馈的可能性
- label: 4. 如果您提出的不是bug而是问题,请在讨论区发起讨论 https://github.com/kvcache-ai/ktransformers/discussions。否则该 issue 将被关闭
- label: 5. 为方便社区交流,我将使用中文/英文或附上中文/英文翻译(如使用其他语言)。未附带翻译的非中文/英语内容可能会被关闭
- type: textarea
attributes:
label: 问题描述
description: 清晰简洁地描述BUG是什么
validations:
required: true
- type: textarea
attributes:
label: 复现步骤
description: |
你运行了什么命令或脚本?使用的是哪个**模型**?
placeholder: |
在此处填写命令
validations:
required: true
- type: textarea
attributes:
label: 环境信息
description: |
请提供必要的环境信息(如操作系统/GPU/CPU),否则该 issue 将被关闭
placeholder: 在此处填写环境信息
validations:
required: true
\ No newline at end of file
name: 🚀 Feature request
description: Suggest an idea for this project
title: "[Feature] "
body:
- type: checkboxes
attributes:
label: Checklist
options:
- label: 1. If the issue you raised is not a feature but a question, please raise a discussion at https://github.com/kvcache-ai/ktransformers/discussions. Otherwise, it will be closed.
- label: 2. To help the community, I will use Chinese/English or attach an Chinese/English translation if using another language. Non-English/Chinese content without translation may be closed.
- type: textarea
attributes:
label: Motivation
description: |
A clear and concise description of the motivation of the feature.
validations:
required: true
- type: textarea
attributes:
label: Related resources
description: |
If there is an official code release or third-party implementations, please also provide the information here, which would be very helpful.
\ No newline at end of file
name: 🚀 新功能请求
description: 为项目提出新功能建议
title: "[Feature] "
body:
- type: checkboxes
attributes:
label: 检查清单
options:
- label: 1. 如果您提出的不是新功能而是问题,请在讨论区发起讨论 https://github.com/kvcache-ai/ktransformers/discussions。否则该 issue 将被关闭
- label: 2. 为方便社区交流,我将使用中文/英文或附上英文/中文翻译(如使用其他语言)。未附带翻译的非英文/中文内容可能会被关闭
- type: textarea
attributes:
label: 需求背景
description: |
清晰简洁地描述该功能的背景需求
validations:
required: true
- type: textarea
attributes:
label: 相关资源
description: |
如果有官方代码实现或第三方实现,请在此提供相关信息,这将非常有帮助
\ No newline at end of file
name: Book-CI
on:
push:
branches:
- main
# - server_support
pull_request:
branches:
- main
# - server_support
jobs:
test:
name: test
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
steps:
- uses: actions/checkout@v4
- name: Install Rust
run: |
rustup set profile minimal
rustup toolchain install stable
rustup default stable
- name: Setup mdBook
uses: peaceiris/actions-mdbook@v2
with:
mdbook-version: "latest"
# - name: Run tests
# run: mdbook test
\ No newline at end of file
name: Deploy
on:
push:
branches:
- main
# - server_support
pull_request:
branches:
- main
# - server_support
defaults:
run:
shell: bash
permissions:
contents: write
jobs:
deploy:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
steps:
- uses: actions/checkout@v4
- name: Install Rust
run: |
rustup set profile minimal
rustup toolchain install stable
rustup default stable
- name: Setup mdBook
uses: peaceiris/actions-mdbook@v2
with:
mdbook-version: "latest"
- run: mdbook build
# - name: Copy Assets
# run: |
# chmod +x ci/copy-assets.sh
# ci/copy-assets.sh ${{ matrix.os }}
- name: Deploy
uses: peaceiris/actions-gh-pages@v3
# or || github.ref == 'refs/heads/server_support'
if: ${{ github.ref == 'refs/heads/main' }}
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
publish_dir: ./book
\ No newline at end of file
name: DockerHub CI
on:
release:
types: [published]
workflow_dispatch:
inputs:
choose:
description: 'Will you push the image to DockerHub? 0 for No, 1 for Yes'
required: true
default: '0'
type: string
# push:
# branches:
# - main
env:
DOCKERHUB_REPO: ${{ secrets.DOCKERHUB_USERNAME }}/ktransformers
jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Run tests
run: |
if [ -f docker-compose.test.yml ]; then
docker-compose --file docker-compose.test.yml build
docker-compose --file docker-compose.test.yml run sut
else
docker build . --file Dockerfile
fi
docker_task:
needs: test
name: ${{ matrix.instruct}}
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
include:
# for amd64
- {instruct: "FANCY", platform: "linux/amd64"}
- {instruct: "AVX512", platform: "linux/amd64"}
- {instruct: "AVX2", platform: "linux/amd64"}
- {instruct: "NATIVE", platform: "linux/amd64"}
# for arm64
- {instruct: "NATIVE", platform: "linux/arm64"}
steps:
- name: Move Docker data directory
run: |
sudo systemctl stop docker
sudo mkdir -p /mnt/docker
sudo rsync -avz /var/lib/docker/ /mnt/docker
sudo rm -rf /var/lib/docker
sudo ln -s /mnt/docker /var/lib/docker
sudo systemctl start docker
-
name: Set up QEMU
uses: docker/setup-qemu-action@v3
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
-
name: Build and push for amd64
if: matrix.platform == 'linux/amd64'
uses: docker/build-push-action@v6
with:
push: true
platforms: |
linux/amd64
tags: |
${{ env.DOCKERHUB_REPO }}:latest-${{ matrix.instruct }}
${{ env.DOCKERHUB_REPO }}:${{ github.event.release.tag_name }}-${{ matrix.instruct }}
build-args: |
CPU_INSTRUCT=${{ matrix.instruct }}
-
name: Build and push for arm64
if: matrix.platform == 'linux/arm64'
uses: docker/build-push-action@v6
with:
push: true
platforms: |
linux/arm64
tags: |
${{ env.DOCKERHUB_REPO }}:latest-${{ matrix.instruct }}
${{ env.DOCKERHUB_REPO }}:${{ github.event.release.tag_name }}-${{ matrix.instruct }}
build-args: |
CPU_INSTRUCT=${{ matrix.instruct }}
\ No newline at end of file
name: Install KTransformers
run-name: Install KTransformers
on:
workflow_dispatch:
inputs:
job_to_run:
description: "Which job to run?"
required: true
default: "install"
type: choice
options:
- create&install
- install
jobs:
Install-KTransformers:
runs-on: self-hosted
steps:
- run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event."
- run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}."
- name: Check out repository code
uses: actions/checkout@v4
- run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner."
- name: Remove old conda environment
continue-on-error: true
if: ${{ inputs.job_to_run == 'create&install'}}
run: |
source /home/qujing3/anaconda3/etc/profile.d/conda.sh
conda env remove --name ktransformers-dev -y
- name: Create conda environment
if: ${{ inputs.job_to_run == 'create&install'}}
run: |
source /home/qujing3/anaconda3/etc/profile.d/conda.sh
conda create --name ktransformers-dev python=3.11
conda activate ktransformers-dev
conda install -c conda-forge libstdcxx-ng -y
- name: Install dependencies
if: ${{ inputs.job_to_run == 'create&install'}}
run: |
source /home/qujing3/anaconda3/etc/profile.d/conda.sh
conda activate ktransformers-dev
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
pip3 install packaging ninja cpufeature numpy
pip install ~/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiTRUE-cp311-cp311-linux_x86_64.whl
- name: Install KTransformers
run: |
source /home/qujing3/anaconda3/etc/profile.d/conda.sh
conda activate ktransformers-dev
cd ${{ github.workspace }}
git submodule init
git submodule update
USE_NUMA=1 bash install.sh
- run: echo "This job's status is ${{ job.status }}."
...@@ -19,6 +19,10 @@ ktransformers/server/local_store/ ...@@ -19,6 +19,10 @@ ktransformers/server/local_store/
ktransformers/server_test1.db ktransformers/server_test1.db
*.patch *.patch
img/ img/
tmp1.txt tmp*.txt
test_65_300_1536.txt
test.txt test.txt
book
ktransformers/tests/chat_txt.txt
mmlu_result*
ktransformers/ktransformers_ext/cuda_musa/
test_prompt.txt
...@@ -10,7 +10,8 @@ EOF ...@@ -10,7 +10,8 @@ EOF
FROM pytorch/pytorch:2.3.1-cuda12.1-cudnn8-devel as compile_server FROM pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel as compile_server
ARG CPU_INSTRUCT=NATIVE
WORKDIR /workspace WORKDIR /workspace
ENV CUDA_HOME /usr/local/cuda ENV CUDA_HOME /usr/local/cuda
COPY --from=web_compile /home/ktransformers /workspace/ktransformers COPY --from=web_compile /home/ktransformers /workspace/ktransformers
...@@ -26,10 +27,12 @@ rm -rf /var/lib/apt/lists/* && ...@@ -26,10 +27,12 @@ rm -rf /var/lib/apt/lists/* &&
cd ktransformers && cd ktransformers &&
git submodule init && git submodule init &&
git submodule update && git submodule update &&
pip install --upgrade pip &&
pip install ninja pyproject numpy cpufeature && pip install ninja pyproject numpy cpufeature &&
pip install flash-attn && pip install flash-attn &&
CPU_INSTRUCT=NATIVE KTRANSFORMERS_FORCE_BUILD=TRUE TORCH_CUDA_ARCH_LIST="8.0;8.6;8.7;8.9;9.0+PTX" pip install . --no-build-isolation --verbose && CPU_INSTRUCT=${CPU_INSTRUCT} KTRANSFORMERS_FORCE_BUILD=TRUE TORCH_CUDA_ARCH_LIST="8.0;8.6;8.7;8.9;9.0+PTX" pip install . --no-build-isolation --verbose &&
pip cache purge pip cache purge &&
cp /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/lib/
EOF EOF
ENTRYPOINT [ "/opt/conda/bin/ktransformers" ] ENTRYPOINT ["tail", "-f", "/dev/null"]
\ No newline at end of file \ No newline at end of file
...@@ -18,4 +18,15 @@ dev_install: ...@@ -18,4 +18,15 @@ dev_install:
echo "Installing ktransformers" echo "Installing ktransformers"
KTRANSFORMERS_FORCE_BUILD=TRUE pip install -e . -v --no-build-isolation KTRANSFORMERS_FORCE_BUILD=TRUE pip install -e . -v --no-build-isolation
echo "Installation completed successfully" echo "Installation completed successfully"
\ No newline at end of file clean:
rm -rf build
rm -rf *.egg-info
rm -rf ktransformers/ktransformers_ext/build
rm -rf ktransformers/ktransformers_ext/cuda/build
rm -rf ktransformers/ktransformers_ext/cuda/dist
rm -rf ktransformers/ktransformers_ext/cuda/*.egg-info
install_numa:
USE_NUMA=1 make dev_install
install_no_numa:
env -u USE_NUMA make dev_install
...@@ -23,14 +23,16 @@ Our vision for KTransformers is to serve as a flexible platform for experimentin ...@@ -23,14 +23,16 @@ Our vision for KTransformers is to serve as a flexible platform for experimentin
<h2 id="Updates">🔥 Updates</h2> <h2 id="Updates">🔥 Updates</h2>
* **Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup. The detailed tutorial is [here](./doc/en/DeepseekR1_V3_tutorial.md). * **Mar 5, 2025**: Support unsloth 1.58/2.51 bits weights and [IQ1_S/FP8 hybrid](./doc/en/fp8_kernel.md) weights. Support 139K [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context) for DeepSeek-V3 and R1 in 24GB VRAM.
* **Aug 28, 2024**: Support 1M context under the InternLM2.5-7B-Chat-1M model, utilizing 24GB of VRAM and 150GB of DRAM. The detailed tutorial is [here](./doc/en/long_context_tutorial.md). * **Feb 25, 2025**: Support [FP8 GPU kernel](./doc/en/fp8_kernel.md) for DeepSeek-V3 and R1; [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context).
* **Feb 15, 2025**: Longer Context (from 4K to 8K for 24GB VRAM) & Slightly Faster Speed (+15%, up to 16 Tokens/s), update [docs](./doc/en/DeepseekR1_V3_tutorial.md) and [online books](https://kvcache-ai.github.io/ktransformers/).
* **Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup. For detailed show case and reproduction tutorial, see [here](./doc/en/DeepseekR1_V3_tutorial.md).
* **Aug 28, 2024**: Decrease DeepseekV2's required VRAM from 21G to 11G. * **Aug 28, 2024**: Decrease DeepseekV2's required VRAM from 21G to 11G.
* **Aug 15, 2024**: Update detailed [TUTORIAL](doc/en/injection_tutorial.md) for injection and multi-GPU. * **Aug 15, 2024**: Update detailed [tutorial](doc/en/injection_tutorial.md) for injection and multi-GPU.
* **Aug 14, 2024**: Support llamfile as linear backend. * **Aug 14, 2024**: Support llamfile as linear backend.
* **Aug 12, 2024**: Support multiple GPU; Support new model: mixtral 8\*7B and 8\*22B; Support q2k, q3k, q5k dequant on gpu. * **Aug 12, 2024**: Support multiple GPU; Support new model: mixtral 8\*7B and 8\*22B; Support q2k, q3k, q5k dequant on gpu.
* **Aug 9, 2024**: Support windows native. * **Aug 9, 2024**: Support windows native.
<!-- * **Aug 28, 2024**: Support 1M context under the InternLM2.5-7B-Chat-1M model, utilizing 24GB of VRAM and 150GB of DRAM. The detailed tutorial is [here](./doc/en/long_context_tutorial.md). -->
<h2 id="show-cases">🌟 Show Cases</h2> <h2 id="show-cases">🌟 Show Cases</h2>
<div> <div>
...@@ -43,10 +45,10 @@ https://github.com/user-attachments/assets/ebd70bfa-b2c1-4abb-ae3b-296ed38aa285 ...@@ -43,10 +45,10 @@ https://github.com/user-attachments/assets/ebd70bfa-b2c1-4abb-ae3b-296ed38aa285
- **[NEW!!!] Local 671B DeepSeek-Coder-V3/R1:** Running its Q4_K_M version using only 14GB VRAM and 382GB DRAM([Tutorial](./doc/en/DeepseekR1_V3_tutorial.md)). - **[NEW!!!] Local 671B DeepSeek-Coder-V3/R1:** Running its Q4_K_M version using only 14GB VRAM and 382GB DRAM([Tutorial](./doc/en/DeepseekR1_V3_tutorial.md)).
- Prefill Speed (tokens/s): - Prefill Speed (tokens/s):
- KTransfermor: 54.21 (32 cores) → 74.362 (dual-socket, 2×32 cores) → 255.26 (optimized AMX-based MoE kernel, V0.3 only) → 286.55 (selectively using 6 experts, V0.3 only) - KTransformers: 54.21 (32 cores) → 74.362 (dual-socket, 2×32 cores) → 255.26 (optimized AMX-based MoE kernel, V0.3 only) → 286.55 (selectively using 6 experts, V0.3 only)
- Compared to 10.31 tokens/s in llama.cpp with 2×32 cores, achieving up to **27.79× speedup**. - Compared to 10.31 tokens/s in llama.cpp with 2×32 cores, achieving up to **27.79× speedup**.
- Decode Speed (tokens/s): - Decode Speed (tokens/s):
- KTransfermor: 8.73 (32 cores) → 11.26 (dual-socket, 2×32 cores) → 13.69 (selectively using 6 experts, V0.3 only) - KTransformers: 8.73 (32 cores) → 11.26 (dual-socket, 2×32 cores) → 13.69 (selectively using 6 experts, V0.3 only)
- Compared to 4.51 tokens/s in llama.cpp with 2×32 cores, achieving up to **3.03× speedup**. - Compared to 4.51 tokens/s in llama.cpp with 2×32 cores, achieving up to **3.03× speedup**.
- Upcoming Open Source Release: - Upcoming Open Source Release:
- AMX optimizations and selective expert activation will be open-sourced in V0.3. - AMX optimizations and selective expert activation will be open-sourced in V0.3.
...@@ -69,7 +71,7 @@ https://github.com/user-attachments/assets/4c6a8a38-05aa-497d-8eb1-3a5b3918429c ...@@ -69,7 +71,7 @@ https://github.com/user-attachments/assets/4c6a8a38-05aa-497d-8eb1-3a5b3918429c
</p> </p>
<h3>1M Context Local Inference on a Desktop with Only 24GB VRAM</h3> <!-- <h3>1M Context Local Inference on a Desktop with Only 24GB VRAM</h3>
<p align="center"> <p align="center">
https://github.com/user-attachments/assets/a865e5e4-bca3-401e-94b8-af3c080e6c12 https://github.com/user-attachments/assets/a865e5e4-bca3-401e-94b8-af3c080e6c12
...@@ -91,228 +93,20 @@ https://github.com/user-attachments/assets/a865e5e4-bca3-401e-94b8-af3c080e6c12 ...@@ -91,228 +93,20 @@ https://github.com/user-attachments/assets/a865e5e4-bca3-401e-94b8-af3c080e6c12
* **Enhanced Speed**: Reaches 16.91 tokens/s for generation with a 1M context using sparse attention, powered by llamafile kernels. This method is over 10 times faster than full attention approach of llama.cpp. * **Enhanced Speed**: Reaches 16.91 tokens/s for generation with a 1M context using sparse attention, powered by llamafile kernels. This method is over 10 times faster than full attention approach of llama.cpp.
* **Flexible Sparse Attention Framework**: Offers a flexible block sparse attention framework for CPU offloaded decoding. Compatible with SnapKV, Quest, and InfLLm. Further information is available [here](./doc/en/long_context_introduction.md). * **Flexible Sparse Attention Framework**: Offers a flexible block sparse attention framework for CPU offloaded decoding. Compatible with SnapKV, Quest, and InfLLm. Further information is available [here](./doc/en/long_context_introduction.md).
-->
<strong>More advanced features will coming soon, so stay tuned!</strong> <strong>More advanced features will coming soon, so stay tuned!</strong>
<h2 id="quick-start">🚀 Quick Start</h2> <h2 id="quick-start">🚀 Quick Start</h2>
<h3>Preparation</h3>
Some preparation:
- CUDA 12.1 and above, if you didn't have it yet, you may install from [here](https://developer.nvidia.com/cuda-downloads).
```sh
# Adding CUDA to PATH
export PATH=/usr/local/cuda/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export CUDA_PATH=/usr/local/cuda
```
- Linux-x86_64 with gcc, g++ and cmake
```sh
sudo apt-get update
sudo apt-get install gcc g++ cmake ninja-build
```
- We recommend using [Conda](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh) to create a virtual environment with Python=3.11 to run our program.
```sh
conda create --name ktransformers python=3.11
conda activate ktransformers # you may need to run ‘conda init’ and reopen shell first
```
- Make sure that PyTorch, packaging, ninja is installed
```
pip install torch packaging ninja cpufeature numpy
```
<h3>Installation</h3>
1. Use a Docker image, see [documentation for Docker](./doc/en/Docker.md)
2. You can install using Pypi (for linux):
```
pip install ktransformers --no-build-isolation
```
for windows we prepare a pre compiled whl package on [ktransformers-0.2.0+cu125torch24avx2-cp312-cp312-win_amd64.whl](https://github.com/kvcache-ai/ktransformers/releases/download/v0.2.0/ktransformers-0.2.0+cu125torch24avx2-cp312-cp312-win_amd64.whl), which require cuda-12.5, torch-2.4, python-3.11, more pre compiled package are being produced.
3. Or you can download source code and compile:
- init source code
```sh
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule init
git submodule update
```
- [Optional] If you want to run with website, please [compile the website](./doc/en/api/server/website.md) before execute ```bash install.sh```
- Compile and install (for Linux)
```
bash install.sh
```
- Compile and install(for Windows)
```
install.bat
```
4. If you are developer, you can make use of the makefile to compile and format the code. <br> the detailed usage of makefile is [here](./doc/en/makefile_usage.md)
<h3>Local Chat</h3>
We provide a simple command-line local chat Python script that you can run for testing.
> Note that this is a very simple test tool only support one round chat without any memory about last input, if you want to try full ability of the model, you may go to [RESTful API and Web UI](#id_666). We use the DeepSeek-V2-Lite-Chat-GGUF model as an example here. But we also support other models, you can replace it with any other model that you want to test.
<h4>Run Example</h4>
```shell
# Begin from root of your cloned repo!
# Begin from root of your cloned repo!!
# Begin from root of your cloned repo!!!
# Download mzwing/DeepSeek-V2-Lite-Chat-GGUF from huggingface
mkdir DeepSeek-V2-Lite-Chat-GGUF
cd DeepSeek-V2-Lite-Chat-GGUF
wget https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/resolve/main/DeepSeek-V2-Lite-Chat.Q4_K_M.gguf -O DeepSeek-V2-Lite-Chat.Q4_K_M.gguf
cd .. # Move to repo's root dir
# Start local chat
python -m ktransformers.local_chat --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
# If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try:
# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite
# python ktransformers.local_chat --model_path ./DeepSeek-V2-Lite --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
```
It features the following arguments:
- `--model_path` (required): Name of the model (such as "deepseek-ai/DeepSeek-V2-Lite-Chat" which will automatically download configs from [Hugging Face](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite)). Or if you already got local files you may directly use that path to initialize the model.
> Note: <strong>.safetensors</strong> files are not required in the directory. We only need config files to build model and tokenizer.
- `--gguf_path` (required): Path of a directory containing GGUF files which could that can be downloaded from [Hugging Face](https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/tree/main). Note that the directory should only contains GGUF of current model, which means you need one separate directory for each model.
- `--optimize_rule_path` (required except for Qwen2Moe and DeepSeek-V2): Path of YAML file containing optimize rules. There are two rule files pre-written in the [ktransformers/optimize/optimize_rules](ktransformers/optimize/optimize_rules) directory for optimizing DeepSeek-V2 and Qwen2-57B-A14, two SOTA MoE models.
- `--max_new_tokens`: Int (default=1000). Maximum number of new tokens to generate.
- `--cpu_infer`: Int (default=10). The number of CPUs used for inference. Should ideally be set to the (total number of cores - 2).
<h3 id="suggested-model"> Suggested Model</h3>
| Model Name | Model Size | VRAM | Minimum DRAM | Recommended DRAM |
| ------------------------------ | ---------- | ----- | --------------- | ----------------- |
| DeepSeek-R1-q4_k_m | 377G | 14G | 382G | 512G |
| DeepSeek-V3-q4_k_m | 377G | 14G | 382G | 512G |
| DeepSeek-V2-q4_k_m | 133G | 11G | 136G | 192G |
| DeepSeek-V2.5-q4_k_m | 133G | 11G | 136G | 192G |
| DeepSeek-V2.5-IQ4_XS | 117G | 10G | 107G | 128G |
| Qwen2-57B-A14B-Instruct-q4_k_m | 33G | 8G | 34G | 64G |
| DeepSeek-V2-Lite-q4_k_m | 9.7G | 3G | 13G | 16G |
| Mixtral-8x7B-q4_k_m | 25G | 1.6G | 51G | 64G |
| Mixtral-8x22B-q4_k_m | 80G | 4G | 86.1G | 96G |
| InternLM2.5-7B-Chat-1M | 15.5G | 15.5G | 8G(32K context) | 150G (1M context) |
More will come soon. Please let us know which models you are most interested in.
Be aware that you need to be subject to their corresponding model licenses when using [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2/blob/main/LICENSE) and [QWen](https://huggingface.co/Qwen/Qwen2-72B-Instruct/blob/main/LICENSE).
<details>
<summary>Click To Show how to run other examples</summary>
* Qwen2-57B
```sh
pip install flash_attn # For Qwen2
mkdir Qwen2-57B-GGUF && cd Qwen2-57B-GGUF
wget https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct-GGUF/resolve/main/qwen2-57b-a14b-instruct-q4_k_m.gguf?download=true -O qwen2-57b-a14b-instruct-q4_k_m.gguf
cd ..
python -m ktransformers.local_chat --model_name Qwen/Qwen2-57B-A14B-Instruct --gguf_path ./Qwen2-57B-GGUF
# If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try:
# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct
# python ktransformers/local_chat.py --model_path ./Qwen2-57B-A14B-Instruct --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
```
* DeepseekV2
```sh
mkdir DeepSeek-V2-Chat-0628-GGUF && cd DeepSeek-V2-Chat-0628-GGUF
# Download weights
wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00001-of-00004.gguf -o DeepSeek-V2-Chat-0628-Q4_K_M-00001-of-00004.gguf
wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00002-of-00004.gguf -o DeepSeek-V2-Chat-0628-Q4_K_M-00002-of-00004.gguf
wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00003-of-00004.gguf -o DeepSeek-V2-Chat-0628-Q4_K_M-00003-of-00004.gguf
wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00004-of-00004.gguf -o DeepSeek-V2-Chat-0628-Q4_K_M-00004-of-00004.gguf
cd .. Getting started with KTransformers is simple! Follow the steps below to set up and start using it.
python -m ktransformers.local_chat --model_name deepseek-ai/DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF ### 📥 Installation
# If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try: To install KTransformers, follow the official [Installation Guide](https://kvcache-ai.github.io/ktransformers/en/install.html).
# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat-0628
# python -m ktransformers.local_chat --model_path ./DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
```
| model name | weights download link |
|----------|----------|
| Qwen2-57B | [Qwen2-57B-A14B-gguf-Q4K-M](https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct-GGUF/tree/main) |
| DeepseekV2-coder |[DeepSeek-Coder-V2-Instruct-gguf-Q4K-M](https://huggingface.co/LoneStriker/DeepSeek-Coder-V2-Instruct-GGUF/tree/main) |
| DeepseekV2-chat |[DeepSeek-V2-Chat-gguf-Q4K-M](https://huggingface.co/bullerwins/DeepSeek-V2-Chat-0628-GGUF/tree/main) |
| DeepseekV2-lite | [DeepSeek-V2-Lite-Chat-GGUF-Q4K-M](https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/tree/main) |
</details>
<!-- pin block for jump -->
<span id='id_666'>
<h3>RESTful API and Web UI</h3>
Start without website:
```sh
ktransformers --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path /path/to/DeepSeek-V2-Lite-Chat-GGUF --port 10002
```
Start with website:
```sh
ktransformers --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path /path/to/DeepSeek-V2-Lite-Chat-GGUF --port 10002 --web True
```
Or you want to start server with transformers, the model_path should include safetensors
```bash
ktransformers --type transformers --model_path /mnt/data/model/Qwen2-0.5B-Instruct --port 10002 --web True
```
Access website with url [http://localhost:10002/web/index.html#/chat](http://localhost:10002/web/index.html#/chat) :
<p align="center">
<picture>
<img alt="Web UI" src="https://github.com/user-attachments/assets/615dca9b-a08c-4183-bbd3-ad1362680faf" width=90%>
</picture>
</p>
More information about the RESTful API server can be found [here](doc/en/api/server/server.md). You can also find an example of integrating with Tabby [here](doc/en/api/server/tabby.md).
<h2 id="tutorial">📃 Brief Injection Tutorial</h2> <h2 id="tutorial">📃 Brief Injection Tutorial</h2>
At the heart of KTransformers is a user-friendly, template-based injection framework. At the heart of KTransformers is a user-friendly, template-based injection framework.
...@@ -333,7 +127,7 @@ To utilize the provided kernels, users only need to create a YAML-based injectio ...@@ -333,7 +127,7 @@ To utilize the provided kernels, users only need to create a YAML-based injectio
```python ```python
with torch.device("meta"): with torch.device("meta"):
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True) model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
optimize_and_load_gguf(model, optimize_rule_path, gguf_path, config) optimize_and_load_gguf(model, optimize_config_path, gguf_path, config)
... ...
generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens=1000) generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens=1000)
``` ```
...@@ -368,14 +162,14 @@ If you are interested in our design principles and the implementation of the inj ...@@ -368,14 +162,14 @@ If you are interested in our design principles and the implementation of the inj
<h2 id="ack">Acknowledgment and Contributors</h2> <h2 id="ack">Acknowledgment and Contributors</h2>
The development of KTransformer is based on the flexible and versatile framework provided by Transformers. We also benefit from advanced kernels such as GGUF/GGML, Llamafile, and Marlin. We are planning to contribute back to the community by upstreaming our modifications. The development of KTransformer is based on the flexible and versatile framework provided by Transformers. We also benefit from advanced kernels such as GGUF/GGML, Llamafile, Marlin, sglang and flashinfer. We are planning to contribute back to the community by upstreaming our modifications.
KTransformer is actively maintained and developed by contributors from the <a href="https://madsys.cs.tsinghua.edu.cn/">MADSys group</a> at Tsinghua University and members from <a href="http://approaching.ai/">Approaching.AI</a>. We welcome new contributors to join us in making KTransformer faster and easier to use. KTransformer is actively maintained and developed by contributors from the <a href="https://madsys.cs.tsinghua.edu.cn/">MADSys group</a> at Tsinghua University and members from <a href="http://approaching.ai/">Approaching.AI</a>. We welcome new contributors to join us in making KTransformer faster and easier to use.
<h2 id="ack">Discussion</h2> <h2 id="ack">Discussion</h2>
If you have any questions, feel free to open an issue. Alternatively, you can join our WeChat group for further discussion. QR Code: [WeChat Group](WeChatGrouop.jpg) If you have any questions, feel free to open an issue. Alternatively, you can join our WeChat group for further discussion. QR Code: [WeChat Group](WeChatGroup.png)
<h2 id="FAQ">🙋 FAQ</h2> <h2 id="FAQ">🙋 FAQ</h2>
......
<div align="center">
<!-- <h1>KTransformers</h1> -->
<p align="center">
<picture>
<img alt="KTransformers" src="https://github.com/user-attachments/assets/d5a2492f-a415-4456-af99-4ab102f13f8b" width=50%>
</picture>
</p>
<h3>一个用于体验尖端 LLM 推理优化的灵活框架</h3>
<strong><a href="#show-cases">🌟 案例展示</a> | <a href="#quick-start">🚀 快速入门</a> | <a href="#tutorial">📃 教程</a> | <a href="https://github.com/kvcache-ai/ktransformers/discussions">💬 讨论</a> | <a href="#FAQ">🙋 常见问题</a> </strong>
</div>
<h2 id="intro">🎉 介绍</h2>
KTransformers(发音为 Quick Transformers)旨在通过先进的内核优化和放置/并行策略来增强您对 🤗 [Transformers](https://github.com/huggingface/transformers) 的体验。
<br/><br/>
KTransformers 是一个以 Python 为中心的灵活框架,其核心是可扩展性。通过用一行代码实现并注入优化模块,用户可以获得与 Transformers 兼容的接口、符合 OpenAI 和 Ollama 的 RESTful API,甚至是一个简化的类似 ChatGPT 的 Web 界面。
<br/><br/>
我们对 KTransformers 的愿景是成为一个用于实验创新 LLM 推理优化的灵活平台。如果您需要任何其他功能,请告诉我们。
<h2 id="Updates">🔥 更新</h2>
* **2025 年 2 月 15 日**:为DeepSeek-V3/R1支持[FP8 GPU内核](./doc/en/fp8_kernel.md); 支持更长的上下文([教程](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context)).
* **2025 年 2 月 15 日**:长上下文(从4K到8K,24GB VRAM) & 稍快的速度(+15%)(最快 16 Tokens/s),文档请参见 [这里](./doc/en/DeepseekR1_V3_tutorial.md)[在线指南](https://kvcache-ai.github.io/ktransformers/)
* **2025 年 2 月 10 日**:支持 Deepseek-R1 和 V3 在单个(24GB VRAM)/多 GPU 和 382G DRAM 上运行,速度提升高达 3~28 倍。详细教程请参见 [这里](./doc/en/DeepseekR1_V3_tutorial.md)
* **2024 年 8 月 28 日**:支持 InternLM2.5-7B-Chat-1M 模型下的 1M 上下文,使用 24GB 的 VRAM 和 150GB 的 DRAM。详细教程请参见 [这里](./doc/en/long_context_tutorial.md)
* **2024 年 8 月 28 日**:将 DeepseekV2 所需的 VRAM 从 21G 降低到 11G。
* **2024 年 8 月 15 日**:更新了详细的 [教程](doc/en/injection_tutorial.md),介绍注入和多 GPU 的使用。
* **2024 年 8 月 14 日**:支持 llamfile 作为线性后端。
* **2024 年 8 月 12 日**:支持多 GPU;支持新模型:mixtral 8\*7B 和 8\*22B;支持 q2k、q3k、q5k 在 GPU 上的去量化。
* **2024 年 8 月 9 日**:支持 Windows。
<h2 id="show-cases">🌟 案例展示</h2>
<div>
<h3>在仅 24GB VRAM 的桌面上运行 GPT-4/o1 级别的本地 VSCode Copilot</h3>
</div>
https://github.com/user-attachments/assets/ebd70bfa-b2c1-4abb-ae3b-296ed38aa285
</p>
- **[NEW!!!] 本地 671B DeepSeek-Coder-V3/R1**:使用其 Q4_K_M 版本,仅需 14GB VRAM 和 382GB DRAM 即可运行(教程请参见 [这里](./doc/en/DeepseekR1_V3_tutorial.md))。
- 预填充速度(tokens/s):
- KTransformers:54.21(32 核)→ 74.362(双插槽,2×32 核)→ 255.26(优化的 AMX 基 MoE 内核,仅 V0.3)→ 286.55(选择性使用 6 个专家,仅 V0.3)
- 与 llama.cpp 在 2×32 核下相比,达到 **27.79× 速度提升**
- 解码速度(tokens/s):
- KTransformers:8.73(32 核)→ 11.26(双插槽,2×32 核)→ 13.69(选择性使用 6 个专家,仅 V0.3)
- 与 llama.cpp 在 2×32 核下相比,达到 **3.03× 速度提升**
- 即将开源发布:
- AMX 优化和选择性专家激活将在 V0.3 中开源。
- 目前仅在预览二进制分发中可用,可从 [这里](./doc/en/DeepseekR1_V3_tutorial.md) 下载。
- **本地 236B DeepSeek-Coder-V2**:使用其 Q4_K_M 版本,仅需 21GB VRAM 和 136GB DRAM 即可运行,甚至在 [BigCodeBench](https://huggingface.co/blog/leaderboard-bigcodebench) 中得分超过 GPT4-0613。
<p align="center">
<picture>
<img alt="DeepSeek-Coder-V2 Score" src="https://github.com/user-attachments/assets/d052924e-8631-44de-aad2-97c54b965693" width=100%>
</picture>
</p>
- **更快的速度**:通过 MoE 卸载和注入来自 [Llamafile](https://github.com/Mozilla-Ocho/llamafile/tree/main)[Marlin](https://github.com/IST-DASLab/marlin) 的高级内核,实现了 2K 提示预填充 126 tokens/s 和生成 13.6 tokens/s 的速度。
- **VSCode 集成**:封装成符合 OpenAI 和 Ollama 的 API,可无缝集成到 [Tabby](https://github.com/TabbyML/tabby) 和其他前端的后端。
<p align="center">
https://github.com/user-attachments/assets/4c6a8a38-05aa-497d-8eb1-3a5b3918429c
</p>
<!-- <h3>在仅 24GB VRAM 的桌面上进行 1M 上下文本地推理</h3>
<p align="center"> -->
<!-- https://github.com/user-attachments/assets/a865e5e4-bca3-401e-94b8-af3c080e6c12 -->
<!--
* **1M 上下文 InternLM 2.5 7B**:以全 bf16 精度运行,使用 24GB VRAM 和 150GB DRAM,可在本地桌面设置中实现。在 1M "针在干草堆中" 测试中达到 92.88% 的成功率,在 128K NIAH 测试中达到 100%。
<p align="center">
<picture>
<img alt="Single Needle Retrieval 128K" src="./doc/assets/needle_128K.png" width=100%>
</picture>
</p>
<p align="center">
<picture>
<img alt="Single Needle Retrieval 1000K" src="./doc/assets/needle_1M.png" width=100%>
</picture>
</p>
* **增强的速度**:使用稀疏注意力,通过 llamafile 内核实现 1M 上下文生成 16.91 tokens/s 的速度。这种方法比 llama.cpp 的全注意力方法快 10 倍以上。
* **灵活的稀疏注意力框架**:提供了一个灵活的块稀疏注意力框架,用于 CPU 卸载解码。与 SnapKV、Quest 和 InfLLm 兼容。更多信息请参见 [这里](./doc/en/long_context_introduction.md)。 -->
<strong>更多高级功能即将推出,敬请期待!</strong>
<h2 id="quick-start">🚀 快速入门</h2>
KTransformers 的入门非常简单!请参考我们的[安装指南]((https://kvcache-ai.github.io/ktransformers/))进行安装。
<h2 id="tutorial">📃 简要注入教程</h2>
KTransformers 的核心是一个用户友好的、基于模板的注入框架。这使得研究人员可以轻松地将原始 torch 模块替换为优化的变体。它还简化了多种优化的组合过程,允许探索它们的协同效应。
</br>
<p align="center">
<picture>
<img alt="Inject-Struction" src="https://github.com/user-attachments/assets/6b4c1e54-9f6d-45c5-a3fc-8fa45e7d257e" width=65%>
</picture>
</p>
鉴于 vLLM 已经是一个用于大规模部署优化的优秀框架,KTransformers 特别关注受资源限制的本地部署。我们特别关注异构计算时机,例如量化模型的 GPU/CPU 卸载。例如,我们支持高效的 <a herf="https://github.com/Mozilla-Ocho/llamafile/tree/main">Llamafile</a><a herf="https://github.com/IST-DASLab/marlin">Marlin</a> 内核,分别用于 CPU 和 GPU。 更多详细信息可以在 <a herf="doc/en/operators/llamafile.md">这里</a>找到。
<h3>示例用法</h3>
要使用提供的内核,用户只需创建一个基于 YAML 的注入模板,并在使用 Transformers 模型之前添加对 `optimize_and_load_gguf` 的调用。
```python
with torch.device("meta"):
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
optimize_and_load_gguf(model, optimize_config_path, gguf_path, config)
...
generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens=1000)
```
在这个示例中,首先在 meta 设备上初始化 AutoModel,以避免占用任何内存资源。然后,`optimize_and_load_gguf` 遍历模型的所有子模块,匹配您的 YAML 规则文件中指定的规则,并将它们替换为指定的高级模块。
注入后,原始的 `generate` 接口仍然可用,但我们还提供了一个兼容的 `prefill_and_generate` 方法,这使得可以进一步优化,例如使用 CUDAGraph 提高生成速度。
<h3>如何自定义您的模型</h3>
一个详细的使用 DeepSeek-V2 作为示例的注入和 multi-GPU 教程在 [这里](doc/en/injection_tutorial.md)
以下是一个将所有原始 Linear 模块替换为 Marlin 的 YAML 模板示例,Marlin 是一个高级的 4 位量化内核。
```yaml
- match:
name: "^model\\.layers\\..*$" # 正则表达式
class: torch.nn.Linear # 仅匹配同时符合名称和类的模块
replace:
class: ktransformers.operators.linear.KTransformerLinear # 量化数据类型的优化内核
device: "cpu" # 初始化时加载该模块的 device
kwargs:
generate_device: "cuda"
generate_linear_type: "QuantizedLinearMarlin"
```
YAML 文件中的每个规则都有两部分:`match``replace``match` 部分指定应替换的模块,`replace` 部分指定要注入到模型中的模块以及初始化关键字。
您可以在 [ktransformers/optimize/optimize_rules](ktransformers/optimize/optimize_rules) 目录中找到用于优化 DeepSeek-V2 和 Qwen2-57B-A14 的示例规则模板。这些模板用于为 `local_chat.py` 示例提供支持。
如果您对我们的设计原则和注入框架的实现感兴趣,请参考 [设计文档](doc/en/deepseek-v2-injection.md)
<h2 id="ack">致谢和贡献者</h2>
KTransformer 的开发基于 Transformers 提供的灵活和多功能框架。我们还受益于 GGUF/GGML、Llamafile 、 Marlin、sglang和flashinfer 等高级内核。我们计划通过向上游贡献我们的修改来回馈社区。
KTransformer 由清华大学 <a href="https://madsys.cs.tsinghua.edu.cn/">MADSys group</a> 小组的成员以及 <a href="http://approaching.ai/">Approaching.AI</a> 的成员积极维护和开发。我们欢迎新的贡献者加入我们,使 KTransformer 更快、更易于使用。
<h2 id="ack">讨论</h2>
如果您有任何问题,欢迎随时提出 issue。或者,您可以加入我们的微信群进行进一步讨论。二维码: [微信群](WeChatGroup.png)
<h2 id="FAQ">🙋 常见问题</h2>
一些常见问题的答案可以在 [FAQ](doc/en/FAQ.md) 中找到。
[book]
authors = ["kvcache-ai"]
language = "zh-CN"
title = "Ktransformers"
src = "doc"
[output.html]
git-repository-url = "https://github.com/kvcache-ai/ktransformers"
edit-url-template = "https://github.com/kvcache-ai/ktransformers/edit/main/{path}"
[output.html.playground]
editable = true
copy-js = true
# line-numbers = true
[output.html.fold]
enable = true
level = 0
\ No newline at end of file
<div align="center">
<!-- <h1>KTransformers</h1> -->
<p align="center">
<picture>
<img alt="KTransformers" src="https://github.com/user-attachments/assets/d5a2492f-a415-4456-af99-4ab102f13f8b" width=50%>
</picture>
</p>
</div>
<h2 id="intro">🎉 Introduction</h2>
KTransformers, pronounced as Quick Transformers, is designed to enhance your 🤗 <a href="https://github.com/huggingface/transformers">Transformers</a> experience with advanced kernel optimizations and placement/parallelism strategies.
<br/><br/>
KTransformers is a flexible, Python-centric framework designed with extensibility at its core.
By implementing and injecting an optimized module with a single line of code, users gain access to a Transformers-compatible
interface, RESTful APIs compliant with OpenAI and Ollama, and even a simplified ChatGPT-like web UI.
<br/><br/>
Our vision for KTransformers is to serve as a flexible platform for experimenting with innovative LLM inference optimizations. Please let us know if you need any other features.
<h2 id="Updates">🔥 Updates</h2>
* **Mar 5, 2025**: Support unsloth 1.58/2.51 bits weights and [IQ1_S/FP8 hybrid](./doc/en/fp8_kernel.md) weights. Support 139K [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context) for DeepSeek-V3 and R1 in 24GB VRAM.
* **Feb 25, 2025**: Support [FP8 GPU kernel](./doc/en/fp8_kernel.md) for DeepSeek-V3 and R1; [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context).
* **Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup. The detailed tutorial is [here](./en/DeepseekR1_V3_tutorial.md).
* **Aug 28, 2024**: Support 1M context under the InternLM2.5-7B-Chat-1M model, utilizing 24GB of VRAM and 150GB of DRAM. The detailed tutorial is [here](./en/long_context_tutorial.md).
* **Aug 28, 2024**: Decrease DeepseekV2's required VRAM from 21G to 11G.
* **Aug 15, 2024**: Update detailed [TUTORIAL](./en/injection_tutorial.md) for injection and multi-GPU.
* **Aug 14, 2024**: Support llamfile as linear backend.
* **Aug 12, 2024**: Support multiple GPU; Support new model: mixtral 8\*7B and 8\*22B; Support q2k, q3k, q5k dequant on gpu.
* **Aug 9, 2024**: Support windows native.
# Ktransformer
[Introduction](./README.md)
# Install
- [Installation Guide](en/install.md)
# Tutorial
- [Deepseek-R1/V3 Show Case/Tutorial](en/DeepseekR1_V3_tutorial.md)
- [Why KTransformers So Fast](en/deepseek-v2-injection.md)
- [Injection Tutorial](en/injection_tutorial.md)
- [Multi-GPU Tutorial](en/multi-gpu-tutorial.md)
- [Use FP8 GPU Kernel](en/fp8_kernel.md)
# Server
- [Server](en/api/server/server.md)
- [Website](en/api/server/website.md)
- [Tabby](en/api/server/tabby.md)
# For Developer
- [Makefile Usage](en/makefile_usage.md)
# FAQ
- [FAQ](en/FAQ.md)
# V3 Reproduction
- [Success List](en/V3-success.md)
# Benchmark
- [Benchmark](en/benchmark.md)
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment