Unverified commit ca1dc1e7, authored by Atream, committed by GitHub

Merge branch 'main' into main

parents d3b45d57 505f4e2c
FROM pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel as compile_server
WORKDIR /workspace
ENV CUDA_HOME /usr/local/cuda
RUN <<EOF
apt update -y && apt install -y --no-install-recommends \
git \
wget \
vim \
gcc \
g++ \
cmake &&
rm -rf /var/lib/apt/lists/* &&
pip install --upgrade pip &&
pip install ninja pyproject numpy cpufeature &&
pip install flash-attn &&
cp /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/lib/
EOF
# Set the default shell to bash
CMD ["/bin/bash"]
{
"name": "Ktrans Dev Container",
"privileged": true,
"build": {
"dockerfile": "Dockerfile",
"context": "..",
"args": {
"http_proxy": "${env:http_proxy}",
"https_proxy": "${env:https_proxy}",
}
},
"runArgs": [
"--network=host",
"--gpus",
"all"
// "--gpu all"
],
"workspaceFolder": "/workspace",
"workspaceMount": "source=${localWorkspaceFolder},target=/workspace,type=bind,consistency=cached",
"mounts": [
"source=/mnt/data,target=/mnt/incontainer,type=bind,consistency=cached"
],
"customizations": {
"vscode": {
"extensions": [
],
"settings": {
"terminal.integrated.shell.linux": "/bin/bash",
"cmake.configureOnOpen": true,
"cmake.generator": "Ninja"
}
}
}
}
name: 🐞 Bug report
description: Create a report to help us reproduce and fix the bug
title: "[Bug] "
labels: ['Bug']
body:
- type: checkboxes
attributes:
label: Checklist
options:
- label: 1. I have searched related issues but cannot get the expected help.
- label: 2. The bug has not been fixed in the latest version.
- label: 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback.
- label: 4. If the issue you raised is not a bug but a question, please raise a discussion at https://github.com/kvcache-ai/ktransformers/discussions. Otherwise, it will be closed.
- label: 5. To help the community, I will use Chinese/English or attach a Chinese/English translation if using another language. Non-Chinese/English content without translation may be closed.
- type: textarea
attributes:
label: Describe the bug
description: A clear and concise description of what the bug is.
validations:
required: true
- type: textarea
attributes:
label: Reproduction
description: |
What command or script did you run? Which **model** are you using?
placeholder: |
A placeholder for the command.
validations:
required: true
- type: textarea
attributes:
label: Environment
description: |
Please provide the necessary environment information here (e.g. OS/GPU/CPU). Otherwise the issue will be closed.
placeholder: Environment here.
validations:
required: true
name: 🐞 Bug report
description: Create a report to help us reproduce and fix the bug
title: "[Bug] "
labels: ['Bug']
body:
- type: checkboxes
attributes:
label: Checklist
options:
- label: 1. I have searched related issues but could not get the expected help.
- label: 2. The bug has not been fixed in the latest version.
- label: 3. Please note that if your bug report lacks the corresponding environment info and a minimal reproducible demo, it will be hard for us to reproduce and locate the problem, reducing the likelihood of receiving feedback.
- label: 4. If what you raise is a question rather than a bug, please start a discussion at https://github.com/kvcache-ai/ktransformers/discussions. Otherwise this issue will be closed.
- label: 5. To facilitate community communication, I will use Chinese/English or attach a Chinese/English translation if using another language. Non-Chinese/English content without a translation may be closed.
- type: textarea
attributes:
label: Describe the bug
description: A clear and concise description of what the bug is
validations:
required: true
- type: textarea
attributes:
label: Reproduction
description: |
What command or script did you run? Which **model** are you using?
placeholder: |
Put the command here
validations:
required: true
- type: textarea
attributes:
label: Environment
description: |
Please provide the necessary environment information (e.g. OS/GPU/CPU); otherwise this issue will be closed
placeholder: Put environment info here
validations:
required: true
name: 🚀 Feature request
description: Suggest an idea for this project
title: "[Feature] "
body:
- type: checkboxes
attributes:
label: Checklist
options:
- label: 1. If the issue you raised is not a feature but a question, please raise a discussion at https://github.com/kvcache-ai/ktransformers/discussions. Otherwise, it will be closed.
- label: 2. To help the community, I will use Chinese/English or attach a Chinese/English translation if using another language. Non-English/Chinese content without translation may be closed.
- type: textarea
attributes:
label: Motivation
description: |
A clear and concise description of the motivation of the feature.
validations:
required: true
- type: textarea
attributes:
label: Related resources
description: |
If there is an official code release or third-party implementations, please also provide the information here, which would be very helpful.
name: 🚀 Feature request
description: Suggest a new feature for this project
title: "[Feature] "
body:
- type: checkboxes
attributes:
label: Checklist
options:
- label: 1. If what you raise is a question rather than a feature request, please start a discussion at https://github.com/kvcache-ai/ktransformers/discussions. Otherwise this issue will be closed.
- label: 2. To facilitate community communication, I will use Chinese/English or attach an English/Chinese translation if using another language. Non-English/Chinese content without a translation may be closed.
- type: textarea
attributes:
label: Motivation
description: |
A clear and concise description of the motivation behind this feature
validations:
required: true
- type: textarea
attributes:
label: Related resources
description: |
If there is an official code release or third-party implementations, please provide that information here; it would be very helpful
name: DockerHub CI
on:
release:
types: [published]
workflow_dispatch:
inputs:
choose:
description: 'Will you push the image to DockerHub? 0 for No, 1 for Yes'
required: true
default: '0'
type: string
# push:
# branches:
# - main
env:
DOCKERHUB_REPO: ${{ secrets.DOCKERHUB_USERNAME }}/ktransformers
jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Run tests
run: |
if [ -f docker-compose.test.yml ]; then
docker-compose --file docker-compose.test.yml build
docker-compose --file docker-compose.test.yml run sut
else
docker build . --file Dockerfile
fi
docker_task:
needs: test
name: ${{ matrix.instruct}}
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
include:
# for amd64
- {instruct: "FANCY", platform: "linux/amd64"}
- {instruct: "AVX512", platform: "linux/amd64"}
- {instruct: "AVX2", platform: "linux/amd64"}
- {instruct: "NATIVE", platform: "linux/amd64"}
# for arm64
- {instruct: "NATIVE", platform: "linux/arm64"}
steps:
- name: Move Docker data directory
run: |
sudo systemctl stop docker
sudo mkdir -p /mnt/docker
sudo rsync -avz /var/lib/docker/ /mnt/docker
sudo rm -rf /var/lib/docker
sudo ln -s /mnt/docker /var/lib/docker
sudo systemctl start docker
-
name: Set up QEMU
uses: docker/setup-qemu-action@v3
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
-
name: Build and push for amd64
if: matrix.platform == 'linux/amd64'
uses: docker/build-push-action@v6
with:
push: true
platforms: |
linux/amd64
tags: |
${{ env.DOCKERHUB_REPO }}:latest-${{ matrix.instruct }}
${{ env.DOCKERHUB_REPO }}:${{ github.event.release.tag_name }}-${{ matrix.instruct }}
build-args: |
CPU_INSTRUCT=${{ matrix.instruct }}
-
name: Build and push for arm64
if: matrix.platform == 'linux/arm64'
uses: docker/build-push-action@v6
with:
push: true
platforms: |
linux/arm64
tags: |
${{ env.DOCKERHUB_REPO }}:latest-${{ matrix.instruct }}
${{ env.DOCKERHUB_REPO }}:${{ github.event.release.tag_name }}-${{ matrix.instruct }}
build-args: |
CPU_INSTRUCT=${{ matrix.instruct }}
@@ -19,7 +19,10 @@ ktransformers/server/local_store/
ktransformers/server_test1.db
*.patch
img/
-tmp1.txt
+tmp*.txt
-test_65_300_1536.txt
test.txt
book
ktransformers/tests/chat_txt.txt
mmlu_result*
ktransformers/ktransformers_ext/cuda_musa/
test_prompt.txt
@@ -10,7 +10,8 @@ EOF
-FROM pytorch/pytorch:2.3.1-cuda12.1-cudnn8-devel as compile_server
+FROM pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel as compile_server
ARG CPU_INSTRUCT=NATIVE
WORKDIR /workspace
ENV CUDA_HOME /usr/local/cuda
COPY --from=web_compile /home/ktransformers /workspace/ktransformers
@@ -26,10 +27,12 @@ rm -rf /var/lib/apt/lists/* &&
cd ktransformers &&
git submodule init &&
git submodule update &&
pip install --upgrade pip &&
pip install ninja pyproject numpy cpufeature &&
pip install flash-attn &&
-CPU_INSTRUCT=NATIVE KTRANSFORMERS_FORCE_BUILD=TRUE TORCH_CUDA_ARCH_LIST="8.0;8.6;8.7;8.9;9.0+PTX" pip install . --no-build-isolation --verbose &&
+CPU_INSTRUCT=${CPU_INSTRUCT} KTRANSFORMERS_FORCE_BUILD=TRUE TORCH_CUDA_ARCH_LIST="8.0;8.6;8.7;8.9;9.0+PTX" pip install . --no-build-isolation --verbose &&
-pip cache purge
+pip cache purge &&
cp /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/lib/
EOF
ENTRYPOINT ["tail", "-f", "/dev/null"]
@@ -18,4 +18,8 @@ dev_install:
echo "Installing ktransformers"
KTRANSFORMERS_FORCE_BUILD=TRUE pip install -e . -v --no-build-isolation
echo "Installation completed successfully"
install_numa:
USE_NUMA=1 make dev_install
install_no_numa:
env -u USE_NUMA make dev_install
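The two new targets simply wrap `dev_install`; typical usage (a minimal sketch) is:

```shell
# dual-socket machines: rebuild the CPU kernels with NUMA support
make install_numa
# single-socket machines: plain development install
make install_no_numa
```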
@@ -23,7 +23,8 @@ Our vision for KTransformers is to serve as a flexible platform for experimentin
<h2 id="Updates">🔥 Updates</h2>
-* **Feb 15, 2025**: KTransformers V0.2.1: Longer Context (from 4K to 8K for 24GB VRAM) & Slightly Faster Speed (+15%) (Up to 16 Tokens/s), update docs [here](./doc/en/DeepseekR1_V3_tutorial.md) and [online books](https://kvcache-ai.github.io/ktransformers/).
+* **Feb 25, 2025**: Support [FP8 GPU kernel](./doc/en/fp8_kernel.md) for DeepSeek-V3 and R1; [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context).
* **Feb 15, 2025**: Longer Context (from 4K to 8K for 24GB VRAM) & Slightly Faster Speed (+15%, up to 16 Tokens/s), update [docs](./doc/en/DeepseekR1_V3_tutorial.md) and [online books](https://kvcache-ai.github.io/ktransformers/).
* **Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup. For detailed show case and reproduction tutorial, see [here](./doc/en/DeepseekR1_V3_tutorial.md).
* **Aug 28, 2024**: Decrease DeepseekV2's required VRAM from 21G to 11G.
* **Aug 15, 2024**: Update detailed [tutorial](doc/en/injection_tutorial.md) for injection and multi-GPU.
@@ -103,7 +104,7 @@ Getting started with KTransformers is simple! Follow the steps below to set up a
### 📥 Installation
-To install KTransformers, follow the official [Installation Guide](https://kvcache-ai.github.io/ktransformers/).
+To install KTransformers, follow the official [Installation Guide](https://kvcache-ai.github.io/ktransformers/en/install.html).
<h2 id="tutorial">📃 Brief Injection Tutorial</h2>
@@ -125,7 +126,7 @@ To utilize the provided kernels, users only need to create a YAML-based injectio
```python
with torch.device("meta"):
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
-optimize_and_load_gguf(model, optimize_rule_path, gguf_path, config)
+optimize_and_load_gguf(model, optimize_config_path, gguf_path, config)
...
generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens=1000)
```
......
@@ -21,6 +21,8 @@ KTransformers is a flexible, Python-centric framework whose core is extensibility
<h2 id="Updates">🔥 Updates</h2>
* **Feb 15, 2025**: Support the [FP8 GPU kernel](./doc/en/fp8_kernel.md) for DeepSeek-V3/R1; support longer context ([tutorial](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context)).
* **Feb 15, 2025**: Longer context (from 4K to 8K with 24GB VRAM) & slightly faster speed (+15%, up to 16 tokens/s); see the [docs](./doc/en/DeepseekR1_V3_tutorial.md) and the [online guide](https://kvcache-ai.github.io/ktransformers/).
* **Feb 10, 2025**: Support Deepseek-R1 and V3 running on a single GPU (24GB VRAM)/multi-GPU and 382G DRAM, with up to 3~28x speedup. See the detailed tutorial [here](./doc/en/DeepseekR1_V3_tutorial.md).
* **Aug 28, 2024**: Support 1M context with the InternLM2.5-7B-Chat-1M model, using 24GB of VRAM and 150GB of DRAM. See the detailed tutorial [here](./doc/en/long_context_tutorial.md).
* **Aug 28, 2024**: Decrease DeepseekV2's required VRAM from 21G to 11G.
@@ -67,11 +69,11 @@ https://github.com/user-attachments/assets/4c6a8a38-05aa-497d-8eb1-3a5b3918429c
</p>
-<h3>1M context local inference on a desktop with only 24GB VRAM</h3>
+<!-- <h3>1M context local inference on a desktop with only 24GB VRAM</h3>
-<p align="center">
+<p align="center"> -->
-https://github.com/user-attachments/assets/a865e5e4-bca3-401e-94b8-af3c080e6c12
<!-- https://github.com/user-attachments/assets/a865e5e4-bca3-401e-94b8-af3c080e6c12 -->
<!--
* **1M context InternLM 2.5 7B**: Runs at full bf16 precision using 24GB VRAM and 150GB DRAM, achievable on a local desktop setup. Achieves a 92.88% success rate on the 1M "needle in a haystack" test and 100% on the 128K NIAH test.
<p align="center">
@@ -88,7 +90,7 @@ https://github.com/user-attachments/assets/a865e5e4-bca3-401e-94b8-af3c080e6c12
* **Enhanced speed**: Using sparse attention via the llamafile kernels, reaches 16.91 tokens/s for 1M-context generation, more than 10x faster than llama.cpp's full-attention approach.
-* **Flexible sparse attention framework**: Provides a flexible block-sparse attention framework for CPU-offloaded decoding. Compatible with SnapKV, Quest, and InfLLm. See more [here](./doc/en/long_context_introduction.md)
+* **Flexible sparse attention framework**: Provides a flexible block-sparse attention framework for CPU-offloaded decoding. Compatible with SnapKV, Quest, and InfLLm. See more [here](./doc/en/long_context_introduction.md) -->
<strong>More advanced features coming soon, stay tuned!</strong>
@@ -115,7 +117,7 @@ At the core of KTransformers is a user-friendly, template-based injection framework.
```python
with torch.device("meta"):
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
-optimize_and_load_gguf(model, optimize_rule_path, gguf_path, config)
+optimize_and_load_gguf(model, optimize_config_path, gguf_path, config)
...
generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens=1000)
```
@@ -150,7 +152,7 @@ Each rule in the YAML file has two parts: `match` and `replace`. `match`
<h2 id="ack">Acknowledgments and Contributors</h2>
-The development of KTransformers is based on the flexible and versatile framework provided by Transformers. We have also benefited from advanced kernels such as GGUF/GGML, Llamafile, and Marlin. We plan to give back to the community by contributing our modifications upstream.
+The development of KTransformers is based on the flexible and versatile framework provided by Transformers. We have also benefited from advanced kernels such as GGUF/GGML, Llamafile, Marlin, sglang, and flashinfer. We plan to give back to the community by contributing our modifications upstream.
KTransformers is actively maintained and developed by members of the <a href="https://madsys.cs.tsinghua.edu.cn/">MADSys group</a> at Tsinghua University and members of <a href="http://approaching.ai/">Approaching.AI</a>. We welcome new contributors to join us in making KTransformers faster and easier to use.
......
WeChatGroup.png updated (image replaced: 829 KB → 258 KB)
@@ -22,6 +22,7 @@ Our vision for KTransformers is to serve as a flexible platform for experimentin
<h2 id="Updates">🔥 Updates</h2>
* **Feb 25, 2025**: Support [FP8 GPU kernel](./doc/en/fp8_kernel.md) for DeepSeek-V3 and R1; [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context).
* **Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup. The detailed tutorial is [here](./en/DeepseekR1_V3_tutorial.md).
* **Aug 28, 2024**: Support 1M context under the InternLM2.5-7B-Chat-1M model, utilizing 24GB of VRAM and 150GB of DRAM. The detailed tutorial is [here](./en/long_context_tutorial.md).
* **Aug 28, 2024**: Decrease DeepseekV2's required VRAM from 21G to 11G.
......
@@ -5,11 +5,12 @@
- [Installation Guide](en/install.md)
# Tutorial
-- [Deepseek-R1/V3 Show Case](en/DeepseekR1_V3_tutorial.md)
+- [Deepseek-R1/V3 Show Case/Tutorial](en/DeepseekR1_V3_tutorial.md)
- [Why KTransformers So Fast](en/deepseek-v2-injection.md)
- [Injection Tutorial](en/injection_tutorial.md)
- [Multi-GPU Tutorial](en/multi-gpu-tutorial.md)
-# Server (Temporary Deprecated)
+- [Use FP8 GPU Kernel](en/fp8_kernel.md)
# Server
- [Server](en/api/server/server.md)
- [Website](en/api/server/website.md)
- [Tabby](en/api/server/tabby.md)
@@ -19,4 +20,6 @@
# FAQ
- [FAQ](en/FAQ.md)
# V3 Reproduction
- [Success List](en/V3-success.md)
# Benchmark
- [Benchmark](en/benchmark.md)
@@ -16,6 +16,9 @@
- [Memory consumptions:](#memory-consumptions)
- [Benchmark results](#benchmark-results-2)
- [How to Run](#how-to-run)
- [V0.2.2 longer context \& FP8 kernel](#v022-longer-context--fp8-kernel)
- [longer context](#longer-context)
- [FP8 kernel](#fp8-kernel)
- [V0.2 \& V0.2.1 Showcase](#v02--v021-showcase)
- [Single socket version (32 cores)](#single-socket-version-32-cores)
- [Dual socket version (64 cores)](#dual-socket-version-64-cores)
@@ -90,7 +93,7 @@ Integrated the highly efficient Triton MLA Kernel from the fantastic sglang proj
"6 experts" case is part of V0.3's preview
-| Prompt | hi (2) | 1K (969) | 2K (1930) | 4K (3846) | llama.cpp (8 experts) |
+| Prompt | hi (2) | 1K (969) | 2K (1930) | 4K (3846) | 8K (7678) |
| --- | --- | --- | --- | --- | --- |
| Output length | 10tokens | 300tokens | 300tokens | 300tokens | 300tokens |
| **6 experts V0.2.0** | | | | | |
@@ -154,6 +157,37 @@ the output quality doesn't change. But the speed of decoding and prefill
is speed up which is inspiring. So our showcase makes use of this finding*
## How to Run
### V0.2.2 longer context & FP8 kernel
#### longer context
To use this feature, [install flashinfer](https://github.com/flashinfer-ai/flashinfer) first.
Note: The latest MLA kernel in FlashInfer still has a few minor issues; they are being fixed continuously on the main branch. If you are using FlashInfer, please install it from source from the main branch.
If you want to use a long context (longer than 20K tokens) for prefill, enable matrix-absorption MLA during the prefill phase, which significantly reduces the size of the KV cache. Modify the YAML file like this:
```
- match:
name: "^model\\.layers\\..*\\.self_attn$"
replace:
class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
absorb_for_prefill: True # set to True to enable long-context prefill (prefill may be slower).
```
If the VRAM is still insufficient, try reducing the `chunk_prefill_size` parameter (default 8192) to further shrink the intermediate buffers used during chunked prefill.
#### FP8 kernel
The DeepSeek-AI team provides FP8 safetensors for the DeepSeek-R1/V3 models. We achieve performance optimization through the following work:
- **FP8 GPU Kernel Integration**: FP8 linear layer acceleration kernels integrated in KTransformers
- **Hybrid Quantization Architecture**:
- Attention and Shared-Expert modules use FP8 precision (enhances computational accuracy)
- Experts modules retain GGML quantization (GGUF format, reside in CPU to save GPU memory)
So those who are pursuing the best performance can use the FP8 linear kernel for DeepSeek-V3/R1.
The detailed guide is [here](./fp8_kernel.md).
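For quick reference, the launch command from that guide looks like the following (paths are placeholders for your local setup):

```shell
python ktransformers/local_chat.py \
  --model_path deepseek-ai/DeepSeek-V3 \
  --gguf_path <merged_weights_folder> \
  --optimize_config_path ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts.yaml \
  --cpu_infer <cpu_cores + 1>
```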
### V0.2 & V0.2.1 Showcase
#### Single socket version (32 cores)
Our local_chat test command is:
@@ -171,7 +205,7 @@ Attention! If you are testing R1 and it may skip thinking. So you can add arg: `
#### Dual socket version (64 cores)
-Make suer before you install (use install.sh or `make dev_install`), setting the env var `USE_NUMA=1` by `export USE_NUMA=1` (if already installed, reinstall it with this env var set). You may check the doc [here](./install.md) for install details. <br>
+Make sure before you install (use install.sh or `make dev_install`) to set the env var `USE_NUMA=1` by `export USE_NUMA=1` (if already installed, reinstall it with this env var set). You may check the doc [here](./install.md) for install details. <br>
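In practice that means (a minimal sketch; the reinstall is only needed if ktransformers was already installed):

```shell
export USE_NUMA=1
# rebuild so the CPU kernels are compiled with NUMA support
make dev_install   # or: bash install.sh
```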
Test Command:
``` shell
...@@ -226,6 +260,7 @@ Intel is currently the only CPU vendor that supports AMX-like instructions, whic ...@@ -226,6 +260,7 @@ Intel is currently the only CPU vendor that supports AMX-like instructions, whic
### Easier ### Easier
* Official Docker images to simplify installation * Official Docker images to simplify installation
* Fix the server integration for web API access * Fix the server integration for web API access
* Fix the local chat only accepting a single line prompt (currently \n begins generating prompt)
* Support for more quantization types, including the highly requested dynamic quantization from unsloth * Support for more quantization types, including the highly requested dynamic quantization from unsloth
Stay tuned for more updates! Stay tuned for more updates!
......
@@ -25,7 +25,7 @@ from-https://github.com/kvcache-ai/ktransformers/issues/129#issue-2842799552
1. local_chat.py: You can increase the context window size by setting `--max_new_tokens` to a larger value.
2. server: Increase `--cache_lens` to a larger value.
2. Move more weights to the GPU.
-Refer to the ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-marlin.yaml
+Refer to the ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml
```yaml
- match:
name: "^model\\.layers\\.([4-10])\\.mlp\\.experts$" # inject experts in layer 4~10 as marlin expert
@@ -39,11 +39,13 @@ from-https://github.com/kvcache-ai/ktransformers/issues/129#issue-2842799552
You can modify the layer range as you want, e.g. change `name: "^model\\.layers\\.([4-10])\\.mlp\\.experts$"` to `name: "^model\\.layers\\.([4-12])\\.mlp\\.experts$"` to move more weights to the GPU.
> Note: The first matched rule in the yaml will be applied. For example, if you have two rules that match the same layer, only the first rule's replacement will be valid.
> Note: Currently, executing experts on the GPU will conflict with CUDA Graph. Without CUDA Graph, there will be a significant slowdown. Therefore, unless you have a substantial amount of VRAM (placing a single layer of experts for DeepSeek-V3/R1 on the GPU requires at least 5.6GB of VRAM), we do not recommend enabling this feature. We are actively working on optimization.
> Note: KExpertsTorch is untested.
### Q: If I don't have enough VRAM, but I have multiple GPUs, how can I utilize them?
-Use the `--optimize_rule_path ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml` to load the two optimized rule yaml file. You may also use it as an example to write your own 4/8 gpu optimized rule yaml file.
+Use `--optimize_config_path ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml` to load the two-GPU optimize rule yaml file. You may also use it as an example to write your own 4/8-GPU optimize rule yaml file.
> Note: The ktransformers multi-GPU strategy is pipeline parallelism, which does not speed up the model's inference; it only distributes the model's weights across GPUs.
@@ -53,7 +55,7 @@ You have to set `--cpu_infer` to the number of cores you want to use. The more c
### Q: My DeepSeek-R1 model is not thinking.
-According to DeepSeek, you need to enforce the model to initiate its response with "\<think>\n" at the beginning of every output by passing the arg `--force_think true `.
+According to DeepSeek, you need to force the model to start its response with "\<think>\n" at the beginning of every output by passing the arg `--force_think True`.
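A minimal example invocation (model and weight paths are placeholders, adjust to your setup):

```shell
python ktransformers/local_chat.py \
  --model_path deepseek-ai/DeepSeek-R1 \
  --gguf_path <gguf_folder> \
  --force_think True
```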
### Q: Loading gguf error
@@ -61,9 +63,37 @@ Make sure you:
1. Have the `gguf` file in the `--gguf_path` directory.
2. The directory only contains gguf files from one model. If you have multiple models, you need to separate them into different directories.
3. The folder name itself should not end with `.gguf`, e.g. `Deep-gguf` is correct, `Deep.gguf` is wrong.
4. The file itself is not corrupted; you can verify this by checking that the sha256sum matches the one from huggingface, modelscope, or hf-mirror.
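For the corruption check, one way to compare checksums (the file name is a placeholder):

```shell
sha256sum <model_file>.gguf   # compare against the checksum listed on the huggingface/modelscope file page
```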
### Q: Version `GLIBCXX_3.4.30' not found
The detailed error:
>ImportError: /mnt/data/miniconda3/envs/xxx/bin/../lib/libstdc++.so.6: version `GLIBCXX_3.4.30' not found (required by /home/xxx/xxx/ktransformers/./cpuinfer_ext.cpython-312-x86_64-linux-gnu.so)
-It may because of your conda env have no this version. Your can first exit your conda env by `conda deactivate` and use `whereis libstdc++.so.6` to find the path. And re enter your conda env and copy the .so by `cp <path of outter libstdc++> <path of your conda env libstdc++>`
+Running `conda install -c conda-forge libstdcxx-ng` can solve the problem.
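One way to apply and verify the fix inside the affected conda environment (the environment name is a placeholder):

```shell
conda activate <your_env>
conda install -c conda-forge libstdcxx-ng
# confirm the missing symbol is now present in the environment's libstdc++
strings "$CONDA_PREFIX/lib/libstdc++.so.6" | grep GLIBCXX_3.4.30
```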
### Q: When running the bfloat16 moe model, the data shows NaN
The detailed error:
```shell
Traceback (most recent call last):
File "/root/ktransformers/ktransformers/local_chat.py", line 183, in <module>
fire.Fire(local_chat)
File "/usr/local/lib/python3.10/dist-packages/fire/core.py", line 135, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/usr/local/lib/python3.10/dist-packages/fire/core.py", line 468, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "/usr/local/lib/python3.10/dist-packages/fire/core.py", line 684, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "/root/ktransformers/ktransformers/local_chat.py", line 177, in local_chat
generated = prefill_and_generate(
File "/root/ktransformers/ktransformers/util/utils.py", line 204, in prefill_and_generate
next_token = decode_one_tokens(cuda_graph_runner, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, use_cuda_graph).to(torch_device)
File "/root/ktransformers/ktransformers/util/utils.py", line 128, in decode_one_tokens
next_token = torch.multinomial(probs, num_samples=1).squeeze(1)
RuntimeError: probability tensor contains either `inf`, `nan` or element < 0
```
**SOLUTION**: This error when running ktransformers on Ubuntu 22.04 is caused by the system's g++ being too old, so its pre-defined macros do not include avx_bf16. We have tested and confirmed that it works with g++ 11.4 on Ubuntu 22.04.
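A quick way to check the toolchain before building (g++ 11.4 is the version we validated):

```shell
g++ --version   # older toolchains lack the avx_bf16 pre-defined macros mentioned above
```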
### Q: Using FP8, prefill is very slow.
The FP8 kernel is built by JIT compilation, so the first run will be slow; subsequent runs will be faster.
@@ -8,6 +8,20 @@ This document provides the necessary steps to set up and run the web service for
Before you can compile the web code, make sure you have installed [Node.js](https://nodejs.org) version 18.3 or higher
Note: The version of Node.js in the Ubuntu or Debian GNU/Linux software repository is too low, causing compilation errors. Users can also install Node.js through the Nodesource repository, provided they uninstall the outdated version first.
```bash
# sudo apt-get remove nodejs npm -y && sudo apt-get autoremove -y
sudo apt-get update -y && sudo apt-get install -y apt-transport-https ca-certificates curl gnupg
curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | sudo gpg --dearmor -o /usr/share/keyrings/nodesource.gpg
sudo chmod 644 /usr/share/keyrings/nodesource.gpg
echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/nodesource.gpg] https://deb.nodesource.com/node_23.x nodistro main" | sudo tee /etc/apt/sources.list.d/nodesource.list
sudo apt-get update -y
sudo apt-get install nodejs -y
```
Once npm is installed, navigate to the `ktransformers/website` directory:
```bash
......
## Benchmark
To conduct a quick and convenient check, we have employed a simple Python script available [here](https://github.com/kvcache-ai/ktransformers/tree/main/ktransformers/tests) to assess the precision of our **[ktransformers](https://github.com/kvcache-ai/ktransformers)** project. For this evaluation, we utilized the same dataset, which was shuffled in a consistent manner and limited to the first 1,000 data points, to test our implementation across a variety of CPU kernels, MLA kernels, and quantization formats.
We selected the DeepSeek-V3 model in its bf16, int8, and q4km versions for this test. The MMLU dataset, which can be found [here](https://huggingface.co/datasets/cais/mmlu), was used (we selected all subsets and shuffled them with a fixed random seed).
**!!! However, we skipped the few-shot part and only chose the first 1,000 data points for a quick check.** Please note that this approach may produce results that are not consistent with the DeepSeek-V3 technical report; tests of R1 and further tests are ongoing.
To verify our results, we chose the Siliconflow [cloud service platform](https://cloud.siliconflow.cn/models) as a baseline. All tests were conducted using the same script and datasets, allowing us to make a preliminary assessment of our project's precision.
We set the argument `temperature=0.6`, and to simplify the test process, we skipped the few-shot part and used the following prompt: `There is a single choice question. Answer the question by replying A, B, C, D. No other answers are accepted. Just the letter. \nQuestion: {question}\nA. {option_a}\nB. {option_b}\nC. {option_c}\nD. {option_d}\nAnswer: '`. For more details, please refer to the [script](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/tests/mmlu_test.py).
Given that we have only tested 1,000 cases, which provides only a preliminary judgment, some fluctuations in the results are reasonable. We selected all datasets and shuffled them with a fixed random seed to ensure consistency.
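The gist of the evaluation loop is sketched below; this is an illustrative outline, not the actual `mmlu_test.py`, and the helper names (`evaluate`, `generate`, `samples`) are placeholders:

```python
# Illustrative sketch of the MMLU quick check described above (not the real script).
PROMPT = (
    "There is a single choice question. Answer the question by replying A, B, C, D. "
    "No other answers are accepted. Just the letter. "
    "\nQuestion: {question}\nA. {option_a}\nB. {option_b}\nC. {option_c}\nD. {option_d}\nAnswer: "
)

def evaluate(samples, generate):
    """samples: dicts with question/option_a..option_d/answer; generate: callable hitting the model."""
    correct = 0
    for s in samples:
        reply = generate(PROMPT.format(**s), temperature=0.6).strip()
        correct += int(reply[:1].upper() == s["answer"])  # score only the first letter of the reply
    return 100.0 * correct / len(samples)
```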
## Some Details
- The bf16 model of DeepSeek-V3 is available [here](https://huggingface.co/opensourcerelease/DeepSeek-V3-bf16/tree/main) (you may convert it to gguf by llama.cpp). The q4km model can be found [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M).
- The optimization YAML file is located [here](https://github.com/kvcache-ai/ktransformers/tree/main/ktransformers/optimize/optimize_rules). For the GEMM Kernel, you can change `KLinearMarlin` to `KLinearTorch`.
- To switch the MLA Kernel from Triton to Torch, you can check and modify [this file](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/attention.py), specifically by using the `forward_windows` method.
- When attempting to conduct the bf16 test (both CPU Weight and GPU Weight), you may encounter issues stemming from older versions of g++ and as, particularly when using Ubuntu 20 or earlier versions. To facilitate a smoother experience and enable you to reproduce our results, we have provided a development container. This container offers a pre-configured environment tailored for this purpose. However, please note that the container does not have the ktrans package installed. Therefore, you may still need to manually install certain packages to ensure everything runs smoothly.
- You may configure the model mount directory in `devcontainer/devcontainer.json`; check the `"mounts"` entry.
## The Result Table
| DataSet | CPU Weight Format | CPU Kernel | GPU Weight Format | GEMM Kernel | MLA Kernel | [Siliconflow](https://cloud.siliconflow.cn/models) | Ktrans Point |
| ------------------------ | ----------------- | ---------- | ----------------- | ----------- | ---------- | -------------------------------------------------- | ------------ |
| MMLU (shuffle 1k) | | | | | | | |
| 1 | bf16 | cpuinfer | bf16 | torch | torch | 81.6 | 81.9 |
| 2 | q8_0 | cpuinfer | bf16 | torch | torch | 81.6 | 83.1 |
| 3 | q4km | cpuinfer | bf16 | torch | triton | 81.6 | 81.4 |
| 4 | q4km | cpuinfer | q4km->marlin 8 | marlin | triton | 81.6 | 81.1 |
| 5 | q4km | cpuinfer | q4km->marlin 4 | marlin | triton | 81.6 | 81 |
| 6 | q4km | cpuinfer | fp8 | fp8gemm | triton | 81.6 | 81.5 |
| MMLU-pro | | | | | | | |
| 1 | q4km | cpuinfer | fp8 | fp8gemm | triton | 57.7 | 57.6 |
| 2 | q4km | cpuinfer | q4km->marlin 4 | marlin | triton | 57.7 | 57.5 |
| HumanEval | tbd | tbd | tbd | tbd | tbd | tbd | tbd |
| GSM8K | tbd | tbd | tbd | tbd | tbd | tbd | tbd |
**The details for each case are listed below**:
By default, the MLA kernel uses Triton on Linux and Torch on Windows. But we need to test Torch on Linux, so we manually modify the [file](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/attention.py#L592): get rid of all the if branches and force it to use `self.forward_windows`.
- MMLU test
1. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml) change all the `KLinearMarlin` to `KLinearTorch` (just find all the usage in this file). The source weight comes from [there](https://huggingface.co/opensourcerelease/DeepSeek-V3-bf16) (you need to use llama.cpp to convert it to gguf)
2. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml). You need to modify the code to separately load cpu's expert weight. We leave this as comment in these places: [1](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/experts.py#L122), [2](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/experts.py#L136), [3](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/experts.py#L137) (note in 3, change the path to your local weight file path). The weight file for q8_0 is [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q8_0)
3. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml). You need to modify the code to separately load cpu's expert weight. We leave this as comment in these places: [1](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/experts.py#L122), [2](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/experts.py#L136), [3](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/operators/experts.py#L137) (note in 3, change the path to your local weight file path). The weight file for q4km is [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M)
4. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml). You don't need to change the source code, as both sides use q4km. But note the yaml file [here](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml#L29) and [here](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml#L18): below these lines you need to add `num_bits: 8` (in other words, add this kwarg to every use of `KLinearMarlin`). The weight file for q4km is [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M)
5. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml). No need to change yaml, just use the default. The weight file for q4km is [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M)
6. You should check the [doc](./fp8_kernel.md) to learn how to test this case. This is a mixture tensor case.
- MMLU-pro test
1. You should check the [doc](./fp8_kernel.md) to learn how to test this case. This is a mixture tensor case.
2. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml). No need to change yaml, just use the default. The weight file for q4km is [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M)
# FP8 Linear Kernel for DeepSeek-V3/R1
## Overview
The DeepSeek-AI team provides FP8 safetensors for DeepSeek-R1/V3 models. We achieve performance optimization through the following works:
- **FP8 GPU Kernel Integration**: FP8 linear layer acceleration kernels integrated in KTransformers
- **Hybrid Quantization Architecture**:
- Attention and Shared-Expert modules use FP8 precision (enhances computational accuracy)
- Experts modules retain GGML quantization (GGUF format, reside in CPU to save GPU memory)
So those who are pursuing the best performance can use the FP8 linear kernel for DeepSeek-V3/R1.
## Key Features
✅ Hybrid Precision Architecture (FP8 + GGML)<br>
✅ Memory Optimization (~19GB VRAM usage)
## Quick Start
### Using Pre-Merged Weights
Pre-merged weights are available on Hugging Face:<br>
[KVCache-ai/DeepSeek-V3-GGML-FP8-Hybrid](https://huggingface.co/KVCache-ai/DeepSeek-V3)<br>
[KVCache-ai/DeepSeek-R1-GGML-FP8-Hybrid](https://huggingface.co/KVCache-ai/DeepSeek-R1)
> Please confirm the weights are fully uploaded before downloading. The large file size may extend Hugging Face upload time.
Download Pre-Merged Weights
```shell
pip install -U huggingface_hub
# Optional: Use the HF mirror for faster downloads in some regions.
# export HF_ENDPOINT=https://hf-mirror.com
huggingface-cli download --resume-download KVCache-ai/DeepSeek-V3-GGML-FP8-Hybrid --local-dir <local_dir>
```
### Using merge scripts
If you have local DeepSeek-R1/V3 FP8 safetensors and GGUF weights (e.g. q4km), you can merge them using the following script.
```shell
python merge_tensors/merge_safetensor_gguf.py \
--safetensor_path <fp8_safetensor_path> \
--gguf_path <gguf_folder_path> \
--output_path <merged_output_path>
```
* `--safetensor_path`: input path of the safetensors file ([Download](https://huggingface.co/deepseek-ai/DeepSeek-V3/tree/main)).
* `--gguf_path`: input path of the GGUF folder ([Download](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M)).
* `--output_path`: output path of the merged file.
### Execution Notes
Launch local_chat.py with custom quantized experts
```shell
python ktransformers/local_chat.py \
--model_path deepseek-ai/DeepSeek-V3 \
--gguf_path <merged_weights_folder> \
--optimize_config_path ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts.yaml \
--cpu_infer <cpu_cores + 1>
```
## Notes
⚠️ Hardware Requirements<br>
* Recommended minimum 19GB available VRAM for FP8 kernel.
* Requires GPU with FP8 support (e.g., 4090)
⏳ First-Run Optimization
JIT compilation causes longer initial execution (subsequent runs retain optimized speed).
🔄 Temporary Interface<br>
Current weight loading implementation is provisional - will be refined in future versions
📁 Path Specification<br>
Despite hybrid quantization, merged weights are stored as .safetensors - pass the containing folder path to `--gguf_path`