Merge branch 'main' into hip

c009512a · Azure-Tang · c1f13a69 · 4f22d726 · c009512a · c009512a
Commit c009512a authored Mar 13, 2025 by Azure-Tang
20 changed files
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
+# Development image for the VS Code dev container: a CUDA-enabled PyTorch base
+# with the native toolchain and the Python build dependencies preinstalled.
+FROM pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel AS compile_server
+WORKDIR /workspace
+ENV CUDA_HOME=/usr/local/cuda
+# Lines in a heredoc are not implicitly chained, so `set -e` makes the build
+# stop at the first failing command.
+RUN <<EOF
+set -e
+# apt-get (not apt) is the stable command-line interface for scripted use.
+apt-get update -y
+apt-get install -y --no-install-recommends \
+    cmake \
+    g++ \
+    gcc \
+    git \
+    vim \
+    wget
+# Drop the package index in the same layer so it does not bloat the image.
+rm -rf /var/lib/apt/lists/*
+pip install --upgrade pip
+pip install ninja pyproject numpy cpufeature
+pip install flash-attn
+# Overwrite conda's libstdc++ with the system copy.
+# NOTE(review): presumably needed so extensions built with the system g++ find
+# the GLIBCXX symbols they were linked against; the path is x86_64-only.
+cp /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/lib/
+EOF
+# Start an interactive bash session by default
+CMD ["/bin/bash"]
\ No newline at end of file
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
+// VS Code dev container for KTransformers: builds .devcontainer/Dockerfile with the
+// parent directory as build context and runs the container with GPU access.
+{
+    "name": "Ktrans Dev Container",
+    "privileged": true,
+    "build": {
+        "dockerfile": "Dockerfile",
+        "context": "..",
+        // Forward the host's proxy environment variables to the image build.
+        "args": {
+            "http_proxy": "${env:http_proxy}",
+            "https_proxy": "${env:https_proxy}"
+        }
+    },
+    "runArgs": [
+        "--network=host",
+        "--gpus",
+        "all"
+        // "--gpu all"
+    ],
+    "workspaceFolder": "/workspace",
+    "workspaceMount": "source=${localWorkspaceFolder},target=/workspace,type=bind,consistency=cached",
+    // NOTE(review): bind-mounts the host path /mnt/data, which presumably must
+    // already exist on the host for the container to start; confirm.
+    "mounts": [
+        "source=/mnt/data,target=/mnt/incontainer,type=bind,consistency=cached"
+    ],
+    "customizations": {
+        "vscode": {
+            "extensions": [
+            ],
+            "settings": {
+                // "terminal.integrated.shell.linux" was removed from VS Code;
+                // the default shell is now selected through a terminal profile.
+                "terminal.integrated.defaultProfile.linux": "bash",
+                "cmake.configureOnOpen": true,
+                "cmake.generator": "Ninja"
+            }
+        }
+    }
+}
\ No newline at end of file
--- a/.github/ISSUE_TEMPLATE/-bug-.yaml
+++ b/.github/ISSUE_TEMPLATE/-bug-.yaml
+name: 🐞 Bug report
+description: Create a report to help us reproduce and fix the bug
+title: "[Bug] "
+labels: ['Bug']
+body:
+- type: checkboxes
+  attributes:
+    label: Checklist
+    options:
+    - label: 1. I have searched related issues but cannot get the expected help.
+    - label: 2. The bug has not been fixed in the latest version.
+    - label: 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback.
+    - label: 4. If the issue you raised is not a bug but a question, please raise a discussion at https://github.com/kvcache-ai/ktransformers/discussions. Otherwise, it will be closed.
+    - label: 5. To help the community, I will use Chinese/English or attach a Chinese/English translation if using another language. Non-Chinese/English content without translation may be closed.
+- type: textarea
+  attributes:
+    label: Describe the bug
+    description: A clear and concise description of what the bug is.
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Reproduction
+    description: |
+      What command or script did you run? Which **model** are you using?
+    placeholder: |
+      A placeholder for the command.
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Environment
+    description: |
+      Please provide necessary environment information here (e.g. OS/GPU/CPU). Otherwise the issue will be closed.
+    placeholder: Environment here.
+  validations:
+    required: true
\ No newline at end of file
--- a/.github/ISSUE_TEMPLATE/-bug2-.yaml
+++ b/.github/ISSUE_TEMPLATE/-bug2-.yaml
+name: 🐞 BUG报告
+description: 创建报告以帮助我们复现并修复BUG
+title: "[Bug] "
+labels: ['Bug']
+body:
+- type: checkboxes
+  attributes:
+    label: 检查清单
+    options:
+    - label: 1. 我已经搜索过相关问题，但未能获得预期的帮助
+    - label: 2. 该问题在最新版本中尚未修复
+    - label: 3. 请注意，如果您提交的BUG相关 issue 缺少对应环境信息和最小可复现示例，我们将难以复现和定位问题，降低获得反馈的可能性
+    - label: 4. 如果您提出的不是bug而是问题，请在讨论区发起讨论 https://github.com/kvcache-ai/ktransformers/discussions。否则该 issue 将被关闭
+    - label: 5. 为方便社区交流，我将使用中文/英文或附上中文/英文翻译（如使用其他语言）。未附带翻译的非中文/英语内容可能会被关闭
+- type: textarea
+  attributes:
+    label: 问题描述
+    description: 清晰简洁地描述BUG是什么
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: 复现步骤
+    description: |
+      你运行了什么命令或脚本？使用的是哪个**模型**？
+    placeholder: |
+      在此处填写命令
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: 环境信息
+    description: |
+      请提供必要的环境信息（如操作系统/GPU/CPU），否则该 issue 将被关闭
+    placeholder: 在此处填写环境信息
+  validations:
+    required: true
\ No newline at end of file
--- a/.github/ISSUE_TEMPLATE/-feature-.yaml
+++ b/.github/ISSUE_TEMPLATE/-feature-.yaml
+name: 🚀 Feature request
+description: Suggest an idea for this project
+title: "[Feature] "
+body:
+- type: checkboxes
+  attributes:
+    label: Checklist
+    options:
+    - label: 1. If the issue you raised is not a feature but a question, please raise a discussion at https://github.com/kvcache-ai/ktransformers/discussions. Otherwise, it will be closed.
+    - label: 2. To help the community, I will use Chinese/English or attach a Chinese/English translation if using another language. Non-English/Chinese content without translation may be closed.
+- type: textarea
+  attributes:
+    label: Motivation
+    description: |
+      A clear and concise description of the motivation of the feature.
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Related resources
+    description: |
+      If there is an official code release or third-party implementations, please also provide the information here, which would be very helpful.
\ No newline at end of file
--- a/.github/ISSUE_TEMPLATE/-feature2-.yaml
+++ b/.github/ISSUE_TEMPLATE/-feature2-.yaml
+name: 🚀 新功能请求
+description: 为项目提出新功能建议
+title: "[Feature] "
+body:
+- type: checkboxes
+  attributes:
+    label: 检查清单
+    options:
+    - label: 1. 如果您提出的不是新功能而是问题，请在讨论区发起讨论 https://github.com/kvcache-ai/ktransformers/discussions。否则该 issue 将被关闭
+    - label: 2. 为方便社区交流，我将使用中文/英文或附上英文/中文翻译（如使用其他语言）。未附带翻译的非英文/中文内容可能会被关闭
+- type: textarea
+  attributes:
+    label: 需求背景
+    description: |
+      清晰简洁地描述该功能的背景需求
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: 相关资源
+    description: |
+      如果有官方代码实现或第三方实现，请在此提供相关信息，这将非常有帮助
\ No newline at end of file
--- a/.github/workflows/book-ci.yml
+++ b/.github/workflows/book-ci.yml
+# CI for the mdBook documentation: runs on pushes and pull requests targeting
+# main, across Linux, macOS and Windows runners.
+name: Book-CI
+on:
+  push:
+    branches:
+      - main
+      # - server_support
+  pull_request:
+    branches:
+      - main
+      # - server_support
+jobs:
+  test:
+    name: test
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+    steps:
+      - uses: actions/checkout@v4
+      # Install the stable Rust toolchain with the minimal component profile.
+      - name: Install Rust
+        run: |
+          rustup set profile minimal
+          rustup toolchain install stable
+          rustup default stable
+      - name: Setup mdBook
+        uses: peaceiris/actions-mdbook@v2
+        with:
+          mdbook-version: "latest"
+      # The test step is currently disabled, so this job only checks that the
+      # toolchain and mdBook can be set up on each OS.
+      # - name: Run tests
+      #   run: mdbook test
\ No newline at end of file
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
+# Builds the mdBook documentation on every push / pull request targeting main
+# (on all three OSes) and publishes the result to GitHub Pages from main.
+name: Deploy
+on:
+  push:
+    branches:
+      - main
+      # - server_support
+  pull_request:
+    branches:
+      - main
+      # - server_support
+defaults:
+  run:
+    shell: bash
+# The Deploy step pushes the built book with GITHUB_TOKEN, which needs write access.
+permissions:
+  contents: write
+jobs:
+  deploy:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install Rust
+        run: |
+          rustup set profile minimal
+          rustup toolchain install stable
+          rustup default stable
+      - name: Setup mdBook
+        uses: peaceiris/actions-mdbook@v2
+        with:
+          mdbook-version: "latest"
+      # Every matrix leg builds the book, so the build is verified on each OS.
+      - run: mdbook build
+      # - name: Copy Assets
+      #   run: |
+      #     chmod +x ci/copy-assets.sh
+      #     ci/copy-assets.sh ${{ matrix.os }}
+      - name: Deploy
+        uses: peaceiris/actions-gh-pages@v3
+        # Publish from a single matrix leg only: without the OS check all three
+        # jobs would push to the Pages branch concurrently and could reject or
+        # overwrite each other.
+        # or || github.ref == 'refs/heads/server_support'
+        if: ${{ github.ref == 'refs/heads/main' && matrix.os == 'ubuntu-latest' }}
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          publish_dir: ./book
\ No newline at end of file
--- a/.github/workflows/docker-image.yml
+++ b/.github/workflows/docker-image.yml
+# Builds the KTransformers Docker image once per CPU instruction-set variant.
+# Images are pushed to Docker Hub on a published release, or on a manual run
+# when the `choose` input is set to 1.
+name: DockerHub CI
+on:
+  release:
+    types: [published]
+  workflow_dispatch:
+    inputs:
+      choose:
+        description: 'Will you push the image to DockerHub? 0 for No, 1 for Yes'
+        required: true
+        default: '0'
+        type: string
+  # push:
+  #   branches:
+  #     - main
+env:
+  DOCKERHUB_REPO: ${{ secrets.DOCKERHUB_USERNAME }}/ktransformers
+jobs:
+  # Smoke test: the image must build before anything is published.
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Run tests
+        run: |
+          if [ -f docker-compose.test.yml ]; then
+            docker compose --file docker-compose.test.yml build
+            docker compose --file docker-compose.test.yml run sut
+          else
+            docker build . --file Dockerfile
+          fi
+  docker_task:
+    needs: test
+    name: ${{ matrix.instruct}}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        # NOTE(review): the two NATIVE entries share the same job name and the
+        # same image tags (tags depend on `instruct` only), so the amd64 and
+        # arm64 pushes overwrite each other. Consider an architecture suffix or
+        # a single multi-platform build.
+        include:
+        # for amd64
+          - {instruct: "FANCY",   platform: "linux/amd64"}
+          - {instruct: "AVX512",  platform: "linux/amd64"}
+          - {instruct: "AVX2",    platform: "linux/amd64"}
+          - {instruct: "NATIVE",  platform: "linux/amd64"}
+        # for arm64
+          - {instruct: "NATIVE",  platform: "linux/arm64"}
+    steps:
+        # Relocate Docker's data directory to /mnt and symlink it back.
+        # NOTE(review): presumably done because /mnt has more free space on the
+        # hosted runner; confirm.
+        - name: Move Docker data directory
+          run: |
+            sudo systemctl stop docker
+            sudo mkdir -p /mnt/docker
+            sudo rsync -avz /var/lib/docker/ /mnt/docker
+            sudo rm -rf /var/lib/docker
+            sudo ln -s /mnt/docker /var/lib/docker
+            sudo systemctl start docker
+        - name: Set up QEMU
+          uses: docker/setup-qemu-action@v3
+        - name: Set up Docker Buildx
+          uses: docker/setup-buildx-action@v3
+        - name: Login to Docker Hub
+          uses: docker/login-action@v3
+          with:
+            username: ${{ secrets.DOCKERHUB_USERNAME }}
+            password: ${{ secrets.DOCKERHUB_TOKEN }}
+        - name: Build and push for amd64
+          if: matrix.platform == 'linux/amd64'
+          uses: docker/build-push-action@v6
+          with:
+            # Releases always push; manual runs push only when `choose` is 1.
+            push: ${{ github.event_name == 'release' || inputs.choose == '1' }}
+            platforms: |
+              linux/amd64
+            # The version tag is emitted only when a release tag exists. On a
+            # manual run the expression yields an empty line instead of the
+            # invalid tag "<repo>:-<instruct>".
+            tags: |
+              ${{ env.DOCKERHUB_REPO }}:latest-${{ matrix.instruct }}
+              ${{ github.event.release.tag_name && format('{0}:{1}-{2}', env.DOCKERHUB_REPO, github.event.release.tag_name, matrix.instruct) || '' }}
+            build-args: |
+              CPU_INSTRUCT=${{ matrix.instruct }}
+        - name: Build and push for arm64
+          if: matrix.platform == 'linux/arm64'
+          uses: docker/build-push-action@v6
+          with:
+            # Releases always push; manual runs push only when `choose` is 1.
+            push: ${{ github.event_name == 'release' || inputs.choose == '1' }}
+            platforms: |
+              linux/arm64
+            # See the amd64 step: the version tag is omitted without a release tag.
+            tags: |
+              ${{ env.DOCKERHUB_REPO }}:latest-${{ matrix.instruct }}
+              ${{ github.event.release.tag_name && format('{0}:{1}-{2}', env.DOCKERHUB_REPO, github.event.release.tag_name, matrix.instruct) || '' }}
+            build-args: |
+              CPU_INSTRUCT=${{ matrix.instruct }}
\ No newline at end of file
--- a/.github/workflows/install.yml
+++ b/.github/workflows/install.yml
+# Manually triggered workflow that installs KTransformers on the self-hosted
+# runner. With `create&install` the conda environment is rebuilt first; with
+# `install` only the package itself is built into the existing environment.
+name: Install KTransformers
+run-name: Install KTransformers
+on:
+  workflow_dispatch:
+    inputs:
+      job_to_run:
+        description: "Which job to run?"
+        required: true
+        default: "install"
+        type: choice
+        options:
+          - create&install
+          - install
+jobs:
+  Install-KTransformers:
+    runs-on: self-hosted
+    steps:
+      - run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event."
+      - run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}."
+      - name: Check out repository code
+        uses: actions/checkout@v4
+      - run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner."
+      # A failure here (e.g. no environment to remove) does not fail the job.
+      - name: Remove old conda environment
+        continue-on-error: true
+        if: ${{ inputs.job_to_run == 'create&install'}}
+        run: |
+          source /home/qujing3/anaconda3/etc/profile.d/conda.sh
+          conda env remove --name ktransformers-dev -y
+      - name: Create conda environment
+        if: ${{ inputs.job_to_run == 'create&install'}}
+        run: |
+          source /home/qujing3/anaconda3/etc/profile.d/conda.sh
+          # -y keeps the step non-interactive, like the other conda commands here.
+          conda create --name ktransformers-dev python=3.11 -y
+          conda activate ktransformers-dev
+          conda install -c conda-forge libstdcxx-ng -y
+      - name: Install dependencies
+        if: ${{ inputs.job_to_run == 'create&install'}}
+        run: |
+          source /home/qujing3/anaconda3/etc/profile.d/conda.sh
+          conda activate ktransformers-dev
+          pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
+          pip3 install packaging ninja cpufeature numpy
+          # flash-attn is installed from a wheel stored in the runner's home directory.
+          pip install ~/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiTRUE-cp311-cp311-linux_x86_64.whl
+      - name: Install KTransformers
+        run: |
+          source /home/qujing3/anaconda3/etc/profile.d/conda.sh
+          conda activate ktransformers-dev
+          # Quoted so a workspace path containing spaces is handled correctly.
+          cd "${{ github.workspace }}"
+          git submodule init
+          git submodule update
+          USE_NUMA=1 bash install.sh
+      - run: echo "This job's status is ${{ job.status }}."
--- a/.gitignore
+++ b/.gitignore
@@ -19,6 +19,10 @@ ktransformers/server/local_store/
 ktransformers/server_test1.db
 *.patch
 img/
-tmp1.txt
+tmp*.txt
-test_65_300_1536.txt
 test.txt
+book
+ktransformers/tests/chat_txt.txt
+mmlu_result*
+ktransformers/ktransformers_ext/cuda_musa/
+test_prompt.txt
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,7 +10,8 @@ EOF
-FROM pytorch/pytorch:2.3.1-cuda12.1-cudnn8-devel as compile_server
+FROM pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel as compile_server
+ARG CPU_INSTRUCT=NATIVE
 WORKDIR /workspace
 ENV CUDA_HOME /usr/local/cuda
 COPY --from=web_compile /home/ktransformers /workspace/ktransformers
@@ -26,10 +27,12 @@ rm -rf /var/lib/apt/lists/* &&
 cd ktransformers &&
 git submodule init &&
 git submodule update &&
+pip install --upgrade pip &&
 pip install ninja pyproject numpy cpufeature &&
 pip install flash-attn &&
-CPU_INSTRUCT=NATIVE  KTRANSFORMERS_FORCE_BUILD=TRUE TORCH_CUDA_ARCH_LIST="8.0;8.6;8.7;8.9;9.0+PTX" pip install . --no-build-isolation --verbose &&
+CPU_INSTRUCT=${CPU_INSTRUCT}  KTRANSFORMERS_FORCE_BUILD=TRUE TORCH_CUDA_ARCH_LIST="8.0;8.6;8.7;8.9;9.0+PTX" pip install . --no-build-isolation --verbose &&
-pip cache purge
+pip cache purge &&
+cp /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/lib/
 EOF
-ENTRYPOINT [ "/opt/conda/bin/ktransformers" ]
+ENTRYPOINT ["tail", "-f", "/dev/null"]
\ No newline at end of file
--- a/Makefile
+++ b/Makefile
@@ -18,4 +18,15 @@ dev_install:
 	echo "Installing ktransformers"
 	KTRANSFORMERS_FORCE_BUILD=TRUE pip install -e . -v --no-build-isolation
 	echo "Installation completed successfully"
\ No newline at end of file
+# These targets do not produce files of the same name; declaring them phony
+# keeps them runnable even if such a file or directory exists.
+.PHONY: clean install_numa install_no_numa
+# Remove build outputs of the Python package and of the native extensions.
+clean:
+	rm -rf build
+	rm -rf *.egg-info
+	rm -rf ktransformers/ktransformers_ext/build
+	rm -rf ktransformers/ktransformers_ext/cuda/build
+	rm -rf ktransformers/ktransformers_ext/cuda/dist
+	rm -rf ktransformers/ktransformers_ext/cuda/*.egg-info
+# Development install with USE_NUMA=1 set in the environment of the build.
+# $(MAKE) is used for the recursive call so flags and the jobserver propagate.
+install_numa:
+	USE_NUMA=1 $(MAKE) dev_install
+# Development install with USE_NUMA removed from the environment.
+install_no_numa:
+	env -u USE_NUMA $(MAKE) dev_install
--- a/README.md
+++ b/README.md
@@ -23,14 +23,16 @@ Our vision for KTransformers is to serve as a flexible platform for experimentin
 <h2 id="Updates">🔥 Updates</h2>
-* **Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup. The detailed tutorial is [here](./doc/en/DeepseekR1_V3_tutorial.md).
+* **Mar 5, 2025**: Support unsloth 1.58/2.51 bits weights and [IQ1_S/FP8 hybrid](./doc/en/fp8_kernel.md) weights. Support 139K [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context) for DeepSeek-V3 and R1 in 24GB VRAM.
-* **Aug 28, 2024**: Support 1M context under the InternLM2.5-7B-Chat-1M model, utilizing 24GB of VRAM and 150GB of DRAM. The detailed tutorial is [here](./doc/en/long_context_tutorial.md).
+* **Feb 25, 2025**: Support [FP8 GPU kernel](./doc/en/fp8_kernel.md) for DeepSeek-V3 and R1; [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context).
+* **Feb 15, 2025**: Longer Context (from 4K to 8K for 24GB VRAM) & Slightly Faster Speed (+15%, up to 16 Tokens/s), update [docs](./doc/en/DeepseekR1_V3_tutorial.md) and [online books](https://kvcache-ai.github.io/ktransformers/).
+* **Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup. For detailed show case and reproduction tutorial, see [here](./doc/en/DeepseekR1_V3_tutorial.md).
 * **Aug 28, 2024**: Decrease DeepseekV2's required VRAM from 21G to 11G.
-* **Aug 15, 2024**: Update detailed [TUTORIAL](doc/en/injection_tutorial.md) for injection and multi-GPU. 
+* **Aug 15, 2024**: Update detailed [tutorial](doc/en/injection_tutorial.md) for injection and multi-GPU. 
 * **Aug 14, 2024**: Support llamafile as linear backend. 
 * **Aug 12, 2024**: Support multiple GPU; Support new model: mixtral 8\*7B  and 8\*22B; Support q2k, q3k, q5k dequant on gpu.
 * **Aug 9, 2024**: Support windows native.
+<!-- * **Aug 28, 2024**: Support 1M context under the InternLM2.5-7B-Chat-1M model, utilizing 24GB of VRAM and 150GB of DRAM. The detailed tutorial is [here](./doc/en/long_context_tutorial.md). -->
 <h2 id="show-cases">🌟 Show Cases</h2>
 <div>
@@ -43,10 +45,10 @@ https://github.com/user-attachments/assets/ebd70bfa-b2c1-4abb-ae3b-296ed38aa285
 - **[NEW!!!] Local 671B DeepSeek-Coder-V3/R1:** Running its Q4_K_M version using only 14GB VRAM and 382GB DRAM([Tutorial](./doc/en/DeepseekR1_V3_tutorial.md)).
 	- Prefill Speed (tokens/s): 
- 		- KTransfermor: 54.21 (32 cores) → 74.362 (dual-socket, 2×32 cores) → 255.26 (optimized AMX-based MoE kernel, V0.3 only) → 286.55 (selectively using 6 experts, V0.3 only)  
+ 		- KTransformers: 54.21 (32 cores) → 74.362 (dual-socket, 2×32 cores) → 255.26 (optimized AMX-based MoE kernel, V0.3 only) → 286.55 (selectively using 6 experts, V0.3 only)  
 		- Compared to 10.31 tokens/s in llama.cpp with 2×32 cores, achieving up to **27.79× speedup**.  
 	- Decode Speed (tokens/s):  
- 		- KTransfermor: 8.73 (32 cores) → 11.26 (dual-socket, 2×32 cores) → 13.69 (selectively using 6 experts, V0.3 only)  
+ 		- KTransformers: 8.73 (32 cores) → 11.26 (dual-socket, 2×32 cores) → 13.69 (selectively using 6 experts, V0.3 only)  
 		- Compared to 4.51 tokens/s in llama.cpp with 2×32 cores, achieving up to **3.03× speedup**.  
 	- Upcoming Open Source Release:
 		- AMX optimizations and selective expert activation will be open-sourced in V0.3.  
@@ -69,7 +71,7 @@ https://github.com/user-attachments/assets/4c6a8a38-05aa-497d-8eb1-3a5b3918429c
 </p>
-<h3>1M Context Local Inference on a Desktop with Only 24GB VRAM</h3>
+<!-- <h3>1M Context Local Inference on a Desktop with Only 24GB VRAM</h3>
 <p align="center">
 https://github.com/user-attachments/assets/a865e5e4-bca3-401e-94b8-af3c080e6c12
@@ -91,228 +93,20 @@ https://github.com/user-attachments/assets/a865e5e4-bca3-401e-94b8-af3c080e6c12
 * **Enhanced Speed**: Reaches 16.91 tokens/s for generation with a 1M context using sparse attention, powered by llamafile kernels. This method is over 10 times faster than full attention approach of llama.cpp.
 * **Flexible Sparse Attention Framework**: Offers a flexible block sparse attention framework for CPU offloaded decoding. Compatible with SnapKV, Quest, and InfLLm. Further information is available [here](./doc/en/long_context_introduction.md).
+ -->
 <strong>More advanced features are coming soon, so stay tuned!</strong>
 <h2 id="quick-start">🚀 Quick Start</h2>
-<h3>Preparation</h3>
-Some preparation:
- CUDA 12.1 and above, if you didn't have it yet, you may install from [here](https://developer.nvidia.com/cuda-downloads).
-  ```sh
-  # Adding CUDA to PATH
-  export PATH=/usr/local/cuda/bin:$PATH
-  export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
-  export CUDA_PATH=/usr/local/cuda
-  ```
- Linux-x86_64 with gcc, g++ and cmake
-  ```sh
-  sudo apt-get update
-  sudo apt-get install gcc g++ cmake ninja-build
-  ```
- We recommend using [Conda](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh) to create a virtual environment with Python=3.11 to run our program.
-  ```sh
-  conda create --name ktransformers python=3.11
-  conda activate ktransformers # you may need to run ‘conda init’ and reopen shell first
-  ```
- Make sure that PyTorch, packaging, ninja is installed
-  ```
-  pip install torch packaging ninja cpufeature numpy
-  ```
-<h3>Installation</h3>
-1. Use a Docker image, see [documentation for Docker](./doc/en/Docker.md) 
-2. You can install using Pypi (for linux):
-   ```
-   pip install ktransformers --no-build-isolation
-   ```
-   for windows we prepare a pre compiled whl package on [ktransformers-0.2.0+cu125torch24avx2-cp312-cp312-win_amd64.whl](https://github.com/kvcache-ai/ktransformers/releases/download/v0.2.0/ktransformers-0.2.0+cu125torch24avx2-cp312-cp312-win_amd64.whl), which require cuda-12.5, torch-2.4, python-3.11, more pre compiled package are being produced. 
-3. Or you can download source code and compile:
-   - init source code 
-     ```sh
-     git clone https://github.com/kvcache-ai/ktransformers.git
-     cd ktransformers
-     git submodule init
-     git submodule update
-     ```
-   - [Optional] If you want to run with website, please [compile the website](./doc/en/api/server/website.md) before execute ```bash install.sh```
-   - Compile and install (for Linux)
-     ```
-     bash install.sh
-     ```
-   - Compile and install(for Windows)
-     ```
-     install.bat
-     ```
-4. If you are developer, you can make use of the makefile to compile and format the code. <br> the detailed usage of makefile is [here](./doc/en/makefile_usage.md) 
-<h3>Local Chat</h3>
-We provide a simple command-line local chat Python script that you can run for testing.
-> Note that this is a very simple test tool only support one round chat without any memory about last input, if you want to try full ability of the model, you may go to [RESTful API and Web UI](#id_666). We use the DeepSeek-V2-Lite-Chat-GGUF model as an example here. But we also support other models, you can replace it with any other model that you want to test. 
-<h4>Run Example</h4>
-```shell
-# Begin from root of your cloned repo!
-# Begin from root of your cloned repo!!
-# Begin from root of your cloned repo!!! 
-# Download mzwing/DeepSeek-V2-Lite-Chat-GGUF from huggingface
-mkdir DeepSeek-V2-Lite-Chat-GGUF
-cd DeepSeek-V2-Lite-Chat-GGUF
-wget https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/resolve/main/DeepSeek-V2-Lite-Chat.Q4_K_M.gguf -O DeepSeek-V2-Lite-Chat.Q4_K_M.gguf
-cd .. # Move to repo's root dir
-# Start local chat
-python -m ktransformers.local_chat --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
-# If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try：
-# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite
-# python  ktransformers.local_chat --model_path ./DeepSeek-V2-Lite --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
-```
-It features the following arguments:
- `--model_path` (required): Name of the model (such as "deepseek-ai/DeepSeek-V2-Lite-Chat" which will automatically download configs from [Hugging Face](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite)). Or if you already got local files  you may directly use that path to initialize the model.  
-  > Note: <strong>.safetensors</strong> files are not required in the directory. We only need config files to build model and tokenizer.
- `--gguf_path` (required): Path of a directory containing GGUF files which could that can be downloaded from [Hugging Face](https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/tree/main). Note that the directory should only contains GGUF of current model, which means you need one separate directory for each model.
- `--optimize_rule_path` (required except for Qwen2Moe and DeepSeek-V2): Path of YAML file containing optimize rules. There are two rule files pre-written in the [ktransformers/optimize/optimize_rules](ktransformers/optimize/optimize_rules) directory for optimizing DeepSeek-V2 and Qwen2-57B-A14, two SOTA MoE models.
- `--max_new_tokens`: Int (default=1000). Maximum number of new tokens to generate.
- `--cpu_infer`: Int (default=10). The number of CPUs used for inference. Should ideally be set to the (total number of cores - 2).
-<h3 id="suggested-model"> Suggested Model</h3>
-| Model Name                     | Model Size | VRAM  | Minimum DRAM    | Recommended DRAM  |
-| ------------------------------ | ---------- | ----- | --------------- | ----------------- |
-| DeepSeek-R1-q4_k_m		 | 377G       | 14G   | 382G            | 512G		    |
-| DeepSeek-V3-q4_k_m		 | 377G       | 14G   | 382G            | 512G		    |
-| DeepSeek-V2-q4_k_m             | 133G       | 11G   | 136G            | 192G              |
-| DeepSeek-V2.5-q4_k_m           | 133G       | 11G   | 136G            | 192G              |
-| DeepSeek-V2.5-IQ4_XS           | 117G       | 10G   | 107G            | 128G              |
-| Qwen2-57B-A14B-Instruct-q4_k_m | 33G        | 8G    | 34G             | 64G               |
-| DeepSeek-V2-Lite-q4_k_m        | 9.7G       | 3G    | 13G             | 16G               |
-| Mixtral-8x7B-q4_k_m            | 25G        | 1.6G  | 51G             | 64G               |
-| Mixtral-8x22B-q4_k_m           | 80G        | 4G    | 86.1G           | 96G               |
-| InternLM2.5-7B-Chat-1M         | 15.5G      | 15.5G | 8G(32K context) | 150G (1M context) |
-More will come soon. Please let us know which models you are most interested in. 
-Be aware that you need to be subject to their corresponding model licenses when using [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2/blob/main/LICENSE) and [QWen](https://huggingface.co/Qwen/Qwen2-72B-Instruct/blob/main/LICENSE).
-<details>
-  <summary>Click To Show how to run other examples</summary>
-* Qwen2-57B
-  ```sh
-  pip install flash_attn # For Qwen2
-  mkdir Qwen2-57B-GGUF && cd Qwen2-57B-GGUF
-  wget https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct-GGUF/resolve/main/qwen2-57b-a14b-instruct-q4_k_m.gguf?download=true -O qwen2-57b-a14b-instruct-q4_k_m.gguf
-  cd ..
-  python -m ktransformers.local_chat --model_name Qwen/Qwen2-57B-A14B-Instruct --gguf_path ./Qwen2-57B-GGUF
-  # If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try：
-  # GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct
-  # python  ktransformers/local_chat.py --model_path ./Qwen2-57B-A14B-Instruct --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
-  ```
-* DeepseekV2
-  ```sh
-  mkdir DeepSeek-V2-Chat-0628-GGUF && cd DeepSeek-V2-Chat-0628-GGUF
-  # Download weights
-  wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00001-of-00004.gguf -o DeepSeek-V2-Chat-0628-Q4_K_M-00001-of-00004.gguf
-  wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00002-of-00004.gguf -o DeepSeek-V2-Chat-0628-Q4_K_M-00002-of-00004.gguf
-  wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00003-of-00004.gguf -o DeepSeek-V2-Chat-0628-Q4_K_M-00003-of-00004.gguf
-  wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/DeepSeek-V2-Chat-0628-Q4_K_M/DeepSeek-V2-Chat-0628-Q4_K_M-00004-of-00004.gguf -o DeepSeek-V2-Chat-0628-Q4_K_M-00004-of-00004.gguf
-  cd ..
+Getting started with KTransformers is simple! Follow the steps below to set up and start using it.
-  python -m ktransformers.local_chat --model_name deepseek-ai/DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
+### 📥 Installation
-  # If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try：
+To install KTransformers, follow the official [Installation Guide](https://kvcache-ai.github.io/ktransformers/en/install.html).
-  # GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat-0628
-  # python -m ktransformers.local_chat --model_path ./DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
-  ```
-| model name | weights download link |
-|----------|----------|
-| Qwen2-57B | [Qwen2-57B-A14B-gguf-Q4K-M](https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct-GGUF/tree/main) |
-| DeepseekV2-coder |[DeepSeek-Coder-V2-Instruct-gguf-Q4K-M](https://huggingface.co/LoneStriker/DeepSeek-Coder-V2-Instruct-GGUF/tree/main) |
-| DeepseekV2-chat |[DeepSeek-V2-Chat-gguf-Q4K-M](https://huggingface.co/bullerwins/DeepSeek-V2-Chat-0628-GGUF/tree/main) |
-| DeepseekV2-lite | [DeepSeek-V2-Lite-Chat-GGUF-Q4K-M](https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/tree/main) |
-</details>
-<!-- pin block for jump -->
-<span id='id_666'> 
-<h3>RESTful API and Web UI</h3>
-Start without website:
-```sh
-ktransformers --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path /path/to/DeepSeek-V2-Lite-Chat-GGUF --port 10002
-```
-Start with website:
-```sh
-ktransformers --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path /path/to/DeepSeek-V2-Lite-Chat-GGUF  --port 10002 --web True
-```
-Or you want to start server with transformers, the model_path should include safetensors
-```bash
-ktransformers --type transformers --model_path /mnt/data/model/Qwen2-0.5B-Instruct --port 10002 --web True
-```
-Access website with url [http://localhost:10002/web/index.html#/chat](http://localhost:10002/web/index.html#/chat) :
-<p align="center">
-  <picture>
-    <img alt="Web UI" src="https://github.com/user-attachments/assets/615dca9b-a08c-4183-bbd3-ad1362680faf" width=90%>
-  </picture>
-</p>
-More information about the RESTful API server can be found [here](doc/en/api/server/server.md). You can also find an example of integrating with Tabby [here](doc/en/api/server/tabby.md).
 <h2 id="tutorial">📃 Brief Injection Tutorial</h2>
 At the heart of KTransformers is a user-friendly, template-based injection framework. 
@@ -333,7 +127,7 @@ To utilize the provided kernels, users only need to create a YAML-based injectio
 ```python
 with torch.device("meta"):
    model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
-optimize_and_load_gguf(model, optimize_rule_path, gguf_path, config)
+optimize_and_load_gguf(model, optimize_config_path, gguf_path, config)
 ...
 generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens=1000)
 ```
@@ -368,14 +162,14 @@ If you are interested in our design principles and the implementation of the inj
 <h2 id="ack">Acknowledgment and Contributors</h2>
-The development of KTransformer is based on the flexible and versatile framework provided by Transformers. We also benefit from advanced kernels such as GGUF/GGML, Llamafile, and Marlin. We are planning to contribute back to the community by upstreaming our modifications.
+The development of KTransformer is based on the flexible and versatile framework provided by Transformers. We also benefit from advanced kernels such as GGUF/GGML, Llamafile, Marlin, sglang and flashinfer. We are planning to contribute back to the community by upstreaming our modifications.
 KTransformer is actively maintained and developed by contributors from the <a href="https://madsys.cs.tsinghua.edu.cn/">MADSys group</a> at Tsinghua University and members from <a href="http://approaching.ai/">Approaching.AI</a>. We welcome new contributors to join us in making KTransformer faster and easier to use.
 <h2 id="ack">Discussion</h2>
-If you have any questions, feel free to open an issue. Alternatively, you can join our WeChat group for further discussion. QR Code: [WeChat Group](WeChatGrouop.jpg)
+If you have any questions, feel free to open an issue. Alternatively, you can join our WeChat group for further discussion. QR Code: [WeChat Group](WeChatGroup.png)
 <h2 id="FAQ">🙋 FAQ</h2>

--- a/README_ZH.md
+++ b/README_ZH.md
+<div align="center">
+  <!-- <h1>KTransformers</h1> -->
+  <p align="center">
+<picture>
+    <img alt="KTransformers" src="https://github.com/user-attachments/assets/d5a2492f-a415-4456-af99-4ab102f13f8b" width=50%>
+</picture>
+</p>
+  <h3>一个用于体验尖端 LLM 推理优化的灵活框架</h3>
+  <strong><a href="#show-cases">🌟 案例展示</a> | <a href="#quick-start">🚀 快速入门</a> | <a href="#tutorial">📃 教程</a> | <a href="https://github.com/kvcache-ai/ktransformers/discussions">💬 讨论</a> | <a href="#FAQ">🙋 常见问题</a> </strong>
+</div>
+<h2 id="intro">🎉 介绍</h2>
+KTransformers（发音为 Quick Transformers）旨在通过先进的内核优化和放置/并行策略来增强您对 🤗 [Transformers](https://github.com/huggingface/transformers) 的体验。
+<br/><br/>
+KTransformers 是一个以 Python 为中心的灵活框架，其核心是可扩展性。通过用一行代码实现并注入优化模块，用户可以获得与 Transformers 兼容的接口、符合 OpenAI 和 Ollama 的 RESTful API，甚至是一个简化的类似 ChatGPT 的 Web 界面。
+<br/><br/>
+我们对 KTransformers 的愿景是成为一个用于实验创新 LLM 推理优化的灵活平台。如果您需要任何其他功能，请告诉我们。
+<h2 id="Updates">🔥 更新</h2>
+* **2025 年 2 月 25 日**：为DeepSeek-V3/R1支持[FP8 GPU内核](./doc/en/fp8_kernel.md); 支持更长的上下文([教程](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context)).
+* **2025 年 2 月 15 日**：长上下文(从4K到8K，24GB VRAM) & 稍快的速度(+15%)(最快 16 Tokens/s)，文档请参见 [这里](./doc/en/DeepseekR1_V3_tutorial.md) 和 [在线指南](https://kvcache-ai.github.io/ktransformers/) 。
+* **2025 年 2 月 10 日**：支持 Deepseek-R1 和 V3 在单个（24GB VRAM）/多 GPU 和 382G DRAM 上运行，速度提升高达 3~28 倍。详细教程请参见 [这里](./doc/en/DeepseekR1_V3_tutorial.md)。
+* **2024 年 8 月 28 日**：支持 InternLM2.5-7B-Chat-1M 模型下的 1M 上下文，使用 24GB 的 VRAM 和 150GB 的 DRAM。详细教程请参见 [这里](./doc/en/long_context_tutorial.md)。
+* **2024 年 8 月 28 日**：将 DeepseekV2 所需的 VRAM 从 21G 降低到 11G。
+* **2024 年 8 月 15 日**：更新了详细的 [教程](doc/en/injection_tutorial.md)，介绍注入和多 GPU 的使用。
+* **2024 年 8 月 14 日**：支持 llamafile 作为线性后端。
+* **2024 年 8 月 12 日**：支持多 GPU；支持新模型：mixtral 8\*7B 和 8\*22B；支持 q2k、q3k、q5k 在 GPU 上的去量化。
+* **2024 年 8 月 9 日**：支持 Windows。
+<h2 id="show-cases">🌟 案例展示</h2>
+<div>
+<h3>在仅 24GB VRAM 的桌面上运行 GPT-4/o1 级别的本地 VSCode Copilot</h3>
+</div>
+<p align="center">
+https://github.com/user-attachments/assets/ebd70bfa-b2c1-4abb-ae3b-296ed38aa285
+</p>
+- **[NEW!!!] 本地 671B DeepSeek-Coder-V3/R1**：使用其 Q4_K_M 版本，仅需 14GB VRAM 和 382GB DRAM 即可运行（教程请参见 [这里](./doc/en/DeepseekR1_V3_tutorial.md)）。
+	- 预填充速度（tokens/s）：
+ 		- KTransformers：54.21（32 核）→ 74.362（双插槽，2×32 核）→ 255.26（优化的 AMX 基 MoE 内核，仅 V0.3）→ 286.55（选择性使用 6 个专家，仅 V0.3）
+ 		- 与 llama.cpp 在 2×32 核下相比，达到 **27.79× 速度提升**。
+ 	- 解码速度（tokens/s）：
+ 		- KTransformers：8.73（32 核）→ 11.26（双插槽，2×32 核）→ 13.69（选择性使用 6 个专家，仅 V0.3）
+ 		- 与 llama.cpp 在 2×32 核下相比，达到 **3.03× 速度提升**。
+	- 即将开源发布：
+		- AMX 优化和选择性专家激活将在 V0.3 中开源。
+		- 目前仅在预览二进制分发中可用，可从 [这里](./doc/en/DeepseekR1_V3_tutorial.md) 下载。
+- **本地 236B DeepSeek-Coder-V2**：使用其 Q4_K_M 版本，仅需 21GB VRAM 和 136GB DRAM 即可运行，甚至在 [BigCodeBench](https://huggingface.co/blog/leaderboard-bigcodebench) 中得分超过 GPT4-0613。
+<p align="center">
+  <picture>
+    <img alt="DeepSeek-Coder-V2 Score" src="https://github.com/user-attachments/assets/d052924e-8631-44de-aad2-97c54b965693" width=100%>
+  </picture>
+</p>
+- **更快的速度**：通过 MoE 卸载和注入来自 [Llamafile](https://github.com/Mozilla-Ocho/llamafile/tree/main) 和 [Marlin](https://github.com/IST-DASLab/marlin) 的高级内核，实现了 2K 提示预填充 126 tokens/s 和生成 13.6 tokens/s 的速度。
+- **VSCode 集成**：封装成符合 OpenAI 和 Ollama 的 API，可无缝集成到 [Tabby](https://github.com/TabbyML/tabby) 和其他前端的后端。
+<p align="center">
+https://github.com/user-attachments/assets/4c6a8a38-05aa-497d-8eb1-3a5b3918429c
+</p>
+<!-- <h3>在仅 24GB VRAM 的桌面上进行 1M 上下文本地推理</h3>
+<p align="center"> -->
+<!-- https://github.com/user-attachments/assets/a865e5e4-bca3-401e-94b8-af3c080e6c12 -->
+<!-- 
+* **1M 上下文 InternLM 2.5 7B**：以全 bf16 精度运行，使用 24GB VRAM 和 150GB DRAM，可在本地桌面设置中实现。在 1M "针在干草堆中" 测试中达到 92.88% 的成功率，在 128K NIAH 测试中达到 100%。
+<p align="center">
+  <picture>
+    <img alt="Single Needle Retrieval 128K" src="./doc/assets/needle_128K.png" width=100%>
+  </picture>
+</p>
+<p align="center">
+  <picture>
+    <img alt="Single Needle Retrieval 1000K" src="./doc/assets/needle_1M.png" width=100%>
+  </picture>
+</p>
+* **增强的速度**：使用稀疏注意力，通过 llamafile 内核实现 1M 上下文生成 16.91 tokens/s 的速度。这种方法比 llama.cpp 的全注意力方法快 10 倍以上。
+* **灵活的稀疏注意力框架**：提供了一个灵活的块稀疏注意力框架，用于 CPU 卸载解码。与 SnapKV、Quest 和 InfLLm 兼容。更多信息请参见 [这里](./doc/en/long_context_introduction.md)。 -->
+<strong>更多高级功能即将推出，敬请期待！</strong>
+<h2 id="quick-start">🚀 快速入门</h2>
+KTransformers 的入门非常简单！请参考我们的[安装指南](https://kvcache-ai.github.io/ktransformers/)进行安装。
+<h2 id="tutorial">📃 简要注入教程</h2>
+KTransformers 的核心是一个用户友好的、基于模板的注入框架。这使得研究人员可以轻松地将原始 torch 模块替换为优化的变体。它还简化了多种优化的组合过程，允许探索它们的协同效应。
+<br/>
+<p align="center">
+  <picture>
+    <img alt="Inject-Struction" src="https://github.com/user-attachments/assets/6b4c1e54-9f6d-45c5-a3fc-8fa45e7d257e" width=65%>
+  </picture>
+</p>
+鉴于 vLLM 已经是一个用于大规模部署优化的优秀框架，KTransformers 特别关注受资源限制的本地部署。我们特别关注异构计算时机，例如量化模型的 GPU/CPU 卸载。例如，我们支持高效的 <a href="https://github.com/Mozilla-Ocho/llamafile/tree/main">Llamafile</a> 和 <a href="https://github.com/IST-DASLab/marlin">Marlin</a> 内核，分别用于 CPU 和 GPU。 更多详细信息可以在 <a href="doc/en/operators/llamafile.md">这里</a>找到。
+<h3>示例用法</h3>
+要使用提供的内核，用户只需创建一个基于 YAML 的注入模板，并在使用 Transformers 模型之前添加对 `optimize_and_load_gguf` 的调用。
+```python
+with torch.device("meta"):
+    model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
+optimize_and_load_gguf(model, optimize_config_path, gguf_path, config)
+...
+generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens=1000)
+```
+在这个示例中，首先在 meta 设备上初始化 AutoModel，以避免占用任何内存资源。然后，`optimize_and_load_gguf` 遍历模型的所有子模块，匹配您的 YAML 规则文件中指定的规则，并将它们替换为指定的高级模块。
+注入后，原始的 `generate` 接口仍然可用，但我们还提供了一个兼容的 `prefill_and_generate` 方法，这使得可以进一步优化，例如使用 CUDAGraph 提高生成速度。
+<h3>如何自定义您的模型</h3>
+一个详细的使用 DeepSeek-V2 作为示例的注入和 multi-GPU 教程在 [这里](doc/en/injection_tutorial.md)。
+以下是一个将所有原始 Linear 模块替换为 Marlin 的 YAML 模板示例，Marlin 是一个高级的 4 位量化内核。
+```yaml
+- match:
+    name: "^model\\.layers\\..*$"  # 正则表达式 
+    class: torch.nn.Linear  # 仅匹配同时符合名称和类的模块
+  replace:
+    class: ktransformers.operators.linear.KTransformerLinear  # 量化数据类型的优化内核
+    device: "cpu"   # 初始化时加载该模块的 device
+    kwargs:
+      generate_device: "cuda"
+      generate_linear_type: "QuantizedLinearMarlin"
+```
+YAML 文件中的每个规则都有两部分：`match` 和 `replace`。`match` 部分指定应替换的模块，`replace` 部分指定要注入到模型中的模块以及初始化关键字。
+您可以在 [ktransformers/optimize/optimize_rules](ktransformers/optimize/optimize_rules) 目录中找到用于优化 DeepSeek-V2 和 Qwen2-57B-A14 的示例规则模板。这些模板用于为 `local_chat.py` 示例提供支持。
+如果您对我们的设计原则和注入框架的实现感兴趣，请参考 [设计文档](doc/en/deepseek-v2-injection.md)。
+<h2 id="ack">致谢和贡献者</h2>
+KTransformer 的开发基于 Transformers 提供的灵活和多功能框架。我们还受益于 GGUF/GGML、Llamafile 、 Marlin、sglang和flashinfer 等高级内核。我们计划通过向上游贡献我们的修改来回馈社区。
+KTransformer 由清华大学 <a href="https://madsys.cs.tsinghua.edu.cn/">MADSys group</a> 小组的成员以及 <a href="http://approaching.ai/">Approaching.AI</a> 的成员积极维护和开发。我们欢迎新的贡献者加入我们，使 KTransformer 更快、更易于使用。
+<h2 id="discussion">讨论</h2>
+如果您有任何问题，欢迎随时提出 issue。或者，您可以加入我们的微信群进行进一步讨论。二维码： [微信群](WeChatGroup.png)
+<h2 id="FAQ">🙋 常见问题</h2>
+一些常见问题的答案可以在 [FAQ](doc/en/FAQ.md) 中找到。 
--- a/WeChatGrouop.jpg
+++ b/WeChatGrouop.jpg
--- a/WeChatGroup.png
+++ b/WeChatGroup.png
--- a/book.toml
+++ b/book.toml
+[book]
+authors = ["kvcache-ai"]
+language = "zh-CN"
+title = "Ktransformers"
+src = "doc"
+[output.html]
+git-repository-url = "https://github.com/kvcache-ai/ktransformers"
+edit-url-template = "https://github.com/kvcache-ai/ktransformers/edit/main/{path}"
+[output.html.playground]
+editable = true
+copy-js = true
+# line-numbers = true
+[output.html.fold]
+enable = true
+level = 0
\ No newline at end of file
--- a/doc/README.md
+++ b/doc/README.md
+<div align="center">
+  <!-- <h1>KTransformers</h1> -->
+  <p align="center">
+<picture>
+    <img alt="KTransformers" src="https://github.com/user-attachments/assets/d5a2492f-a415-4456-af99-4ab102f13f8b" width=50%>
+</picture>
+</p>
+</div>
+<h2 id="intro">🎉 Introduction</h2>
+KTransformers, pronounced as Quick Transformers, is designed to enhance your 🤗 <a href="https://github.com/huggingface/transformers">Transformers</a> experience with advanced kernel optimizations and placement/parallelism strategies.
+<br/><br/>
+KTransformers is a flexible, Python-centric framework designed with extensibility at its core. 
+By implementing and injecting an optimized module with a single line of code, users gain access to a Transformers-compatible
+interface, RESTful APIs compliant with OpenAI and Ollama, and even a simplified ChatGPT-like web UI. 
+<br/><br/>
+Our vision for KTransformers is to serve as a flexible platform for experimenting with innovative LLM inference optimizations. Please let us know if you need any other features.
+<h2 id="Updates">🔥 Updates</h2>
+* **Mar 5, 2025**: Support unsloth 1.58/2.51 bits weights and [IQ1_S/FP8 hybrid](./en/fp8_kernel.md) weights. Support 139K [Longer Context](./en/DeepseekR1_V3_tutorial.md#v022-longer-context) for DeepSeek-V3 and R1 in 24GB VRAM.
+* **Feb 25, 2025**: Support [FP8 GPU kernel](./en/fp8_kernel.md) for DeepSeek-V3 and R1; [Longer Context](./en/DeepseekR1_V3_tutorial.md#v022-longer-context).
+* **Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup. The detailed tutorial is [here](./en/DeepseekR1_V3_tutorial.md).
+* **Aug 28, 2024**: Support 1M context under the InternLM2.5-7B-Chat-1M model, utilizing 24GB of VRAM and 150GB of DRAM. The detailed tutorial is [here](./en/long_context_tutorial.md).
+* **Aug 28, 2024**: Decrease DeepseekV2's required VRAM from 21G to 11G.
+* **Aug 15, 2024**: Update detailed [TUTORIAL](./en/injection_tutorial.md) for injection and multi-GPU. 
+* **Aug 14, 2024**: Support llamafile as linear backend. 
+* **Aug 12, 2024**: Support multiple GPU; Support new model: mixtral 8\*7B  and 8\*22B; Support q2k, q3k, q5k dequant on gpu.
+* **Aug 9, 2024**: Support windows native.
--- a/doc/SUMMARY.md
+++ b/doc/SUMMARY.md
+# Ktransformer
+[Introduction](./README.md)
+# Install
+- [Installation Guide](en/install.md)
+# Tutorial 
+- [Deepseek-R1/V3 Show Case/Tutorial](en/DeepseekR1_V3_tutorial.md)
+- [Why KTransformers So Fast](en/deepseek-v2-injection.md)
+- [Injection Tutorial](en/injection_tutorial.md)
+- [Multi-GPU Tutorial](en/multi-gpu-tutorial.md)
+- [Use FP8 GPU Kernel](en/fp8_kernel.md)
+# Server
+  - [Server](en/api/server/server.md)
+  - [Website](en/api/server/website.md)
+  - [Tabby](en/api/server/tabby.md)
+# For Developer
+- [Makefile Usage](en/makefile_usage.md)
+# FAQ
+- [FAQ](en/FAQ.md)
+# V3 Reproduction
+- [Success List](en/V3-success.md)
+# Benchmark
+- [Benchmark](en/benchmark.md)
\ No newline at end of file