Unverified Commit a41d2163 authored by wang jiahao's avatar wang jiahao Committed by GitHub

Merge pull request #1013 from kvcache-ai/work-concurrent

In v0.2.4, we’ve added the highly desired multi-concurrency support to the community through a major refactor of the whole architecture.
parents f142f4df 4ed9744e
......@@ -341,7 +341,8 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
}, nullptr);
}
void MOE::forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend) {
void MOE::forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, int* batch_size_tensor, Backend* backend) {
qlen = batch_size_tensor[0];
if (qlen < config_.group_min_len) {
for (int i = 0; i < qlen; i++) {
forward_one(k, expert_ids + i * k, weights + i * k, (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
......@@ -350,5 +351,7 @@ void MOE::forward(int qlen, int k, const uint64_t* expert_ids, const float* weig
}
int forward_len = std::min(config_.group_max_len, qlen);
forward_many(forward_len, k, expert_ids, weights, input, output, backend);
forward(qlen - forward_len, k, expert_ids + forward_len * k, weights + forward_len * k, (uint8_t*)input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
batch_size_tensor[0] -= forward_len;
forward(qlen - forward_len, k, expert_ids + forward_len * k, weights + forward_len * k, (uint8_t*)input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), batch_size_tensor, backend);
}
\ No newline at end of file
......@@ -53,7 +53,7 @@ class MOE {
void warm_up(Backend* backend);
void forward_one(int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend);
void forward_many(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend);
void forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend);
void forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, int* batch_size_tensor, Backend* backend);
private:
MOEConfig config_;
......
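For readers skimming the diff above: the new `batch_size_tensor` argument lets `MOE::forward` read the effective batch size at call time, then split the work into groups of at most `group_max_len` rows, falling back to per-row `forward_one` calls for small batches. Below is a minimal Python sketch of that dispatch logic, for illustration only; the stub functions and config values are assumptions, not the actual C++ kernels.

```python
from dataclasses import dataclass

@dataclass
class MOEConfig:
    group_min_len: int = 4    # illustrative values, not the library defaults
    group_max_len: int = 32

# No-op stand-ins for the C++ MOE::forward_one / MOE::forward_many kernels.
def forward_one(k, expert_ids, weights, x_row, out_row, cfg):
    pass

def forward_many(n, k, expert_ids, weights, x, out, cfg):
    pass

def moe_forward(qlen, k, expert_ids, weights, x, out, batch_size, cfg):
    """Sketch of the chunked dispatch in MOE::forward after this change (illustrative only)."""
    qlen = batch_size[0]                 # runtime batch size now comes from the tensor
    if qlen < cfg.group_min_len:
        for i in range(qlen):            # small batch: fall back to per-row processing
            forward_one(k, expert_ids[i], weights[i], x[i], out[i], cfg)
        return
    step = min(cfg.group_max_len, qlen)  # handle at most group_max_len rows per group call
    forward_many(step, k, expert_ids[:step], weights[:step], x[:step], out[:step], cfg)
    batch_size[0] -= step                # consume the processed rows, then recurse on the rest
    moe_forward(qlen - step, k, expert_ids[step:], weights[step:],
                x[step:], out[step:], batch_size, cfg)

# Tiny usage example: 70 rows are processed as groups of 32, 32, and 6.
cfg = MOEConfig()
tokens = 70
batch_size = [tokens]
ids = [[0, 1]] * tokens
w = [[0.5, 0.5]] * tokens
x = [[0.0]] * tokens
out = [[0.0]] * tokens
moe_forward(tokens, 2, ids, w, x, out, batch_size, cfg)
```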
......@@ -22,13 +22,14 @@ Our vision for KTransformers is to serve as a flexible platform for experimentin
<h2 id="Updates">🔥 Updates</h2>
* **Mar 27, 2025**: Support Multi-concurrency.
* **Mar 15, 2025**: Support ROCm on AMD GPU ([Tutorial](./en/ROCm.md)).
* **Mar 5, 2025**: Support unsloth 1.58/2.51 bits weights and [IQ1_S/FP8 hybrid](./en/fp8_kernel.md) weights. Support 139K [Longer Context](./en/DeepseekR1_V3_tutorial.md#v022-longer-context) for DeepSeek-V3 and R1 in 24GB VRAM.
* **Feb 25, 2025**: Support [FP8 GPU kernel](./en/fp8_kernel.md) for DeepSeek-V3 and R1; [Longer Context](./en/DeepseekR1_V3_tutorial.md#v022-longer-context).
* **Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup. The detailed tutorial is [here](./en/DeepseekR1_V3_tutorial.md).
* **Aug 28, 2024**: Support 1M context under the InternLM2.5-7B-Chat-1M model, utilizing 24GB of VRAM and 150GB of DRAM. The detailed tutorial is [here](./en/long_context_tutorial.md).
* **Aug 28, 2024**: Decrease DeepseekV2's required VRAM from 21G to 11G.
* **Aug 15, 2024**: Update detailed [TUTORIAL](./en/injection_tutorial.md) for injection and multi-GPU.
* **Aug 14, 2024**: Support llamafile as linear backend.
* **Aug 12, 2024**: Support multiple GPUs; support new models: Mixtral 8\*7B and 8\*22B; support q2k, q3k, q5k dequant on GPU.
* **Aug 9, 2024**: Support Windows native.
......@@ -23,4 +23,28 @@
# Ktransformer
[Introduction](./README.md)
# Install
- [Installation Guide](en/install.md)
# Tutorial
- [Deepseek-R1/V3 Show Case/Tutorial](en/DeepseekR1_V3_tutorial.md)
- [Why KTransformers So Fast](en/deepseek-v2-injection.md)
- [Injection Tutorial](en/injection_tutorial.md)
- [Multi-GPU Tutorial](en/multi-gpu-tutorial.md)
- [Use FP8 GPU Kernel](en/fp8_kernel.md)
# Server
- [Server](en/api/server/server.md)
- [Website](en/api/server/website.md)
- [Tabby](en/api/server/tabby.md)
# For Developer
- [Makefile Usage](en/makefile_usage.md)
# FAQ
- [FAQ](en/FAQ.md)
# V3 Reproduction
- [Success List](en/V3-success.md)
# Benchmark
- [Benchmark](en/benchmark.md)
<!-- omit in toc -->
# GPT-4/o1-level Local VSCode Copilot on a Desktop with only 24GB VRAM
- [SUMMARY](#summary)
- [Show Case Environment](#show-case-environment)
- [Bench Result](#bench-result)
- [V0.2.1](#v021)
- [Memory consumption:](#memory-consumption)
- [Change Log](#change-log)
- [Benchmark Results](#benchmark-results)
- [V0.2](#v02)
- [Settings](#settings)
- [Memory consumption:](#memory-consumption-1)
- [Benchmark Results](#benchmark-results-1)
- [V0.3-Preview](#v03-preview)
- [Settings](#settings-1)
- [Memory consumptions:](#memory-consumptions)
- [Benchmark results](#benchmark-results-2)
- [How to Run](#how-to-run)
- [v0.2.2 \& v0.2.3 longer context \& FP8 kernel](#v022--v023-longer-context--fp8-kernel)
- [longer context](#longer-context)
- [FP8 kernel](#fp8-kernel)
- [V0.2 \& V0.2.1 Showcase](#v02--v021-showcase)
- [Single socket version (32 cores)](#single-socket-version-32-cores)
- [Dual socket version (64 cores)](#dual-socket-version-64-cores)
- [V0.3 Showcase](#v03-showcase)
- [Dual socket version (64 cores)](#dual-socket-version-64-cores-1)
- [Some Explanations](#some-explanations)
- [Next](#next)
- [Faster](#faster)
- [Easier](#easier)
- [FAQ](#faq)
- [R1 No Thinking](#r1-no-thinking)
- [More FAQ](#more-faq)
# SUMMARY
> **Feb 10, 2025**: Support DeepseekR1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup.<br>
Hi, we're the KTransformers team (formerly known for our local CPU/GPU hybrid inference open source project with DeepSeek-V2).
We've heard your requests for DeepSeek-R1/V3 support—and we're excited to finally deliver!
Apologies for the wait, but we've been cooking up something truly amazing!
Today, we're proud to announce that we not only support DeepSeek-R1/V3, as showcased in the video below:
https://github.com/user-attachments/assets/ebd70bfa-b2c1-4abb-ae3b-296ed38aa285
</p>
- **[NEW!!!] Local 671B DeepSeek-Coder-V3/R1:** Running its Q4_K_M version using only 14GB VRAM and 382GB DRAM.
- Prefill Speed (tokens/s):
- KTransformers: 54.21 (32 cores) → 74.362 (dual-socket, 2×32 cores) → 255.26 (optimized AMX-based MoE kernel, V0.3 only) → 286.55 (selectively using 6 experts, V0.3 only)
- Compared to 10.31 tokens/s in llama.cpp with 2×32 cores, achieving up to **27.79× speedup**.
- Decode Speed (tokens/s):
- KTransformers: 8.73 (32 cores) → 11.26 (dual-socket, 2×32 cores) → 13.69 (selectively using 6 experts, V0.3 only)
- Compared to 4.51 tokens/s in llama.cpp with 2×32 cores, achieving up to **3.03× speedup**.
We also preview our upcoming optimizations, including an Intel AMX-accelerated kernel and a selective expert activation method, which will significantly enhance performance. With V0.3-preview, we achieve up to 286 tokens/s for prefill, making it up to **28× faster than llama.cpp** for local inference.
The binary distribution is available now and the source code will come ASAP! Check out the wheel package [here](https://github.com/kvcache-ai/ktransformers/releases/download/v0.1.4/ktransformers-0.3.0rc0+cu126torch26fancy-cp311-cp311-linux_x86_64.whl)
> **Feb 15, 2025**: KTransformers V0.2.1: Longer Context (from 4K to 8K for 24GB VRAM) & Slightly Faster Speed (+15%) (Up to 16 Tokens/s), update docs [here](./doc/en/DeepseekR1_V3_tutorial.md) and [online books](https://kvcache-ai.github.io/ktransformers/).
We have sped up the decode and prefill speed a little bit. The limited performance improvement mainly lies in the fact that the inference process is still constrained by the CPU's computational speed and memory bandwidth; the MLA part handled by the GPU accounts for a relatively small proportion.
Besides the improvements in speed, we've also significantly updated the documentation to enhance usability, including:<br>
- Added a multi-GPU configuration tutorial.
- Consolidated the installation guide.
- Added a detailed tutorial on registering extra GPU memory with ExpertMarlin.
## Show Case Environment
We run our best performance tests (V0.2) on <br>
CPU: Intel (R) Xeon (R) Gold 6454S 1T DRAM (2 NUMA nodes) <br>
GPU: 4090D 24G VRAM <br>
Memory: standard DDR5-4800 server DRAM (1 TB), each socket with 8×DDR5-4800
## Bench Result
### V0.2.1
- Model: DeepseekV3-q4km (int4)<br>
- CPU: cpu_model_name: Intel (R) Xeon (R) Gold 6454S, 32 cores per socket, 2 sockets, 2 numa nodes
- GPU: 4090 24G VRAM
- We test after enough warm up
#### Memory consumption:
- Single socket: 382G DRAM, at least 14GB VRAM
- Dual socket: 1T DRAM, at least 14GB VRAM
#### Change Log
- Longer Context (from 4K to 8K for 24GB VRAM) and Slightly Faster Speed (+15%):<br>
Integrated the highly efficient Triton MLA Kernel from the fantastic sglang project, enabling much longer context length and slightly faster prefill/decode speed.
- We suspect that some of the improvements come from the change of hardware platform (4090D -> 4090).
#### Benchmark Results
"6 experts" case is part of V0.3's preview
| Prompt | hi (2) | 1K (969) | 2K (1930) | 4K (3846) | 8K (7678) |
| --- | --- | --- | --- | --- | --- |
| Output length | 10tokens | 300tokens | 300tokens | 300tokens | 300tokens |
| **6 experts V0.2.0** | | | | | |
| Prefill token/s | 13 | 105 | 102 | 88 | CUDA OOM |
| decode token/s | 16.8 | 15.4 | 14.2 | 13.0 | CUDA OOM |
| **6 experts V0.2.1** | | | | | |
| Prefill token/s | 13 | 111 | 112.5 | 102 **(1.16x speedup)** | 101 |
| decode token/s | 16.8 | 15.9 | 15.4 | 14.9 **(1.15x speedup)** | 13.9 |
| **8 experts V0.2.1** | | | | | |
| Prefill token/s | 12.2 | 88.2 | 88.5 | 81.9 | 80 |
| Decode token/s | 13.4 | 13.5 | 13.4 | 13.2 | 12.4 |
### V0.2
#### Settings
- Model: DeepseekV3-q4km (int4)<br>
- CPU: cpu_model_name: Intel (R) Xeon (R) Gold 6454S, 32 cores per socket, 2 sockets, 2 numa nodes
- GPU: 4090D 24G VRAM
- We test after enough warm up
#### Memory consumption:
- Single socket: 382G DRAM, at least 14GB VRAM
- Dual socket: 1T DRAM, at least 14GB VRAM
#### Benchmark Results
"6 experts" case is part of V0.3's preview
| Prompt<br>(500 tokens) | Dual socket Ktrans (6 experts) | Dual socket Ktrans (8 experts) | Single socket Ktrans (6 experts) | Single socket Ktrans (8 experts) | llama.cpp (8 experts) |
| ---------------------- | ------------------------------ | ------------------------------ | -------------------------------- | -------------------------------- | --------------------- |
| Prefill token/s | 97.32 | 82.94 | 65.14 | 54.21 | 10.31 |
| Decode token/s | 13.69 | 12.208 | 10.303 | 8.73 | 4.51 |
**The highest speedup reaches up to <u>3.03x</u> in decoding and <u>9.44x</u> in prefill.**
### V0.3-Preview
#### Settings
- Model: DeepseekV3-BF16 (online quant into int8 for CPU and int4 for GPU)
- CPU: cpu_model_name: Intel (R) Xeon (R) Gold 6454S, 32 cores per socket, 2 socket, 2 numa nodes
- GPU: (1~4)x 4090D 24GVRAM (requires more VRAM for longer prompt)
#### Memory consumptions:
- 644GB DRAM, at least 14GB VRAM
#### Benchmark results
| Prompt length | 1K | 2K | 4K | 8K |
| ---------------------------------- | ------ | ------ | ------ | ------ |
| KTrans (8 experts) Prefill token/s | 185.96 | 255.26 | 252.58 | 195.62 |
| KTrans (6 experts) Prefill token/s | 203.70 | 286.55 | 271.08 | 207.20 |
**The prefill of KTrans V0.3 is up to <u>3.45x</u> faster than KTrans V0.2, and up to <u>27.79x</u> faster than llama.cpp.**
**The decoding speed is the same as KTrans V0.2 (6 experts version), so it is omitted.**
The main acceleration comes from
- The Intel AMX instruction set and our specially designed cache-friendly memory layout
- An expert selection strategy that selects fewer experts based on offline profiling results on out-of-domain data
*From our research on DeepSeekV2, DeepSeekV3 and DeepSeekR1, when we slightly decrease the number of activated experts during inference, the output quality doesn't change, but decoding and prefill speed up, which is inspiring. So our showcase makes use of this finding.*
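To make the expert-selection idea concrete, here is a small, hypothetical PyTorch-style sketch of routing each token to fewer experts at inference time. The real selection happens inside the CPU/AMX expert kernels; the names and shapes below are illustrative, not KTransformers APIs.

```python
import torch

def route_tokens(router_logits: torch.Tensor, num_active_experts: int = 6):
    """Pick the top-k experts per token; lowering k from 8 to 6 trades a little
    routing fidelity for noticeably less CPU expert computation."""
    weights, expert_ids = torch.topk(router_logits, k=num_active_experts, dim=-1)
    weights = torch.softmax(weights, dim=-1)  # renormalize over the kept experts
    return expert_ids, weights

# Example: 4 tokens routed over 256 candidate experts, keeping 6 of them each.
logits = torch.randn(4, 256)
expert_ids, weights = route_tokens(logits, num_active_experts=6)
print(expert_ids.shape, weights.shape)  # torch.Size([4, 6]) torch.Size([4, 6])
```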
## How to Run
### v0.2.4
We provide a server script, which supports multi-concurrency functionality in version v0.2.4.
```
python ktransformers/server/main.py --model_path /mnt/data/models/DeepSeek-V3 --gguf_path /mnt/data/models/DeepSeek-V3-GGUF/DeepSeek-V3-Q4_K_M/ --cpu_infer 62 --optimize_config_path ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-serve.yaml --port 10002 --chunk_size 256 --max_new_tokens 1024 --max_batch_size 4 --cache_lens 32768 --backend_type balance_serve
```
It features the following arguments:
- `--chunk_size`: Maximum number of tokens processed in a single run by the engine.
- `--cache_lens`: Total length of kvcache allocated by the scheduler. All requests share a kvcache space corresponding to 32768 tokens, and the space occupied will be released after the requests are completed.
- `--backend_type`: `balance_serve` is a multi-concurrency backend engine introduced in version v0.2.4. The original single-concurrency engine is `ktransformers`.
- `--max_batch_size`: Maximum number of requests (prefill + decode) processed in a single run by the engine. (Supported only by `balance_serve`)
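Once the server is up, you can exercise the multi-concurrency backend by sending several requests to its OpenAI-compatible chat endpoint at the same time. Below is a minimal client sketch using Python's `requests` and a thread pool; the port, model name, and response fields mirror the curl example later in the docs and are assumptions you may need to adapt.

```python
import requests
from concurrent.futures import ThreadPoolExecutor

URL = "http://localhost:10002/v1/chat/completions"  # port taken from the launch command above

def ask(prompt: str) -> str:
    payload = {
        "model": "DeepSeek-V3",  # model name is illustrative; use whatever your server reports
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.3,
        "stream": False,         # set True for token streaming
    }
    resp = requests.post(URL, json=payload, timeout=600)
    resp.raise_for_status()
    return resp.json()["choices"][0]["message"]["content"]

# Fire 4 concurrent requests to match --max_batch_size 4.
prompts = ["hello", "what is MoE?", "explain KV cache", "write a haiku about GPUs"]
with ThreadPoolExecutor(max_workers=4) as pool:
    for answer in pool.map(ask, prompts):
        print(answer[:80], "...")
```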
### v0.2.2 & v0.2.3 longer context & FP8 kernel
#### longer context
To use this feature, [install flashinfer](https://github.com/flashinfer-ai/flashinfer) first.
Note: The latest MLA kernel in FlashInfer still has a few minor issues. They are continuously fixing them on the main branch. If you are using FlashInfer, please install it from the main source code.
If you want to use long context (longer than 20K) for prefill, enable matrix-absorption MLA during the prefill phase, which will significantly reduce the size of the KV cache. Modify the YAML file like this:
```
- match:
name: "^model\\.layers\\..*\\.self_attn$"
......@@ -175,10 +210,12 @@ If you want to use long context(longer than 20K) for prefill, enable the matrix
absorb_for_prefill: True # change this to True to enable long context(prefill may slower).
```
If the VRAM is still insufficient, try reducing the `chunk_size` parameter (default is 8192) to further decrease the intermediate results during chunk prefill.
#### FP8 kernel
The DeepSeek-AI team provides FP8 safetensors for DeepSeek-R1/V3 models. We achieve performance optimization through the following work:
- **FP8 GPU Kernel Integration**: FP8 linear layer acceleration kernels integrated in KTransformers
- **Hybrid Quantization Architecture**:
- Attention and Shared-Expert modules use FP8 precision (enhances computational accuracy)
......@@ -189,16 +226,20 @@ So those who are persuing the best performance can use the FP8 linear kernel for
The detailed guide is [here](./fp8_kernel.md).
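As a rough illustration of the FP8 weight path (not the actual KTransformers GPU kernel, which is a custom fused implementation), the sketch below quantizes a linear weight to `float8_e4m3fn` with a per-tensor scale and dequantizes it for the matmul. It assumes a PyTorch build with float8 dtypes; all names are illustrative.

```python
import torch

def quantize_fp8(w: torch.Tensor):
    """Per-tensor FP8 (e4m3) quantization of a weight matrix (sketch only)."""
    scale = w.abs().max() / 448.0               # 448 is the largest normal e4m3 value
    w_fp8 = (w / scale).to(torch.float8_e4m3fn)
    return w_fp8, scale

def linear_fp8(x: torch.Tensor, w_fp8: torch.Tensor, scale: torch.Tensor):
    # Dequantize-then-matmul stand-in for a fused FP8 GEMM.
    return x @ (w_fp8.to(torch.float32) * scale).T

w = torch.randn(128, 256)
w_fp8, scale = quantize_fp8(w)
x = torch.randn(4, 256)
print(linear_fp8(x, w_fp8, scale).shape)        # torch.Size([4, 128])
```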
### V0.2 & V0.2.1 Showcase
#### Single socket version (32 cores)
Our local_chat test command is:
```shell
numactl -N 1 -m 1 python ./ktransformers/local_chat.py --model_path <your model path> --gguf_path <your gguf path> --prompt_file <your prompt txt file> --cpu_infer 33 --max_new_tokens 1000
<when you see chat, then press enter to load the text prompt_file>
```
`<your model path>` can be a local path or an online Hugging Face name such as deepseek-ai/DeepSeek-V3. If you run into connection problems online, try using a mirror (hf-mirror.com) <br>
`<your gguf path>` can also be online, but as it is large we recommend you download it and quantize the model to what you want (note that it is the directory path) <br>
`--max_new_tokens 1000` is the max output token length. If you find the answer is truncated, you can increase the number for a longer answer (but be aware of OOM; increasing it will slow down the generation rate).
The command `numactl -N 1 -m 1` aims to avoid data transfer between NUMA nodes<br>
Attention! If you are testing R1, it may skip thinking, so you can add the arg `--force_think true`. This is explained in the [FAQ](#faq) part.
......@@ -208,7 +249,8 @@ Attention! If you are testing R1 and it may skip thinking. So you can add arg: `
Make sure before you install (using install.sh or `make dev_install`) to set the env var `USE_NUMA=1` by `export USE_NUMA=1` (if already installed, reinstall it with this env var set). You may check the doc [here](./install.md) for install details. <br>
Test Command:
```shell
# ---For those who have not installed ktransformers---
# git clone https://github.com/kvcache-ai/ktransformers.git
# cd ktransformers
......@@ -220,53 +262,65 @@ Test Command:
python ./ktransformers/local_chat.py --model_path <your model path> --gguf_path <your gguf path> --prompt_file <your prompt txt file> --cpu_infer 65 --max_new_tokens 1000
<when you see chat, then press enter to load the text prompt_file>
```
The parameters have the same meaning. But as we use dual sockets, we set cpu_infer to 65.
### V0.3 Showcase
#### Dual socket version (64 cores)
Our local_chat test command is:
```shell
wget https://github.com/kvcache-ai/ktransformers/releases/download/v0.1.4/ktransformers-0.3.0rc0+cu126torch26fancy-cp311-cp311-linux_x86_64.whl
pip install ./ktransformers-0.3.0rc0+cu126torch26fancy-cp311-cp311-linux_x86_64.whl
python -m ktransformers.local_chat --model_path <your model path> --gguf_path <your gguf path> --prompt_file <your prompt txt file> --cpu_infer 65 --max_new_tokens 1000
<when you see chat, then press enter to load the text prompt_file>
```
The parameters have the same meaning as in V0.2. But as we use dual sockets, we set cpu_infer to 65.
## Some Explanations
1. We also want to make further use of the two NUMA nodes on the Xeon Gold CPU.
To avoid the cost of data transfer between nodes, we "copy" the critical matrices on
both nodes, which consumes more memory but accelerates the prefill and decoding process.
But this method takes huge memory and is slow when loading weights, so be patient when loading
and monitor the memory usage. We are going to optimize this huge memory overhead. Stay tuned~ <br>
2. The command arg `--cpu_infer 65` specifies how many cores to use (it's OK if it exceeds the physical number,
but it's not the more the better; adjust it slightly lower than your actual number of cores).<br>
3. Why CPU/GPU Hybrid Inference?
DeepSeek's MLA operators are highly computationally intensive. While running everything on CPU is possible, offloading the heavy computations to the GPU results in a massive performance boost.
4. Where Does the Speedup Come From?
- Expert Offload: Unlike traditional layer-based or KVCache offloading (as seen in llama.cpp), we offload the expert computation to the CPU and MLA/KVCache to GPU, aligning perfectly with DeepSeek’s architecture for optimal efficiency.
- Intel AMX Optimization – Our AMX-accelerated kernel is meticulously tuned, running several times faster than existing llama.cpp implementations. We plan to open-source this kernel after cleansing and are considering upstream contributions to llama.cpp.
5. Why Intel CPUs?
Intel is currently the only CPU vendor that supports AMX-like instructions, which delivers significantly better performance compared to AVX-only alternatives.
## Next
### Faster
* The FlashInfer (https://github.com/flashinfer-ai/flashinfer) project is releasing an even more efficient fused MLA operator, promising further speedups
* vLLM has explored multi-token prediction in DeepSeek-V3, and support is on our roadmap for even better performance
* We are collaborating with Intel to enhance the AMX kernel (v0.3) and optimize for Xeon6/MRDIMM
### Easier
* Official Docker images to simplify installation
* Fix the server integration for web API access
* Fix the local chat only accepting a single-line prompt (currently a newline (\n) starts generation)
* Support for more quantization types, including the highly requested dynamic quantization from unsloth
Stay tuned for more updates!
## FAQ
### R1 No Thinking
Attention! If you are testing R1, it may skip thinking, so you can add the arg `--force_think true`. The details are in the [FAQ](./FAQ.md) part. <br>
### More FAQ
[See detail](./FAQ.md)
# Balance Serve backend (multi-concurrency) for ktransformers
## KTransformers v0.2.4 Release Notes
We are excited to announce the official release of the long-awaited **KTransformers v0.2.4**!
In this version, we’ve added highly desired **multi-concurrency** support to the community through a major refactor of the whole architecture, updating more than 10,000 lines of code.
By drawing inspiration from the excellent architecture of sglang, we have implemented high-performance asynchronous concurrent scheduling in C++, including features like continuous batching, chunked prefill, and more. Thanks to GPU sharing in concurrent scenarios, overall throughput is also improved to a certain extent. The following is a demonstration:
https://github.com/user-attachments/assets/faa3bda2-928b-45a7-b44f-21e12ec84b8a
</p>
### 🚀 Key Updates
1. Multi-Concurrency Support
- Added capability to handle multiple concurrent inference requests. Supports receiving and executing multiple tasks simultaneously.
- We implemented [custom_flashinfer](https://github.com/kvcache-ai/custom_flashinfer/tree/fix-precision-mla-merge-main) based on the high-performance and highly flexible operator library [flashinfer](https://github.com/flashinfer-ai/flashinfer/), and achieved a variable batch size CUDA Graph, which further enhances flexibility while reducing memory and padding overhead.
- In our benchmarks, overall throughput improved by approximately 130% under 4-way concurrency.
- With support from Intel, we tested KTransformers v0.2.4 on the latest Xeon6 + MRDIMM-8800 platform. By increasing concurrency, the total output throughput increased from 17 tokens/s to 40 tokens/s. We observed that the bottleneck has now shifted to the GPU. Using a higher-end GPU than the 4090D could further improve performance.
2. Engine Architecture Optimization
![image](https://github.com/user-attachments/assets/f5f001fa-dca7-4377-a01a-32192902aa47)
Inspired by the scheduling framework of sglang, we refactored KTransformers with a clearer three-layer architecture through an update of 11,000 lines of code, now supporting full multi-concurrency:
- Server: Handles user requests and serves the OpenAI-compatible API.
- Inference Engine: Executes model inference and supports chunked prefill.
- Scheduler: Manages task scheduling and request orchestration. Supports continuous batching by organizing queued requests into batches in a FCFS (first-come, first-served) manner and sending them to the inference engine; a toy sketch of this scheduling loop follows the list below.
3. Project Structure Reorganization
All C/C++ code is now centralized under the /csrc directory.
4. Parameter Adjustments
Removed some legacy and deprecated launch parameters for a cleaner configuration experience.
We plan to provide a complete parameter list and detailed documentation in future releases to facilitate flexible configuration and debugging.
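To illustrate the scheduling idea only (the production scheduler is the C++ implementation described above), here is a toy Python sketch of an FCFS continuous-batching loop: waiting requests are admitted as batch slots free up, prefill is chunked, and finished requests leave the batch immediately. All names, numbers, and data structures are illustrative assumptions.

```python
from collections import deque

class Request:
    def __init__(self, rid, prompt_tokens, max_new_tokens):
        self.rid = rid
        self.remaining_prefill = prompt_tokens   # prompt tokens still to prefill (chunked)
        self.remaining_decode = max_new_tokens   # tokens still to generate

def schedule_step(waiting, running, max_batch_size, chunk_size):
    # FCFS admission: pull waiting requests into the running batch while there is room.
    while waiting and len(running) < max_batch_size:
        running.append(waiting.popleft())
    step, budget = [], chunk_size
    for req in running:
        if req.remaining_prefill > 0:
            take = min(req.remaining_prefill, budget)
            if take == 0:
                continue                          # prefill budget exhausted for this step
            req.remaining_prefill -= take
            budget -= take
            step.append((req.rid, "prefill", take))
        elif req.remaining_decode > 0:
            req.remaining_decode -= 1
            step.append((req.rid, "decode", 1))
    # Continuous batching: finished requests leave immediately, freeing their slots.
    running[:] = [r for r in running if r.remaining_prefill > 0 or r.remaining_decode > 0]
    return step

# Example: three requests served with max_batch_size=2 and chunk_size=8.
waiting, running = deque([Request(1, 20, 4), Request(2, 5, 2), Request(3, 3, 2)]), []
while waiting or running:
    print(schedule_step(waiting, running, max_batch_size=2, chunk_size=8))
```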
### 📚 Upgrade Notes
- Due to parameter changes, users who have installed previous versions are advised to delete the ~/.ktransformers directory and reinitialize.
- To enable multi-concurrency, please refer to the latest documentation for configuration examples.
### What's Changed
Implemented **custom_flashinfer** @Atream @ovowei @qiyuxinlin
Implemented **balance_serve** engine based on **FlashInfer** @qiyuxinlin @ovowei
Implemented a **continuous batching** scheduler in C++ @ErvinXie
release: bump version v0.2.4 by @Atream @Azure-Tang @ErvinXie @qiyuxinlin @ovowei @KMSorSMS @SkqLiao
## Installation Guide
### 1. Set Up Conda Environment
We recommend using Miniconda3/Anaconda3 for environment management:
```bash
# Download Miniconda
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
# Create environment
conda create --name ktransformers python=3.11
conda activate ktransformers
# Install required libraries
conda install -c conda-forge libstdcxx-ng
# Verify GLIBCXX version (should include 3.4.32)
strings ~/anaconda3/envs/ktransformers/lib/libstdc++.so.6 | grep GLIBCXX
```
> **Note:** Adjust the Anaconda path if your installation directory differs from `~/anaconda3`
### 2. Install dependencies
```bash
sudo apt install libtbb-dev libssl-dev libcurl4-openssl-dev libaio1 libaio-dev libfmt-dev libgflags-dev zlib1g-dev patchelf
```
### 3. Build ktransformers
```bash
# Clone repository
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule update --init --recursive
# Install single NUMA dependencies
sudo env USE_BALANCE_SERVE=1 PYTHONPATH="$(which python)" PATH="$(dirname $(which python)):$PATH" bash ./install.sh
# Or Install Dual NUMA dependencies
sudo env USE_BALANCE_SERVE=1 USE_NUMA=1 PYTHONPATH="$(which python)" PATH="$(dirname $(which python)):$PATH" bash ./install.sh
```
## Running DeepSeek-R1-Q4KM Models
### 1. Run for 24GB VRAM GPUs
Use our optimized configuration for constrained VRAM:
```bash
python ktransformers/server/main.py \
--port 10002 \
--model_path <path_to_safetensor_config> \
--gguf_path <path_to_gguf_files> \
--optimize_config_path ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-serve.yaml \
--max_new_tokens 1024 \
--cache_lens 32768 \
--chunk_size 256 \
--max_batch_size 4 \
--backend_type balance_serve
```
It features the following arguments:
- `--max_new_tokens`: Maximum number of tokens generated per request.
- `--cache_lens`: Total length of kvcache allocated by the scheduler. All requests share a kvcache space corresponding to 32768 tokens, and the space occupied will be released after the requests are completed.
- `--chunk_size`: Maximum number of tokens processed in a single run by the engine.
- `--max_batch_size`: Maximum number of requests (prefill + decode) processed in a single run by the engine. (Supported only by `balance_serve`)
- `--backend_type`: `balance_serve` is a multi-concurrency backend engine introduced in version v0.2.4. The original single-concurrency engine is `ktransformers`.
### 2. Access the server
```
curl -X POST http://localhost:10002/v1/chat/completions \
-H "accept: application/json" \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": "hello"}
],
"model": "DeepSeek-R1",
"temperature": 0.3,
"top_p": 1.0,
"stream": true
}'
```
\ No newline at end of file
<!-- omit in toc -->
# How to Run DeepSeek-R1
- [How to Run DeepSeek-R1](#how-to-run-deepseek-r1)
- [Preparation](#preparation)
- [Installation](#installation)
- [Attention](#attention)
- [Supported models include](#supported-models-include)
- [Support quantize format](#support-quantize-format)
In this document, we will show you how to install and run KTransformers on your local machine. There are two versions:
* V0.2 is the current main branch.
* V0.3 is a preview version that only provides a binary distribution for now.
* To reproduce our DeepSeek-R1/V3 results, please refer to [Deepseek-R1/V3 Tutorial](./DeepseekR1_V3_tutorial.md) for more detail settings after installation.
## Preparation
Some preparation:
- CUDA 12.1 and above, if you didn't have it yet, you may install from [here](https://developer.nvidia.com/cuda-downloads).
```sh
# Adding CUDA to PATH
if [ -d "/usr/local/cuda/bin" ]; then
......@@ -32,39 +38,42 @@ Some preparation:
export CUDA_PATH=$CUDA_PATH:/usr/local/cuda
fi
```
- Linux-x86_64 with gcc, g++ and cmake (using Ubuntu as an example)
```sh
sudo apt-get update
sudo apt-get install build-essential cmake ninja-build patchelf
```
- We recommend using [Miniconda3](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh) or [Anaconda3](https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-x86_64.sh) to create a virtual environment with Python=3.11 to run our program. Assuming your Anaconda installation directory is `~/anaconda3`, you should ensure that the version identifier of the GNU C++ standard library used by Anaconda includes `GLIBCXX-3.4.32`
```sh
conda create --name ktransformers python=3.11
conda activate ktransformers # you may need to run ‘conda init’ and reopen shell first
conda install -c conda-forge libstdcxx-ng # Anaconda provides a package called `libstdcxx-ng` that includes a newer version of `libstdc++`, which can be installed via `conda-forge`.
strings ~/anaconda3/envs/ktransformers/lib/libstdc++.so.6 | grep GLIBCXX
```
- Make sure that PyTorch, packaging, and ninja are installed. You can also [install previous versions of PyTorch](https://pytorch.org/get-started/previous-versions/)
```
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
pip3 install packaging ninja cpufeature numpy
```
- At the same time, you should download and install the corresponding version of flash-attention from https://github.com/Dao-AILab/flash-attention/releases.
## Installation
### Attention
If you want to use NUMA support, not only do you need to set USE_NUMA=1, but you also need to make sure you have installed libnuma-dev (`sudo apt-get install libnuma-dev` may help you).
[Optional] If you want to use the multi-concurrent version, please install the following dependencies.
```
sudo apt install libtbb-dev libssl-dev libcurl4-openssl-dev libaio1 libaio-dev libgflags-dev zlib1g-dev libfmt-dev
```
<!-- 1. ~~Use a Docker image, see [documentation for Docker](./doc/en/Docker.md)~~
>We are working on the latest docker image, please wait for a while.
......@@ -78,46 +87,53 @@ If you want to use numa support, not only do you need to set USE_NUMA=1, but you
for windows we prepare a pre compiled whl package on [ktransformers-0.2.0+cu125torch24avx2-cp312-cp312-win_amd64.whl](https://github.com/kvcache-ai/ktransformers/releases/download/v0.2.0/ktransformers-0.2.0+cu125torch24avx2-cp312-cp312-win_amd64.whl), which require cuda-12.5, torch-2.4, python-3.11, more pre compiled package are being produced. -->
Download source code and compile:
- init source code
```sh
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule update --init --recursive
```
- [Optional] If you want to run with the website, please [compile the website](./api/server/website.md) before executing ``bash install.sh``
- For Linux
- For simple install:
```shell
bash install.sh
```
- For those who have two cpu and 1T RAM:
```shell
# Make sure your system has dual sockets and double size RAM than the model's size (e.g. 1T RAM for 512G model)
apt install libnuma-dev
export USE_NUMA=1
bash install.sh # or #make dev_install
```
- For Multi-concurrency with 500G RAM:
```shell
sudo env USE_BALANCE_SERVE=1 PYTHONPATH="$(which python)" PATH="$(dirname $(which python)):$PATH" bash ./install.sh
```
- For Multi-concurrency with two CPUs and 1T RAM:
```shell
sudo env USE_BALANCE_SERVE=1 USE_NUMA=1 PYTHONPATH="$(which python)" PATH="$(dirname $(which python)):$PATH" bash ./install.sh
```
- For Windows (Windows native is temporarily deprecated; please try WSL)
```shell
install.bat
```
* If you are a developer, you can make use of the makefile to compile and format the code. <br> The detailed usage of the makefile is [here](./makefile_usage.md)
<h3>Local Chat</h3>
We provide a simple command-line local chat Python script that you can run for testing.
> Note: this is a very simple test tool that only supports one-round chat without any memory of the last input; if you want to try the full ability of the model, you may go to [RESTful API and Web UI](#id_666).
<h4>Run Example</h4>
......@@ -141,57 +157,72 @@ python -m ktransformers.local_chat --model_path deepseek-ai/DeepSeek-V2-Lite-Cha
# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite
# python ktransformers.local_chat --model_path ./DeepSeek-V2-Lite --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
```
It features the following arguments:
- `--model_path` (required): Name of the model (such as "deepseek-ai/DeepSeek-V2-Lite-Chat" which will automatically download configs from [Hugging Face](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite)). Or if you already got local files you may directly use that path to initialize the model.
> Note: <strong>.safetensors</strong> files are not required in the directory. We only need config files to build model and tokenizer.
>
- `--gguf_path` (required): Path of a directory containing GGUF files, which can be downloaded from [Hugging Face](https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/tree/main). Note that the directory should only contain GGUF files of the current model, which means you need one separate directory for each model.
- `--optimize_config_path` (required except for Qwen2Moe and DeepSeek-V2): Path of YAML file containing optimize rules. There are two rule files pre-written in the [ktransformers/optimize/optimize_rules](ktransformers/optimize/optimize_rules) directory for optimizing DeepSeek-V2 and Qwen2-57B-A14, two SOTA MoE models.
- `--max_new_tokens`: Int (default=1000). Maximum number of new tokens to generate.
- `--cpu_infer`: Int (default=10). The number of CPUs used for inference. Should ideally be set to the (total number of cores - 2).
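Following the guideline above, one simple (hypothetical) way to pick `--cpu_infer` programmatically is to leave a couple of cores free for the rest of the system:

```python
import os

# Leave two cores for the OS and the GPU-side threads, per the guideline above.
cpu_infer = max((os.cpu_count() or 12) - 2, 1)
print(f"--cpu_infer {cpu_infer}")
```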
<h3>Start Server</h3>
We provide a server script, which supports multi-concurrency functionality in version v0.2.4.
```
python ktransformers/server/main.py --model_path /mnt/data/models/DeepSeek-V3 --gguf_path /mnt/data/models/DeepSeek-V3-GGUF/DeepSeek-V3-Q4_K_M/ --cpu_infer 62 --optimize_config_path ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-serve.yaml --port 10002 --chunk_size 256 --max_new_tokens 1024 --max_batch_size 4 --cache_lens 32768 --backend_type balance_serve
```
It features the following arguments:
- `--chunk_size`: Maximum number of tokens processed in a single run by the engine.
- `--cache_lens`: Total length of kvcache allocated by the scheduler. All requests share a kvcache space corresponding to 32768 tokens, and the space occupied will be released after the requests are completed.
- `--backend_type`: `balance_serve` is a multi-concurrency backend engine introduced in version v0.2.4. The original single-concurrency engine is `ktransformers`.
- `--max_batch_size`: Maximum number of requests (prefill + decode) processed in a single run by the engine. (Supported only by `balance_serve`)
<details>
<summary>Supported Models/quantization</summary>
### Supported models include
| ✅**Supported Models** | ❌**Deprecated Models** |
| ---------------------- | -------------------------- |
| DeepSeek-R1 | ~~InternLM2.5-7B-Chat-1M~~ |
| DeepSeek-V3 | |
| DeepSeek-V2 | |
| DeepSeek-V2.5 | |
| Qwen2-57B | |
| DeepSeek-V2-Lite | |
| Mixtral-8x7B | |
| Mixtral-8x22B | |
### Support quantize format
| ✅**Supported Formats** | ❌**Deprecated Formats** |
| ----------------------- | ------------------------ |
| IQ1_S | ~~IQ2_XXS~~ |
| IQ2_XXS | |
| Q2_K_L | |
| Q2_K_XS | |
| Q3_K_M | |
| Q4_K_M | |
| Q5_K_M | |
| Q6_K | |
| Q8_0 | |
</details>
<details>
<summary>Suggested Model</summary>
| Model Name | Model Size | VRAM | Minimum DRAM | Recommended DRAM |
| ------------------------------ | ---------- | ----- | --------------- | ----------------- |
| DeepSeek-R1-q4_k_m | 377G | 14G | 382G | 512G |
| DeepSeek-V3-q4_k_m | 377G | 14G | 382G | 512G |
| DeepSeek-V2-q4_k_m | 133G | 11G | 136G | 192G |
| DeepSeek-V2.5-q4_k_m | 133G | 11G | 136G | 192G |
| DeepSeek-V2.5-IQ4_XS | 117G | 10G | 107G | 128G |
......@@ -201,12 +232,11 @@ It features the following arguments:
| Mixtral-8x22B-q4_k_m | 80G | 4G | 86.1G | 96G |
| InternLM2.5-7B-Chat-1M | 15.5G | 15.5G | 8G(32K context) | 150G (1M context) |
More will come soon. Please let us know which models you are most interested in.
Be aware that you need to be subject to their corresponding model licenses when using [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2/blob/main/LICENSE) and [QWen](https://huggingface.co/Qwen/Qwen2-72B-Instruct/blob/main/LICENSE).
</details>
<details>
<summary>Click To Show how to run other examples</summary>
......@@ -228,9 +258,8 @@ Be aware that you need to be subject to their corresponding model licenses when
# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct
# python ktransformers/local_chat.py --model_path ./Qwen2-57B-A14B-Instruct --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
```
* Deepseek-V2
```sh
mkdir DeepSeek-V2-Chat-0628-GGUF && cd DeepSeek-V2-Chat-0628-GGUF
# Download weights
......@@ -250,40 +279,38 @@ Be aware that you need to be subject to their corresponding model licenses when
# python -m ktransformers.local_chat --model_path ./DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
```
| model name | weights download link |
| ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| Qwen2-57B | [Qwen2-57B-A14B-gguf-Q4K-M](https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct-GGUF/tree/main) |
| DeepseekV2-coder | [DeepSeek-Coder-V2-Instruct-gguf-Q4K-M](https://huggingface.co/LoneStriker/DeepSeek-Coder-V2-Instruct-GGUF/tree/main) |
| DeepseekV2-chat | [DeepSeek-V2-Chat-gguf-Q4K-M](https://huggingface.co/bullerwins/DeepSeek-V2-Chat-0628-GGUF/tree/main) |
| DeepseekV2-lite | [DeepSeek-V2-Lite-Chat-GGUF-Q4K-M](https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/tree/main) |
| DeepSeek-R1 | [DeepSeek-R1-gguf-Q4K-M](https://huggingface.co/unsloth/DeepSeek-R1-GGUF/tree/main/DeepSeek-R1-Q4_K_M) |
</details>
<!-- pin block for jump -->
<span id='id_666'>
<h3>RESTful API and Web UI </h3>
Start without website:
```sh
ktransformers --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path /path/to/DeepSeek-V2-Lite-Chat-GGUF --port 10002
```
Start with website:
```sh
ktransformers --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path /path/to/DeepSeek-V2-Lite-Chat-GGUF --port 10002 --web True
```
Or if you want to start the server with transformers, the model_path should include safetensors
```bash
ktransformers --type transformers --model_path /mnt/data/model/Qwen2-0.5B-Instruct --port 10002 --web True
```
Access website with url [http://localhost:10002/web/index.html#/chat](http://localhost:10002/web/index.html#/chat) :
<p align="center">
......
<!-- omit in toc -->
# GPT-4/o1 级别本地 VSCode Copilot 在仅 24GB 显存的台式机上的表现
- [摘要](#摘要)
- [先决条件](#先决条件)
- [基准测试结果](#基准测试结果)
- [V0.2](#v02)
- [设置](#设置)
- [内存占用](#内存占用)
- [基准测试结果](#基准测试结果)
- [V0.3-Preview](#V0.3-Preview)
- [设置](#设置-1)
- [内存占用](#内存占用-1)
- [基准测试结果](#基准测试结果-1)
- [如何运行](#如何运行)
- [V0.2 展示](#v02-展示)
- [单插槽版本 (32 核心)](#单插槽版本(32 核心))
- [双插槽版本 (64 核心)](#双插槽版本(64 核心))
- [V0.3 展示](#v03-展示)
- [双插槽版本 (64 核心)](#双插槽版本(64 核心)-1)
- [一些解释](#一些解释)
- [常见问题解答](#常见问题解答)
- [R1 不思考](#R1 不返回思考过程)
- [更多常见问题解答](#更多常见问题解答)
- [先决条件](#先决条件)
- [基准测试结果](#基准测试结果)
- [V0.2](#v02)
- [设置](#设置)
- [内存占用](#内存占用)
- [基准测试结果](#基准测试结果)
- [V0.3-Preview](#V0.3-Preview)
- [设置](#设置-1)
- [内存占用](#内存占用-1)
- [基准测试结果](#基准测试结果-1)
- [如何运行](#如何运行)
- [V0.2 展示](#v02-展示)
- [单插槽版本 (32 核心)](#单插槽版本(32 核心))
- [双插槽版本 (64 核心)](#双插槽版本(64 核心))
- [V0.3 展示](#v03-展示)
- [双插槽版本 (64 核心)](#双插槽版本(64 核心)-1)
- [一些解释](#一些解释)
- [常见问题解答](#常见问题解答)
- [R1 不思考](#R1 不返回思考过程)
- [更多常见问题解答](#更多常见问题解答)
# Summary
......@@ -37,74 +39,125 @@ https://github.com/user-attachments/assets/ebd70bfa-b2c1-4abb-ae3b-296ed38aa285
</p>
- **[NEW!!!] Local 671B DeepSeek-Coder-V3/R1:** Running its Q4_K_M version using only 14GB VRAM and 382GB DRAM.
- Prefill Speed (tokens/s):
- KTransformers: 54.21 (32 cores) → 74.362 (dual-socket, 2×32 cores) → 255.26 (optimized AMX-based MoE kernel, V0.3 only) → 286.55 (selectively using 6 experts, V0.3 only)
- Compared to 10.31 tokens/s in llama.cpp with 2×32 cores, up to a **27.79× speedup**
- Decode Speed (tokens/s):
- KTransformers: 8.73 (32 cores) → 11.26 (dual-socket, 2×32 cores) → 13.69 (selectively using 6 experts, V0.3 only)
- Compared to 4.51 tokens/s in llama.cpp with 2×32 cores, up to a **3.03× speedup**
We also preview our upcoming optimizations, including an Intel AMX-accelerated kernel and a selective expert activation method, which will significantly enhance performance. With the V0.3 preview, we achieve up to 286 tokens/s for prefill, up to **28× faster than llama.cpp** for local inference. The binary distribution is available now and the source code is coming soon! Check out the wheel package [here](https://github.com/kvcache-ai/ktransformers/releases/download/v0.1.4/ktransformers-0.3.0rc0+cu126torch26fancy-cp311-cp311-linux_x86_64.whl)
## Prerequisites
We ran our best-performance tests (V0.2) on: <br>
CPU: Intel (R) Xeon (R) Gold 6454S, 1T DRAM (2 NUMA nodes) <br>
GPU: 4090D 24G VRAM <br>
Memory: standard DDR5-4800 server DRAM (1 TB)
## Benchmark Results
### V0.2
#### Settings
- Model: DeepseekV3-q4km (int4)<br>
- CPU: cpu_model_name: Intel (R) Xeon (R) Gold 6454S, 32 cores per socket, 2 sockets, 2 NUMA nodes
- GPU: 4090D 24G VRAM
- We test after sufficient warm-up
#### Memory Consumption:
- Single socket: 382G DRAM, at least 14GB VRAM
- Dual socket: 1T DRAM, at least 14GB VRAM
#### Benchmark Results
The "6 experts" case is part of V0.3's preview
| Prompt<br>(500 tokens) | Dual socket Ktrans (6 experts) | Dual socket Ktrans (8 experts) | Single socket Ktrans (6 experts) | Single socket Ktrans (8 experts) | llama.cpp (8 experts) |
| ---------------------- | ------------------------------ | ------------------------------ | -------------------------------- | -------------------------------- | --------------------- |
| Prefill token/s | 97.32 | 82.94 | 65.14 | 54.21 | 10.31 |
| Decode token/s | 13.69 | 12.208 | 10.303 | 8.73 | 4.51 |
**The highest speedup reaches up to <u>3.03x</u> in decoding and <u>9.44x</u> in prefill.**
### V0.3-Preview
#### Settings
- Model: DeepseekV3-BF16 (online quantization into int8 for CPU and int4 for GPU)
- CPU: cpu_model_name: Intel (R) Xeon (R) Gold 6454S, 32 cores per socket, 2 sockets, 2 NUMA nodes
- GPU: (1~4)x 4090D 24G VRAM (longer prompts require more VRAM)
#### Memory Consumption:
- 644GB DRAM, at least 14GB VRAM
#### Benchmark Results
| Prompt length | 1K | 2K | 4K | 8K |
| ---------------------------------- | ------ | ------ | ------ | ------ |
| KTrans (8 experts) Prefill token/s | 185.96 | 255.26 | 252.58 | 195.62 |
| KTrans (6 experts) Prefill token/s | 203.70 | 286.55 | 271.08 | 207.20 |
**The prefill of KTrans V0.3 is up to <u>3.45x</u> faster than KTrans V0.2, and up to <u>27.79x</u> faster than llama.cpp.**
**The decoding speed is the same as KTrans V0.2 (6 experts version), so it is omitted.**
The main acceleration comes from
- The Intel AMX instruction set and our specially designed cache-friendly memory layout
- An expert selection strategy that selects fewer experts based on offline profiling results
*From our research on DeepSeekV2, DeepSeekV3 and DeepSeekR1, when we slightly decrease the number of activated experts during inference, the output quality doesn't change, but decoding and prefill speed up, which is inspiring. So our showcase makes use of this finding.*
## How to Run
### Multi-Concurrency Showcase
Multi-concurrency requires additionally compiling the scheduler's C++ code:
```shell
sudo apt install libtbb-dev libssl-dev libcurl4-openssl-dev libaio1 libaio-dev libfmt-dev
sudo apt-get install libgflags-dev zlib1g-dev patchelf
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule update --init --recursive
# For the dual-NUMA version
sudo env USE_BALANCE_SERVE=1 USE_NUMA=1 PYTHONPATH="$(which python)" PATH="$(dirname $(which python)):$PATH" bash ./install.sh
# For the single-NUMA version
sudo env USE_BALANCE_SERVE=1 PYTHONPATH="$(which python)" PATH="$(dirname $(which python)):$PATH" bash ./install.sh
# Launch command
python ktransformers/server/main.py --model_path <your model path> --gguf_path <your gguf path> --cpu_infer 62 --optimize_config_path <inject rule path> --port 10002 --chunk_size 256 --max_new_tokens 1024 --max_batch_size 4 --cache_lens 32768 --backend_type balance_serve
```
`<your model path>` can be a local path or an online path such as deepseek-ai/DeepSeek-V3. If you run into connection problems online, try using a mirror (hf-mirror.com) <br>
`<your gguf path>` can also be online, but as it is large we recommend you download it and quantize the model to what you want (note that it is the directory path)
`<inject rule path>` is the path of the injection-rule YAML file. Under `ktransformers/optimize/optimize_rules/` we provide `DeepSeek-V3-Chat-serve.yaml` and `DeepSeek-V3-Chat-fp8-linear-ggml-experts-serve.yaml`, which correspond to [`DeepSeek-V3/R1-q4km`](https://huggingface.co/unsloth/DeepSeek-R1-GGUF/tree/main/DeepSeek-R1-Q4_K_M) and [`DeepSeek-V3/R1-hybrid`](https://huggingface.co/KVCache-ai/DeepSeek-R1-GGML-FP8-Hybrid/tree/main) respectively
`--max_new_tokens 1024` is the max output token length. If you find the answer is truncated, you can increase the number for a longer answer (but be aware of OOM; increasing it will slow down the generation rate).
`--chunk_size 256` is the maximum number of tokens processed by the engine in a single run
`--cache_lens 32768` is the total kvcache length allocated by the scheduler. All requests share the kvcache space corresponding to 32768 tokens, and the space a request occupies is released after it completes.
`--backend_type balance_serve`: `balance_serve` is the multi-concurrency backend engine added in v0.2.4; the original single-concurrency engine is `ktransformers`
`--max_batch_size 4` means the engine processes at most 4 requests (prefill + decode) in a single run (only used by `balance_serve`)
<br>The command `numactl -N 1 -m 1` aims to avoid data transfer between NUMA nodes<br>
Attention! If you are testing R1, it may skip thinking, so you can add the arg `--force_think`, which is explained in the [FAQ](#faq) section.
### V0.2 Showcase
#### Single Socket Version (32 cores)
Our local_chat test command is:
```shell
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule init
......@@ -112,17 +165,13 @@ git submodule update
numactl -N 1 -m 1 python ./ktransformers/local_chat.py --model_path <your model path> --gguf_path <your gguf path> --prompt_file <your prompt txt file> --cpu_infer 33 --max_new_tokens 1000
<when you see chat, then press enter to load the text prompt_file>
```
`<your model path>` can be a local path or an online path such as deepseek-ai/DeepSeek-V3. If you run into connection problems online, try using a mirror (hf-mirror.com) <br>
`<your gguf path>` can also be online, but as it is large we recommend you download it and quantize the model to what you want (note that it is the directory path) <br>
`--max_new_tokens 1000` is the max output token length. If you find the answer is truncated, you can increase the number for a longer answer (but be aware of OOM; increasing it will slow down the generation rate).
<br>
The command `numactl -N 1 -m 1` aims to avoid data transfer between NUMA nodes<br>
Attention! If you are testing R1, it may skip thinking, so you can add the arg `--force_think true`, which is explained in the [FAQ](#faq) section.
#### Dual Socket Version (64 cores)
Before you install (using install.sh or `make dev_install`), make sure to set the environment variable `USE_NUMA=1` via `export USE_NUMA=1` (if already installed, reinstall with this environment variable set) <br>
Our local_chat test command is:
```shell
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule init
......@@ -132,42 +181,48 @@ make dev_install # or sh ./install.sh
python ./ktransformers/local_chat.py --model_path <your model path> --gguf_path <your gguf path> --prompt_file <your prompt txt file> --cpu_infer 65 --max_new_tokens 1000
<when you see chat, then press enter to load the text prompt_file>
```
The parameters have the same meaning. But as we use dual sockets, we set cpu_infer to 65.
### V0.3 展示
#### 双插槽版本(64 核心)
我们的 local_chat 测试命令是:
``` shell
```shell
wget https://github.com/kvcache-ai/ktransformers/releases/download/v0.1.4/ktransformers-0.3.0rc0+cu126torch26fancy-cp311-cp311-linux_x86_64.whl
pip install ./ktransformers-0.3.0rc0+cu126torch26fancy-cp311-cp311-linux_x86_64.whl
python -m ktransformers.local_chat --model_path <your model path> --gguf_path <your gguf path> --prompt_file <your prompt txt file> --cpu_infer 65 --max_new_tokens 1000
<when you see chat, press enter to load the text prompt file>
```
The parameters have the same meaning as in V0.2. But because we use dual sockets, we set cpu_infer to 65.
## Some Explanations
1. We also want to further exploit the two NUMA nodes on the Xeon Gold CPU. To avoid the cost of data transfer between nodes, we "copy" the critical matrices on both nodes, which increases memory consumption but speeds up the prefill and decode processes. The downside is that this method uses a lot of memory and is slow when loading weights, so please be patient during loading and monitor memory usage. We are planning to optimize this huge memory overhead. Stay tuned.
2. The command argument `--cpu_infer 65` specifies how many cores to use (it is fine to exceed the number of physical cores, but more is not always better; lower this value appropriately according to your actual core count). A small helper for picking this value is sketched after this list. <br>
3. Why CPU/GPU hybrid inference?
   DeepSeek's MLA operators are computationally intensive. While running everything on the CPU is feasible, offloading the heavy computation to the GPU brings a huge performance boost.
4. Where does the speedup come from?
   - Expert offload: unlike traditional layer-based or KVCache offloading (as in llama.cpp), we offload expert computation to the CPU and MLA/KVCache to the GPU, aligning perfectly with DeepSeek's architecture for optimal efficiency.
   - Intel AMX optimization – our AMX-accelerated kernel is carefully tuned to run several times faster than existing llama.cpp implementations. We plan to open-source this kernel after cleanup and are considering contributing it upstream to llama.cpp.
5. Why Intel CPUs?
   Intel is currently the only CPU vendor that supports AMX-like instructions, which deliver significantly better performance than AVX-only alternatives.
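As referenced in point 2 above, here is a minimal sketch for estimating a reasonable `--cpu_infer` value from the machine's core count. It assumes the optional `psutil` package for the physical-core count, and the habit of reserving a couple of cores for the rest of the system is our own heuristic, not a project rule.

```python
import os

try:
    import psutil  # optional; pip install psutil
    physical = psutil.cpu_count(logical=False)
except ImportError:
    physical = None

# Fall back to logical cores if the physical count is unavailable.
cores = physical or os.cpu_count() or 1

# Heuristic (our assumption): leave a couple of cores free for the GPU
# driver, the scheduler, and other system work.
cpu_infer = max(1, cores - 2)

print(f"physical cores: {physical}, logical cores: {os.cpu_count()}")
print(f"suggested flag: --cpu_infer {cpu_infer}")
```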
## FAQ
### R1 Does Not Return the Thinking Process
Note! When testing R1, the model may skip the thinking stage. If so, add the argument `--force_think true`. Details are in the [FAQ](./FAQ.md). <br>
## Issues
* Fix the server integration to support web API access
* Fix local_chat only supporting single-line prompt input (currently, entering a newline (\n) starts generation)
### More FAQ
[See details](./FAQ.md)
......@@ -4,14 +4,23 @@ set -e
# clear build dirs
rm -rf build
rm -rf *.egg-info
rm -rf ktransformers/ktransformers_ext/build
rm -rf ktransformers/ktransformers_ext/cuda/build
rm -rf ktransformers/ktransformers_ext/cuda/dist
rm -rf ktransformers/ktransformers_ext/cuda/*.egg-info
rm -rf csrc/build
rm -rf csrc/ktransformers_ext/build
rm -rf csrc/ktransformers_ext/cuda/build
rm -rf csrc/ktransformers_ext/cuda/dist
rm -rf csrc/ktransformers_ext/cuda/*.egg-info
rm -rf ~/.ktransformers
echo "Installing python dependencies from requirements.txt"
pip install -r requirements-local_chat.txt
pip install -r ktransformers/server/requirements.txt
echo "Installing ktransformers"
KTRANSFORMERS_FORCE_BUILD=TRUE pip install . --no-build-isolation
KTRANSFORMERS_FORCE_BUILD=TRUE pip install -v . --no-build-isolation
pip install third_party/custom_flashinfer/
SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
echo "Copying thirdparty libs to $SITE_PACKAGES"
cp -a csrc/balance_serve/build/third_party/prometheus-cpp/lib/libprometheus-cpp-*.so* $SITE_PACKAGES/
patchelf --set-rpath '$ORIGIN' $SITE_PACKAGES/sched_ext.cpython*
echo "Installation completed successfully"
\ No newline at end of file
......@@ -8,4 +8,4 @@ Version : 1.0.0
LastEditors : chenxl
LastEditTime : 2025-02-15 03:53:02
'''
__version__ = "0.2.3post2"
__version__ = "0.2.4"
......@@ -21,7 +21,8 @@ user:
model:
# type: transformers
type: ktransformers
type: balance_serve
# type: ktransformers
name: DeepSeek-Coder-V2-Instruct
path: deepseek-ai/DeepSeek-V2-Lite-Chat
......@@ -29,7 +30,7 @@ model:
device: cuda:0
cache_lens: 8192
max_new_tokens: 500
web:
mount: False
open_cross_domain: True
......@@ -38,7 +39,6 @@ ext:
cpu_infer: 10
long_context:
chunk_size: 4096
max_seq_len: 32000
block_size: 128
local_windows_len: 4096
......@@ -54,4 +54,19 @@ long_context:
token_step:
local_chat:
prompt_file: ""
\ No newline at end of file
prompt_file: ""
async_server:
sched_strategy: "FCFS"
sched_port: 56441
sched_metrics_port: 54321
kvc2_metrics_port: 54391
max_batch_size: 4 # decode count + prefill count, in one mini batch
attn:
page_size: 256
chunk_size: 256
kvc2:
gpu_only: true
utilization_percentage: 1.0
cpu_memory_size_GB: 500
{
"DeepSeek-Coder-V2-Instruct": {
"hidden_size": 5120,
"intermediate_size": 12288,
"max_position_embeddings": 163840,
"model_type": "deepseek_v2",
"num_attention_heads": 128,
"num_hidden_layers": 60,
"num_key_value_heads": 128,
"vocab_size": 102400
},
"DeepSeek-R1": {
"hidden_size": 7168,
"intermediate_size": 18432,
"max_position_embeddings": 163840,
"model_type": "deepseek_v3",
"num_attention_heads": 128,
"num_hidden_layers": 61,
"num_key_value_heads": 128,
"vocab_size": 129280
},
"DeepSeek-V2-Lite-Chat": {
"hidden_size": 2048,
"intermediate_size": 10944,
"max_position_embeddings": 163840,
"model_type": "deepseek_v2",
"num_attention_heads": 16,
"num_hidden_layers": 27,
"num_key_value_heads": 16,
"vocab_size": 102400
},
"DeepSeek-V3": {
"hidden_size": 7168,
"intermediate_size": 18432,
"max_position_embeddings": 163840,
"model_type": "deepseek_v3",
"num_attention_heads": 128,
"num_hidden_layers": 3,
"num_key_value_heads": 128,
"vocab_size": 129280
},
"DeepSeek-V3-bf16": {
"hidden_size": 7168,
"intermediate_size": 18432,
"max_position_embeddings": 163840,
"model_type": "deepseek_v3",
"num_attention_heads": 128,
"num_hidden_layers": 61,
"num_key_value_heads": 128,
"vocab_size": 129280
},
"LLaMA-2-7B-32K": {
"hidden_size": 4096,
"intermediate_size": 11008,
"max_position_embeddings": 32768,
"model_type": "llama",
"num_attention_heads": 32,
"num_hidden_layers": 32,
"num_key_value_heads": 32,
"vocab_size": 32000
},
"Moonlight-16B-A3B-Instruct": {
"hidden_size": 2048,
"intermediate_size": 11264,
"max_position_embeddings": 8192,
"model_type": "deepseek_v3",
"num_attention_heads": 16,
"num_hidden_layers": 27,
"num_key_value_heads": 16,
"vocab_size": 163840
},
"Qwen2.5-32B-Instruct": {
"hidden_size": 5120,
"intermediate_size": 27648,
"max_position_embeddings": 32768,
"model_type": "qwen2",
"num_attention_heads": 40,
"num_hidden_layers": 64,
"num_key_value_heads": 8,
"vocab_size": 152064
},
"Qwen2.5-32B-Instruct-GPTQ-Int4": {
"hidden_size": 5120,
"intermediate_size": 27648,
"max_position_embeddings": 32768,
"model_type": "qwen2",
"num_attention_heads": 40,
"num_hidden_layers": 64,
"num_key_value_heads": 8,
"vocab_size": 152064
},
"Qwen2.5-7B-Instruct": {
"hidden_size": 3584,
"intermediate_size": 18944,
"max_position_embeddings": 32768,
"model_type": "qwen2",
"num_attention_heads": 28,
"num_hidden_layers": 28,
"num_key_value_heads": 4,
"vocab_size": 152064
},
"Qwen2.5-7B-Instruct-GPTQ-Int4": {
"hidden_size": 3584,
"intermediate_size": 18944,
"max_position_embeddings": 32768,
"model_type": "qwen2",
"num_attention_heads": 28,
"num_hidden_layers": 28,
"num_key_value_heads": 4,
"vocab_size": 152064
},
"qwen2-72b-instruct": {
"hidden_size": 8192,
"intermediate_size": 29568,
"max_position_embeddings": 32768,
"model_type": "qwen2",
"num_attention_heads": 64,
"num_hidden_layers": 80,
"num_key_value_heads": 8,
"vocab_size": 152064
}
}
\ No newline at end of file
{
"BF16": {
"block_element_count": 1,
"block_element_size": 2,
"bytes_per_element": 2.0,
"can_be_used_as_vector": true,
"has_min": false,
"has_scale": false,
"name": "BF16",
"reference": "",
"type_of_dot_vector": "BF16"
},
"FP16": {
"block_element_count": 1,
"block_element_size": 2,
"bytes_per_element": 2.0,
"can_be_used_as_vector": true,
"has_min": false,
"has_scale": false,
"name": "FP16",
"reference": "",
"type_of_dot_vector": "FP16"
},
"FP32": {
"block_element_count": 1,
"block_element_size": 4,
"bytes_per_element": 4.0,
"can_be_used_as_vector": true,
"has_min": false,
"has_scale": false,
"name": "FP32",
"reference": "",
"type_of_dot_vector": "FP32"
},
"Q4_0": {
"block_element_count": 32,
"block_element_size": 18,
"bytes_per_element": 0.5625,
"can_be_used_as_vector": false,
"has_min": false,
"has_scale": true,
"name": "Q4_0",
"reference": "https://huggingface.co/docs/hub/gguf",
"type_of_dot_vector": "Q8_0"
},
"Q8_0": {
"block_element_count": 32,
"block_element_size": 34,
"bytes_per_element": 1.0625,
"can_be_used_as_vector": true,
"has_min": false,
"has_scale": true,
"name": "Q8_0",
"reference": "https://huggingface.co/docs/hub/gguf",
"type_of_dot_vector": "Q8_0"
}
}
\ No newline at end of file