Our vision for KTransformers is to serve as a flexible platform for experimenting with innovative LLM inference optimizations.
<h2 id="Updates">🔥 Updates</h2>
* **Mar 27, 2025**: Support multi-concurrency.
* **Mar 15, 2025**: Support ROCm on AMD GPUs ([Tutorial](./en/ROCm.md)).
* **Mar 5, 2025**: Support unsloth 1.58/2.51-bit weights and [IQ1_S/FP8 hybrid](./en/fp8_kernel.md) weights. Support 139K [Longer Context](./en/DeepseekR1_V3_tutorial.md#v022-longer-context) for DeepSeek-V3 and R1 in 24GB VRAM.
* **Feb 25, 2025**: Support [FP8 GPU kernel](./en/fp8_kernel.md) for DeepSeek-V3 and R1; [Longer Context](./en/DeepseekR1_V3_tutorial.md#v022-longer-context).
* **Feb 10, 2025**: Support DeepSeek-R1 and V3 on single-GPU (24GB VRAM) or multi-GPU setups with 382GB DRAM, up to a 3~28x speedup. The detailed tutorial is [here](./en/DeepseekR1_V3_tutorial.md).
* **Aug 28, 2024**: Support 1M context with the InternLM2.5-7B-Chat-1M model, using 24GB of VRAM and 150GB of DRAM. The detailed tutorial is [here](./en/long_context_tutorial.md).
* **Aug 28, 2024**: Decrease DeepSeek-V2's required VRAM from 21GB to 11GB.
* **Aug 15, 2024**: Update detailed [TUTORIAL](./en/injection_tutorial.md) for injection and multi-GPU.
* **Aug 14, 2024**: Support llamafile as a linear backend.
* **Aug 12, 2024**: Support multiple GPUs; support new models: Mixtral 8\*7B and 8\*22B; support q2k, q3k, q5k dequantization on GPU.
- Compared to 4.51 tokens/s in llama.cpp with 2×32 cores, this is up to a **3.03× speedup**.
We also preview our upcoming optimizations, including an Intel AMX-accelerated kernel and a selective expert activation method, which will significantly enhance performance. With V0.3-preview, we achieve up to 286 tokens/s for prefill, making it up to **28× faster than llama.cpp** for local inference.
The binary distribution is available now and the source code will come ASAP! Check out the wheel package [here](https://github.com/kvcache-ai/ktransformers/releases/download/v0.1.4/ktransformers-0.3.0rc0+cu126torch26fancy-cp311-cp311-linux_x86_64.whl)
> **Feb 15, 2025**: KTransformers V0.2.1: Longer Context (from 4K to 8K for 24GB VRAM) & Slightly Faster Speed (+15%) (Up to 16 Tokens/s), updated docs [here](./doc/en/DeepseekR1_V3_tutorial.md) and [online books](https://kvcache-ai.github.io/ktransformers/).
We sped up decode and prefill a little bit. The reason for the limited performance improvement is mainly that inference is still constrained by the CPU's computational speed and memory bandwidth; the MLA part handled by the GPU accounts for a relatively small proportion.
Besides the improvements in speed, we've also significantly updated the documentation to enhance usability, including:<br>
- Added a Multi-GPU configuration tutorial.
- Consolidated the installation guide.
- Added a detailed tutorial on registering extra GPU memory with ExpertMarlin.
## Show Case Environment
We run our best performance tests (V0.2) on <br>
CPU: Intel(R) Xeon(R) Gold 6454S, 1TB DRAM (2 NUMA nodes) <br>
GPU: 4090D, 24GB VRAM <br>
Memory: standard DDR5-4800 server DRAM (1TB), 8×DDR5-4800 per socket
## Bench Result
### V0.2.1
- Model: DeepseekV3-q4km (int4)<br>
- CPU: Intel(R) Xeon(R) Gold 6454S, 32 cores per socket, 2 sockets, 2 NUMA nodes
- GPU: 4090, 24GB VRAM
- We test after sufficient warm-up
#### Memory consumption:
- Single socket: 382GB DRAM, at least 14GB VRAM
- Dual socket: 1TB DRAM, at least 14GB VRAM
#### Change Log
- Longer Context (from 4K to 8K for 24GB VRAM) and Slightly Faster Speed (+15%):<br>
Integrated the highly efficient Triton MLA kernel from the fantastic sglang project, enabling a much longer context length and slightly faster prefill/decode speed.
- We suspect that some of the improvements come from the change of hardware platform (4090D → 4090)
- `--chunk_size`: Maximum number of tokens processed in a single run by the engine.
- `--cache_lens`: Total length of the KV cache allocated by the scheduler. All requests share a KV cache space corresponding to 32768 tokens, and the occupied space is released after the requests are completed.
- `--backend_type`: `balance_serve` is a multi-concurrency backend engine introduced in version v0.2.4. The original single-concurrency engine is `ktransformers`.
- `--max_batch_size`: Maximum number of requests (prefill + decode) processed in a single run by the engine. (Supported only by `balance_serve`)
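These flags belong to the server entry point. A minimal launch sketch is shown below; the script path, port, and model/GGUF paths are assumptions and should be adapted to your setup.
```shell
# Sketch only: entry script, port, and model/GGUF paths are placeholders.
python ktransformers/server/main.py \
  --model_path deepseek-ai/DeepSeek-V3 \
  --gguf_path /path/to/DeepSeek-V3-GGUF \
  --backend_type balance_serve \
  --max_batch_size 4 \
  --chunk_size 256 \
  --cache_lens 32768 \
  --port 10002
```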
### v0.2.2 & v0.2.3 longer context & FP8 kernel
#### longer context
To use this feature, [install flashinfer](https://github.com/flashinfer-ai/flashinfer) first.
Note: The latest MLA kernel in FlashInfer still has a few minor issues. They are continuously fixing them on the main branch. If you are using FlashInfer, please install it from the main source code.
If you want to use a long context (longer than 20K tokens) for prefill, enable matrix-absorption MLA during the prefill phase, which will significantly reduce the size of the KV cache. Modify the YAML file like this:
```yaml
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
...
...
      absorb_for_prefill: True # change this to True to enable long context (prefill may be slower).
```
If the VRAM is still insufficient, try reducing the `chunk_prefill_size` parameter (default is 8192) to further decrease the intermediate results during chunked prefill.
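As an illustration, assuming the chunk-prefill size is exposed as a command-line flag of `local_chat.py` (the flag name has varied between `chunk_prefill_size` and `chunk_size` across versions, so check `--help` first), lowering it could look like this sketch:
```shell
# Assumes the chunk-prefill size is exposed as a CLI flag of local_chat.py; verify with --help.
python ./ktransformers/local_chat.py \
  --model_path deepseek-ai/DeepSeek-V3 \
  --gguf_path /path/to/DeepSeek-V3-GGUF \
  --chunk_prefill_size 4096
```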
#### FP8 kernel
The DeepSeek-AI team provides FP8 safetensors for DeepSeek-R1/V3 models. We achieve performance optimization through the following works:
- **FP8 GPU Kernel Integration**: FP8 linear-layer acceleration kernels integrated into KTransformers
- **Hybrid Quantization Architecture**:
  - Attention and Shared-Expert modules use FP8 precision (enhances computational accuracy)
...
...
<when you see chat, then press enter to load the text prompt_file>
```
`<your model path>` can be a local path or an online Hugging Face path such as deepseek-ai/DeepSeek-V3. If you encounter connection problems online, try using a mirror (hf-mirror.com). <br>
`<your gguf path>` can also be online, but since it is large we recommend downloading it and quantizing the model to what you want (note that this is the directory path). <br>
`--max_new_tokens 1000` is the maximum output token length. If you find the answer is truncated, you
can increase this number for a longer answer (but be aware of OOM, and increasing it will slow down the generation rate).
The command `numactl -N 1 -m 1` aims to avoid data transfer between NUMA nodes.<br>
Attention! If you are testing R1, it may skip thinking. You can add the argument `--force_think true`. This is explained in the [FAQ](#faq) part.
...
...
Before you install (using install.sh or `make dev_install`), make sure to set the environment variable `USE_NUMA=1` via `export USE_NUMA=1` (if already installed, reinstall with this env var set). You may check the doc [here](./install.md) for install details. <br>
Test Command:
```shell
# ---For those who have not installed ktransformers---
<when you see chat, then press enter to load the text prompt_file>
```
The parameters' meanings are the same as in V0.2, but since we use dual sockets, we set `cpu_infer` to 65.
## Some Explanations
1. We also want to make further use of the two NUMA nodes on the Xeon Gold CPU.
To avoid the cost of data transfer between nodes, we "copy" the critical matrices onto
both nodes, which consumes more memory but accelerates the prefill and decoding process.
However, this method takes a huge amount of memory and is slow when loading weights, so be patient during loading
and monitor the memory usage. We are going to optimize this huge memory overhead. Stay tuned~ <br>
2. The command argument `--cpu_infer 65` specifies how many cores to use (it is okay if it exceeds the physical number,
but more is not always better; adjust it slightly below your actual number of cores). A concrete sketch combining points 1 and 2 follows after this list.<br>
3. Why CPU/GPU Hybrid Inference?
DeepSeek's MLA operators are highly computationally intensive. While running everything on CPU is possible, offloading the heavy computations to the GPU results in a massive performance boost.
4. Where Does the Speedup Come From?
- Expert Offload: Unlike traditional layer-based or KVCache offloading (as seen in llama.cpp), we offload the expert computation to the CPU and MLA/KVCache to GPU, aligning perfectly with DeepSeek’s architecture for optimal efficiency.
- Intel AMX Optimization – Our AMX-accelerated kernel is meticulously tuned, running several times faster than existing llama.cpp implementations. We plan to open-source this kernel after cleansing and are considering upstream contributions to llama.cpp.
5. Why Intel CPUs?
Intel is currently the only CPU vendor that supports AMX-like instructions, which deliver significantly better performance compared to AVX-only alternatives.
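As a concrete sketch of points 1 and 2 above (the entry script and model/GGUF paths are assumptions, not fixed requirements):
```shell
# Point 1: rebuild with NUMA-aware weight duplication enabled.
export USE_NUMA=1
bash install.sh

# Point 2: cpu_infer may slightly exceed the 2 x 32 physical cores used here.
python ./ktransformers/local_chat.py \
  --model_path deepseek-ai/DeepSeek-V3 \
  --gguf_path /path/to/DeepSeek-V3-GGUF \
  --cpu_infer 65
```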
## Next
### Faster
* The FlashInfer (https://github.com/flashinfer-ai/flashinfer) project is releasing an even more efficient fused MLA operator, promising further speedups
* vLLM has explored multi-token prediction in DeepSeek-V3, and support is on our roadmap for even better performance
* We are collaborating with Intel to enhance the AMX kernel (v0.3) and optimize for Xeon6/MRDIMM
### Easier
* Official Docker images to simplify installation
* Fix the server integration for web API access
* Fix local chat only accepting a single-line prompt (currently `\n` starts generation)
* Support for more quantization types, including the highly requested dynamic quantization from unsloth
Stay tuned for more updates!
## FAQ
### R1 No Thinking
Attention! If you are testing R1, it may skip thinking. You can add the argument `--force_think true`. Details are in the [FAQ](./FAQ.md). <br>
In this document, we will show you how to install and run KTransformers on your local machine. There are two versions:
* V0.2 is the current main branch.
* V0.3 is a preview version that only provides a binary distribution for now.
* To reproduce our DeepSeek-R1/V3 results, please refer to [Deepseek-R1/V3 Tutorial](./DeepseekR1_V3_tutorial.md) for more detail settings after installation.
## Preparation
Some preparation:
- CUDA 12.1 or above; if you don't have it yet, you may install it from [here](https://developer.nvidia.com/cuda-downloads).
```sh
# Adding CUDA to PATH
if [ -d "/usr/local/cuda/bin" ]; then
...
...
export CUDA_PATH=$CUDA_PATH:/usr/local/cuda
fi
```
- Linux-x86_64 with gcc, g++ and cmake (using Ubuntu as an example)
- We recommend using [Miniconda3](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh) or [Anaconda3](https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-x86_64.sh) to create a virtual environment with Python 3.11 to run our program. Assuming your Anaconda installation directory is `~/anaconda3`, you should ensure that the version identifier of the GNU C++ standard library used by Anaconda includes `GLIBCXX-3.4.32`.
```sh
conda create --name ktransformers python=3.11
conda activate ktransformers # you may need to run `conda init` and reopen the shell first
conda install -c conda-forge libstdcxx-ng # Anaconda provides a package called `libstdcxx-ng` that includes a newer version of `libstdc++`, which can be installed via `conda-forge`.
```
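A quick way to verify the `GLIBCXX` requirement mentioned above; the environment path is an assumption, so point it at the libstdc++ your Python actually loads:
```shell
# Should list GLIBCXX_3.4.32 among the supported versions.
strings ~/anaconda3/envs/ktransformers/lib/libstdc++.so.6 | grep GLIBCXX
```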
- Make sure that PyTorch, packaging, and ninja are installed. You can also [install previous versions of PyTorch](https://pytorch.org/get-started/previous-versions/).
- At the same time, you should download and install the corresponding version of flash-attention from https://github.com/Dao-AILab/flash-attention/releases.
## Installation
### Attention
If you want to use NUMA support, not only do you need to set `USE_NUMA=1`, but you also need to make sure you have installed libnuma-dev (`sudo apt-get install libnuma-dev` may help you).
[Optional] If you want to use the multi-concurrency version, please install the following dependencies.
<!-- 1. ~~Use a Docker image, see [documentation for Docker](./doc/en/Docker.md)~~
>We are working on the latest docker image, please wait for a while.
...
...
@@ -78,46 +86,53 @@ If you want to use numa support, not only do you need to set USE_NUMA=1, but you
For Windows we have prepared a pre-compiled whl package at [ktransformers-0.2.0+cu125torch24avx2-cp312-cp312-win_amd64.whl](https://github.com/kvcache-ai/ktransformers/releases/download/v0.2.0/ktransformers-0.2.0+cu125torch24avx2-cp312-cp312-win_amd64.whl), which requires cuda-12.5, torch-2.4, and python-3.12; more pre-compiled packages are being produced. -->
- [Optional] If you want to run with the website, please [compile the website](./api/server/website.md) before executing `bash install.sh`.
- For Linux
- For simple install:
```shell
bash install.sh
```
- For those who have two CPUs and 1TB of RAM:
```shell
# Make sure your system has dual sockets and RAM at least twice the model's size (e.g. 1TB RAM for a 512GB model)
apt install libnuma-dev
export USE_NUMA=1
bash install.sh # or: make dev_install
```
- For Windows
```shell
install.bat
```
* If you are a developer, you can make use of the Makefile to compile and format the code. <br> The detailed usage of the Makefile is [here](./makefile_usage.md).
<h3>Local Chat</h3>
We provide a simple command-line local chat Python script that you can run for testing.
> Note: this is a very simple test tool that only supports one round of chat without any memory of the last input; if you want to try the full ability of the model, you may go to [RESTful API and Web UI](#id_666).
- `--model_path` (required): Name of the model (such as "deepseek-ai/DeepSeek-V2-Lite-Chat", which will automatically download configs from [Hugging Face](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite)). If you already have local files, you may directly use that path to initialize the model.
> Note: <strong>.safetensors</strong> files are not required in the directory. We only need config files to build the model and tokenizer.
>
- `--gguf_path` (required): Path of a directory containing GGUF files, which can be downloaded from [Hugging Face](https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/tree/main). Note that the directory should contain only the GGUF files of the current model, which means you need one separate directory for each model.
- `--optimize_config_path` (required except for Qwen2Moe and DeepSeek-V2): Path of the YAML file containing optimize rules. There are two rule files pre-written in the [ktransformers/optimize/optimize_rules](ktransformers/optimize/optimize_rules) directory for optimizing DeepSeek-V2 and Qwen2-57B-A14B, two SOTA MoE models.
- `--max_new_tokens`: Int (default=1000). Maximum number of new tokens to generate.
- `--cpu_infer`: Int (default=10). The number of CPU cores used for inference. Should ideally be set to (total number of cores - 2).
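Putting these flags together, a minimal local chat invocation might look like the sketch below; the model name and GGUF path are illustrative.
```shell
# First run downloads configs from Hugging Face; the GGUF directory must contain only this model's files.
python ./ktransformers/local_chat.py \
  --model_path deepseek-ai/DeepSeek-V2-Lite-Chat \
  --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF \
  --max_new_tokens 1000 \
  --cpu_infer 30
```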
<h3>Start Server</h3>
We provide a server script, which supports multi-concurrency functionality in version v0.2.4.
- `--chunk_size`: Maximum number of tokens processed in a single run by the engine.
- `--cache_lens`: Total length of the KV cache allocated by the scheduler. All requests share a KV cache space corresponding to 32768 tokens, and the occupied space is released after the requests are completed.
- `--backend_type`: `balance_serve` is a multi-concurrency backend engine introduced in version v0.2.4. The original single-concurrency engine is `ktransformers`.
- `--max_batch_size`: Maximum number of requests (prefill + decode) processed in a single run by the engine. (Supported only by `balance_serve`)
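Once the server is running, you can smoke-test it over its OpenAI-compatible chat endpoint; the port, endpoint path, and model name below are assumptions rather than fixed defaults, so adjust them to your launch settings.
```shell
# Simple smoke test against the chat completions endpoint (port and model name are placeholders).
curl http://localhost:10002/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "DeepSeek-V3",
        "messages": [{"role": "user", "content": "Hello, who are you?"}],
        "max_tokens": 64
      }'
```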
More will come soon. Please let us know which models you are most interested in.
Be aware that you are subject to the corresponding model licenses when using [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2/blob/main/LICENSE) and [Qwen](https://huggingface.co/Qwen/Qwen2-72B-Instruct/blob/main/LICENSE).
</details>
<details>
<summary>Click To Show how to run other examples</summary>
...
...
1. We also want to make further use of the two NUMA nodes on the Xeon Gold CPU. To avoid the cost of data transfer between nodes, we "copy" the critical matrices onto both nodes, which increases memory usage but accelerates the prefill and decoding process. However, this method takes a large amount of memory and is slow when loading weights, so please be patient during loading and monitor memory usage. We plan to optimize this huge memory overhead. Stay tuned.