Merge branch 'main' into temperature_top_p_from_request

26f7b4af · wang jiahao · GitHub · 07eb712a · 1f28f75f · 26f7b4af
Unverified Commit 26f7b4af authored Feb 27, 2025 by wang jiahao Committed by GitHub Feb 27, 2025
20 changed files
--- a/ktransformers/ktransformers_ext/cuda/custom_gguf/ops.h
+++ b/ktransformers/ktransformers_ext/cuda/custom_gguf/ops.h
 /**
- * @Description  :  
+ * @Description  :
 * @Author       : Azure-Tang
 * @Date         : 2024-07-22 09:27:55
 * @Version      : 1.0.0
 * @LastEditors  : kkk1nak0
 * @LastEditTime : 2024-08-12 03:48:46
- * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
+ * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
 #pragma once

@@ -13,10 +13,10 @@
 #include <torch/extension.h>
 #include <torch/torch.h>

-torch::Tensor dequantize_q8_0(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::ScalarType target_dtype);
-torch::Tensor dequantize_q6_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::ScalarType target_dtype);
-torch::Tensor dequantize_q5_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::ScalarType target_dtype);
-torch::Tensor dequantize_q4_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::ScalarType target_dtype);
-torch::Tensor dequantize_q3_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::ScalarType target_dtype);
-torch::Tensor dequantize_q2_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::ScalarType target_dtype);
-torch::Tensor dequantize_iq4_xs(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::ScalarType target_dtype);
+torch::Tensor dequantize_q8_0(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
+torch::Tensor dequantize_q6_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
+torch::Tensor dequantize_q5_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
+torch::Tensor dequantize_q4_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
+torch::Tensor dequantize_q3_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
+torch::Tensor dequantize_q2_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
+torch::Tensor dequantize_iq4_xs(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
--- a/ktransformers/ktransformers_ext/operators/kvcache/kvcache_attn.cpp
+++ b/ktransformers/ktransformers_ext/operators/kvcache/kvcache_attn.cpp
@@ -10,6 +10,8 @@

 #include "kvcache.h"

+#include <chrono>
+
 void KVCache::attention_kvhead_(const uint16_t *q_in_data, ggml_fp16_t *output,
                                float *attn_lse, int batch_size,
                                Backend *backend) {

--- a/ktransformers/ktransformers_ext/operators/kvcache/kvcache_load_dump.cpp
+++ b/ktransformers/ktransformers_ext/operators/kvcache/kvcache_load_dump.cpp
@@ -9,6 +9,9 @@
 **/

 #include "kvcache.h"
+
+#include <chrono>
+
 void KVCache::load_kvcache(std::string tensor_file_path, Backend *backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();

--- a/ktransformers/ktransformers_ext/operators/kvcache/kvcache_read_write.cpp
+++ b/ktransformers/ktransformers_ext/operators/kvcache/kvcache_read_write.cpp
@@ -10,6 +10,8 @@

 #include "kvcache.h"

+#include <chrono>
+
 void KVCache::get_anchor_one_block(ggml_fp16_t *anchor, int layer_id,
                                   int block_idx, Backend *backend) {
    // Timer start

--- a/ktransformers/ktransformers_ext/operators/kvcache/kvcache_utils.cpp
+++ b/ktransformers/ktransformers_ext/operators/kvcache/kvcache_utils.cpp
@@ -10,6 +10,8 @@

 #include "kvcache.h"

+#include <chrono>
+
 std::string ggml_type_to_string(ggml_type type) {
    switch (type) {
    case GGML_TYPE_F32:

--- a/ktransformers/ktransformers_ext/triton/fp8gemm.py
+++ b/ktransformers/ktransformers_ext/triton/fp8gemm.py
+# Adopted from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/inference/kernel.py
+from typing import Tuple
+
+import torch
+import triton
+import triton.language as tl
+from triton import Config
+
+
+@triton.jit
+def act_quant_kernel(x_ptr, y_ptr, s_ptr, BLOCK_SIZE: tl.constexpr):
+    """
+    Quantizes the input tensor `x_ptr` and stores the result in `y_ptr` and the scaling factor in `s_ptr`.
+
+    Args:
+        x_ptr (triton.Pointer): Pointer to the input tensor.
+        y_ptr (triton.Pointer): Pointer to the output tensor where quantized values will be stored.
+        s_ptr (triton.Pointer): Pointer to the output tensor where scaling factors will be stored.
+        BLOCK_SIZE (tl.constexpr): The size of the block to be processed by each program instance.
+
+    Returns:
+        None
+    """
+    pid = tl.program_id(axis=0)
+    offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    x = tl.load(x_ptr + offs).to(tl.float32)
+    s = tl.max(tl.abs(x)) / 448.
+    y = x / s
+    y = y.to(y_ptr.dtype.element_ty)
+    tl.store(y_ptr + offs, y)
+    tl.store(s_ptr + pid, s)
+
+
+def act_quant(x: torch.Tensor, block_size: int = 128) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Quantizes the input tensor `x` using block-wise quantization.
+
+    Args:
+        x (torch.Tensor): The input tensor to be quantized. Must be contiguous and its last dimension size must be divisible by `block_size`.
+        block_size (int, optional): The size of the blocks to be used for quantization. Default is 128.
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
+            - The quantized tensor with dtype `torch.float8_e4m3fn`.
+            - A tensor of scaling factors with dtype `torch.float32`.
+    """
+    assert x.is_contiguous(), 'Input tensor must be contiguous'
+    assert x.size(-1) % block_size == 0, f'Last dimension size must be divisible by block_size (block_size={block_size})'
+    y = torch.empty_like(x, dtype=torch.float8_e4m3fn)
+    s = x.new_empty(*x.size()[:-1], x.size(-1) // block_size, dtype=torch.float32)
+    grid = lambda meta: (triton.cdiv(x.numel(), meta['BLOCK_SIZE']), )
+    act_quant_kernel[grid](x, y, s, BLOCK_SIZE=block_size)
+    return y, s
+
+
+@triton.jit
+def weight_dequant_kernel(x_ptr, s_ptr, y_ptr, M, N, BLOCK_SIZE: tl.constexpr):
+    """
+    Dequantizes weights using the provided scaling factors and stores the result.
+
+    Args:
+        x_ptr (tl.pointer): Pointer to the quantized weights.
+        s_ptr (tl.pointer): Pointer to the scaling factors.
+        y_ptr (tl.pointer): Pointer to the output buffer for dequantized weights.
+        M (int): Number of rows in the weight matrix.
+        N (int): Number of columns in the weight matrix.
+        BLOCK_SIZE (tl.constexpr): Size of the block for tiling.
+
+    Returns:
+        None
+    """
+    pid_m = tl.program_id(axis=0)
+    pid_n = tl.program_id(axis=1)
+    n = tl.cdiv(N, BLOCK_SIZE)
+    offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    offs_n = pid_n * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    offs = offs_m[:, None] * N + offs_n[None, :]
+    mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
+    x = tl.load(x_ptr + offs, mask=mask).to(tl.float32)
+    s = tl.load(s_ptr + pid_m * n + pid_n)
+    y = x * s
+    tl.store(y_ptr + offs, y, mask=mask)
+
+
+def weight_dequant(x: torch.Tensor, s: torch.Tensor, block_size: int = 128) -> torch.Tensor:
+    """
+    Dequantizes the given weight tensor using the provided scale tensor.
+
+    Args:
+        x (torch.Tensor): The quantized weight tensor of shape (M, N).
+        s (torch.Tensor): The scale tensor of shape (M, N).
+        block_size (int, optional): The block size to use for dequantization. Defaults to 128.
+
+    Returns:
+        torch.Tensor: The dequantized weight tensor of the same shape as `x`.
+
+    Raises:
+        AssertionError: If `x` or `s` are not contiguous or if their dimensions are not 2.
+    """
+    assert x.is_contiguous() and s.is_contiguous(), 'Input tensors must be contiguous'
+    assert x.dim() == 2 and s.dim() == 2, 'Input tensors must have 2 dimensions'
+    M, N = x.size()
+    y = torch.empty_like(x, dtype=torch.get_default_dtype())
+    grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE']), triton.cdiv(N, meta['BLOCK_SIZE']))
+    with torch.cuda.device(x.device):
+        weight_dequant_kernel[grid](x, s, y, M, N, BLOCK_SIZE=block_size)
+    return y
+
+
+fp8_gemm_configs = [
+    Config({'BLOCK_SIZE_M': block_m, 'BLOCK_SIZE_N': block_n, 'BLOCK_SIZE_K': 128}, num_stages=num_stages, num_warps=8)
+    for block_m in [16, 32, 64] for block_n in [32, 64, 128] for num_stages in [3, 4, 5, 6]
+]
+
+@triton.autotune(configs=fp8_gemm_configs, key=['N', 'K'])
+@triton.jit
+def fp8_gemm_kernel(a_ptr, b_ptr, c_ptr,
+                    a_s_ptr, b_s_ptr,
+                    M, N: tl.constexpr, K: tl.constexpr,
+                    BLOCK_SIZE_M: tl.constexpr,
+                    BLOCK_SIZE_N: tl.constexpr,
+                    BLOCK_SIZE_K: tl.constexpr):
+    """
+    Performs a matrix multiplication operation on FP8 matrices with scaling factors.
+
+    Args:
+        a_ptr (tl.tensor): Pointer to the first input matrix A.
+        b_ptr (tl.tensor): Pointer to the second input matrix B.
+        c_ptr (tl.tensor): Pointer to the output matrix C.
+        a_s_ptr (tl.tensor): Pointer to the scaling factors for matrix A.
+        b_s_ptr (tl.tensor): Pointer to the scaling factors for matrix B.
+        M (int): Number of rows in matrix A and C.
+        N (tl.constexpr): Number of columns in matrix B and C.
+        K (tl.constexpr): Number of columns in matrix A and rows in matrix B.
+        BLOCK_SIZE_M (tl.constexpr): Block size for the M dimension.
+        BLOCK_SIZE_N (tl.constexpr): Block size for the N dimension.
+        BLOCK_SIZE_K (tl.constexpr): Block size for the K dimension.
+
+    Returns:
+        None
+    """
+    pid_m = tl.program_id(axis=0)
+    pid_n = tl.program_id(axis=1)
+    k = tl.cdiv(K, BLOCK_SIZE_K)
+    offs_m = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
+    offs_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = a_ptr + offs_m[:, None] * K + offs_k[None, :]
+    b_ptrs = b_ptr + offs_n[None, :] * K + offs_k[:, None]
+    a_s_ptrs = a_s_ptr + offs_m * k
+    b_s_ptrs = b_s_ptr + (offs_n // BLOCK_SIZE_K) * k
+
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+    for i in range(k):
+        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - i * BLOCK_SIZE_K, other=0.0)
+        b = tl.load(b_ptrs, mask=offs_k[:, None] < K - i * BLOCK_SIZE_K, other=0.0)
+        a_s = tl.load(a_s_ptrs)
+        b_s = tl.load(b_s_ptrs)
+        accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
+        a_ptrs += BLOCK_SIZE_K
+        b_ptrs += BLOCK_SIZE_K
+        a_s_ptrs += 1
+        b_s_ptrs += 1
+    c = accumulator.to(c_ptr.dtype.element_ty)
+    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = c_ptr + offs_m[:, None] * N + offs_n[None, :]
+    mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
+    tl.store(c_ptrs, c, mask=mask)
+
+
+def fp8_gemm(a: torch.Tensor, a_s: torch.Tensor, b: torch.Tensor, b_s: torch.Tensor):
+    """
+    Perform a matrix multiplication using FP8 precision.
+
+    Args:
+        a (torch.Tensor): The first input matrix, must be contiguous.
+        a_s (torch.Tensor): The scaling factor for the first input matrix, must be contiguous.
+        b (torch.Tensor): The second input matrix, must be contiguous.
+        b_s (torch.Tensor): The scaling factor for the second input matrix, must be contiguous.
+
+    Returns:
+        torch.Tensor: The result of the matrix multiplication.
+    """
+    assert a.is_contiguous() and b.is_contiguous(), 'Input tensors must be contiguous'
+    assert a_s.is_contiguous() and b_s.is_contiguous(), 'Scaling factor tensors must be contiguous'
+    K = a.size(-1)
+    M = a.numel() // K
+    N = b.size(0)
+    c = a.new_empty(*a.size()[:-1], N, dtype=torch.get_default_dtype())
+    grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']), triton.cdiv(N, META['BLOCK_SIZE_N']))
+    fp8_gemm_kernel[grid](a, b, c, a_s, b_s, M, N, K)
+    return c
\ No newline at end of file
--- a/ktransformers/local_chat.py
+++ b/ktransformers/local_chat.py
@@ -28,7 +28,7 @@ from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM
 from ktransformers.models.modeling_deepseek_v3 import DeepseekV3ForCausalLM
 from ktransformers.models.modeling_llama import LlamaForCausalLM
 from ktransformers.models.modeling_mixtral import MixtralForCausalLM
-from ktransformers.util.utils import prefill_and_generate
+from ktransformers.util.utils import prefill_and_generate, get_compute_capability
 from ktransformers.server.config.config import Config
 from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled

@@ -54,7 +54,7 @@ default_optimize_rules = {

 def local_chat(
    model_path: str | None = None,
-    optimize_rule_path: str = None,
+    optimize_config_path: str = None,
    gguf_path: str | None = None,
    max_new_tokens: int = 300,
    cpu_infer: int = Config().cpu_infer,
@@ -94,12 +94,12 @@ def local_chat(
                config, trust_remote_code=True, attn_implementation="flash_attention_2"
            )

-    if optimize_rule_path is None:
+    if optimize_config_path is None:
        if config.architectures[0] in default_optimize_rules:
            print("using default_optimize_rule for", config.architectures[0])
-            optimize_rule_path = default_optimize_rules[config.architectures[0]]
+            optimize_config_path = default_optimize_rules[config.architectures[0]]
        else:
-            optimize_rule_path = input(
+            optimize_config_path = input(
                "please input the path of your rule file(yaml file containing optimize rules):"
            )

@@ -107,7 +107,7 @@ def local_chat(
        gguf_path = input(
            "please input the path of your gguf file(gguf file in the dir containing input gguf file must all belong to current model):"
        )
-    optimize_and_load_gguf(model, optimize_rule_path, gguf_path, config)
+    optimize_and_load_gguf(model, optimize_config_path, gguf_path, config)
    
    try:
            model.generation_config = GenerationConfig.from_pretrained(model_path)
@@ -168,7 +168,7 @@ def local_chat(
            assert Config().long_context_config['max_seq_len'] > input_tensor.shape[1] + max_new_tokens, \
            "please change max_seq_len in  ~/.ktransformers/config.yaml"
        
-        if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or "DeepseekV3ForCausalLM") and flashinfer_enabled:
+        if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or "DeepseekV3ForCausalLM") and flashinfer_enabled and get_compute_capability() >= 8:
            generated = prefill_and_generate(
                model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think,
                use_flashinfer_mla = True, num_heads = config.num_attention_heads, head_dim_ckv = config.kv_lora_rank, head_dim_kpe = config.qk_rope_head_dim, q_head_dim = config.qk_rope_head_dim + config.qk_nope_head_dim

--- a/ktransformers/operators/attention.py
+++ b/ktransformers/operators/attention.py
@@ -16,6 +16,7 @@ from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_ro
 from typing import Optional, Tuple
 from ktransformers.operators.base_operator import BaseInjectedModule
 from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.utils import get_compute_capability
 import logging
 from transformers.configuration_utils import PretrainedConfig
 from transformers.cache_utils import Cache
@@ -48,12 +49,14 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 chunck_size: int = 1000,
+                 absorb_for_prefill: bool = False,
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs)
        self.orig_module.__init__(orig_module.config,
            orig_module.layer_idx)
        self.chunck_size = chunck_size # TODO, generate chunck_size automatically.
        self.mla_wrapper = None
+        self.absorb_for_prefill = absorb_for_prefill

    def get_absorbed(self) -> Tuple[torch.Tensor, torch.Tensor]:
        if not (hasattr(self, 'q_absorb') and hasattr(self, 'out_absorb')):
@@ -242,7 +245,7 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
            q_nope = q_nope.transpose(1, 2) # q_len is 1, no GPU overhead, same below
            q_nope = torch.matmul(q_nope, q_absorb) # batched MM
            q_nope = q_nope.transpose(1, 2)
-            assert q_nope.is_contiguous()
+            #assert q_nope.is_contiguous()
            
            # q_nope [bsz, q_len, self.num_heads, self.kv_lora_rank]
            # q_pe [bsz, q_len, self.num_heads, self.qk_rope_head_dim]
@@ -282,6 +285,7 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
            # out_absorb [self.num_heads, self.v_head_dim, self.kv_lora_rank]
            attn_output = attn_output.transpose(1, 2)
            attn_output = torch.matmul(attn_output, out_absorb.mT)
+            attn_output = attn_output.transpose(1, 2)
            
            attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
            attn_output = self.o_proj(attn_output)
@@ -380,7 +384,7 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
        # q_pe [bsz, q_len, self.num_heads, self.qk_rope_head_dim] k_pe [bsz, q_len, 1, self.qk_rope_head_dim]
        
        # decode
-        if q_len == 1:
+        if q_len == 1 or self.absorb_for_prefill:
            if past_key_value is not None:
                cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
                compressed_kv_with_k_pe, page_table = past_key_value.update(compressed_kv, k_pe, self.layer_idx, cache_kwargs)
@@ -395,29 +399,42 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
            q_nope = q_nope.transpose(1, 2) # q_len is 1, no GPU overhead, same below
            q_nope = torch.matmul(q_nope, q_absorb) # batched MM
            q_nope = q_nope.transpose(1, 2)
-            assert q_nope.is_contiguous()
+            q_nope = q_nope.contiguous()
+            #assert q_nope.is_contiguous()
            
            # q_nope [bsz, q_len, self.num_heads, self.kv_lora_rank]
            # q_pe [bsz, q_len, self.num_heads, self.qk_rope_head_dim]
-            q_nope.squeeze_(1)
-            q_pe.squeeze_(1)
+            q_nope.squeeze_(0)
+            q_pe.squeeze_(0)

            # flash attn doesn't support head_dim bigger than 256, use flashinfer
            if self.mla_wrapper is None:
                self.mla_wrapper = MLAWrapperSingleton.get_instance(self.device, 1, past_key_value.max_pages, use_cuda_graph = True)
-                if self.mla_wrapper.need_plan:
-                    self.mla_wrapper.need_plan = False
+            if self.mla_wrapper.need_plan:
+                self.mla_wrapper.need_plan = False
+                if q_len == 1:
                    self.mla_wrapper.plan(None,None,None,
-                                      position_ids.squeeze(1)+1,
-                                      self.num_heads,
-                                      self.kv_lora_rank,
-                                      self.qk_rope_head_dim,
-                                      past_key_value.page_size,
-                                      self.softmax_scale,
-                                      q_nope.dtype,
-                                      compressed_kv.dtype)
+                                        position_ids.squeeze(1)+1,
+                                        self.num_heads,
+                                        self.kv_lora_rank,
+                                        self.qk_rope_head_dim,
+                                        past_key_value.page_size,
+                                        self.softmax_scale,
+                                        q_nope.dtype,
+                                        compressed_kv.dtype)
+                else:
+                    qo_indptr = torch.tensor([0, q_len], dtype=torch.int32, device=self.device)
+                    kv_len_arr = torch.tensor([position_ids[0, -1].item()+1], dtype=torch.int32, device=self.device)
+                    self.mla_wrapper.plan(qo_indptr,None,None,
+                                        kv_len_arr,
+                                        self.num_heads,
+                                        self.kv_lora_rank,
+                                        self.qk_rope_head_dim,
+                                        past_key_value.page_size,
+                                        self.softmax_scale,
+                                        q_nope.dtype,
+                                        compressed_kv.dtype)
            attn_output = self.mla_wrapper.run(q_nope, q_pe, compressed_kv, k_pe).view(bsz, q_len, self.num_heads, self.kv_lora_rank)
-
            """
            k = (
                torch.cat([compressed_kv, k_pe], dim=-1)
@@ -443,10 +460,11 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
            # out_absorb [self.num_heads, self.v_head_dim, self.kv_lora_rank]
            attn_output = attn_output.transpose(1, 2) # [bsz, self.num_heads, q_len, self.kv_lora_rank]
            attn_output = torch.matmul(attn_output, out_absorb.mT) # [bsz, self.num_heads, q_len, self.v_head_dim]
+            attn_output = attn_output.transpose(1, 2).contiguous() # [bsz, q_len, self.num_heads, self.kv_lora_rank]
            
            attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim) # [bsz, q_len, self.num_heads * self.v_head_dim]
            attn_output = self.o_proj(attn_output)
-
+            
            return attn_output, None, past_key_value
        else:
            if past_key_value is not None:
@@ -571,7 +589,8 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        if os.name == 'nt':
+        if os.name == 'nt' or get_compute_capability()<8:
+            print("for Windows or GPU before ampere, use forward_windows")
            return self.forward_windows(
                hidden_states,
                attention_mask,

--- a/ktransformers/operators/experts.py
+++ b/ktransformers/operators/experts.py
@@ -245,7 +245,16 @@ class KExpertsCPU(KExpertsBase):
        down_type = None

        for key in keys:
-            if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
+            if self.gguf_loader.safetensor_loader is not None:
+                # using a temp ugly way to temprary load the tensor
+                gate = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_exps.weight").numpy()
+                up = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_up_exps.weight").numpy()
+                down = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_down_exps.weight").numpy()
+                gate_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_exps.ggml_type").item()
+                up_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_up_exps.ggml_type").item()
+                down_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_down_exps.ggml_type").item()
+            
+            elif key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
                gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
                up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
                down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
@@ -450,9 +459,9 @@ class KExpertsTorch(KExpertsBase):
                    self.up[i] = w["up"][i, ...].to(device=device, dtype=self.dtype)
                    self.down[i] = w["down"][i, ...].to(device=device, dtype=self.dtype)
        
-        self.up = torch.cat(self.up, dim=0)
-        self.gate = torch.cat(self.gate, dim=0)
-        self.down = torch.cat(self.down, dim=0)
+        self.up = torch.stack(self.up, dim=0)
+        self.gate = torch.stack(self.gate, dim=0)
+        self.down = torch.stack(self.down, dim=0)
        return 

    def unload(self):

--- a/ktransformers/operators/flashinfer_wrapper.py
+++ b/ktransformers/operators/flashinfer_wrapper.py
@@ -9,7 +9,7 @@ flashinfer_enabled = False

 try:
    import flashinfer
-    flashinfer_enabled = False # disabled now, TODO:use new version of flashinfer and enable
+    flashinfer_enabled = True
    print("found flashinfer")
    
 except ImportError:
@@ -122,7 +122,7 @@ class MLAWrapper():
        if kv_indices is None:
            assert self.max_batch_size == 1
            kv_indices = self.kv_indices_buf
-
+        
        self.wrapper.plan(
            qo_indptr,
            kv_indptr,
@@ -132,14 +132,14 @@ class MLAWrapper():
            head_dim_ckv,
            head_dim_kpe,
            page_size,
-            False, # causal is False for decoding
+            True, # causal
            sm_scale,
            q_data_type,
            kv_data_type,
        )

    def run(self, q_nope, q_pe, ckv, k_pe, return_lse = False):
-        return self.wrapper.run(q_nope, q_pe, ckv, k_pe, return_lse)
+        return self.wrapper.run(q_nope, q_pe, ckv, k_pe, return_lse = return_lse)

 class MLAWrapperSingleton():
    wrappers:dict = {}
@@ -179,6 +179,24 @@ class MLAWrapperSingleton():
                sm_scale,
                q_data_type,
                kv_data_type,)
+            wrapper.need_plan = False
+            
+    @classmethod
+    def need_plan_all(cls):
+        for device, wrapper in cls.wrappers.items():
+            wrapper.need_plan = True
+        
+    @classmethod
+    def reset_buffer(cls):
+        for device, wrapper in cls.wrappers.items():
+            wrapper.qo_indptr_buf[1] = 1 # assert max_batch_size=1 here.
+            
+    @classmethod
+    def update_buffer(cls, max_pages):
+        for device, wrapper in cls.wrappers.items():
+            wrapper.kv_indptr_buf[1] = max_pages # assert max_batch_size=1 here.
+            wrapper.kv_indices_buf = torch.arange(0, max_pages, dtype=torch.int32, device=device)
+            wrapper.wrapper._kv_indices_buf = wrapper.kv_indices_buf
            

 if __name__ == "__main__":
@@ -187,8 +205,9 @@ if __name__ == "__main__":
    page_size = 64
    num_heads = 128

-    q_nope = torch.randn((1, num_heads, 512), dtype=torch.bfloat16, device="cuda")
-    q_pe = torch.randn((1, num_heads, 64), dtype=torch.bfloat16, device="cuda")
+    q_len = 10
+    q_nope = torch.randn((q_len, num_heads, 512), dtype=torch.bfloat16, device="cuda")
+    q_pe = torch.randn((q_len, num_heads, 64), dtype=torch.bfloat16, device="cuda")
    ckv = torch.randn((max_pages, page_size, 512), dtype=torch.bfloat16, device="cuda")
    k_pe = torch.randn((max_pages, page_size, 64), dtype=torch.bfloat16, device="cuda")
    
@@ -199,10 +218,10 @@ if __name__ == "__main__":
        max_pages,
    )
    
-    kv_len_arr = torch.tensor([10], dtype=torch.int32, device="cuda")
-
+    kv_len_arr = torch.tensor([q_len], dtype=torch.int32, device="cuda")
+    qo_indptr = torch.tensor([0, q_len], dtype=torch.int32, device="cuda")
    wrapper.plan(
-        None,
+        qo_indptr,
        None,
        None,
        kv_len_arr,
@@ -216,6 +235,7 @@ if __name__ == "__main__":
    )

    attn_output = wrapper.run(q_nope, q_pe, ckv, k_pe)
+    print(attn_output.shape)

    k = (
        torch.cat([ckv, k_pe], dim=-1)
@@ -235,6 +255,7 @@ if __name__ == "__main__":
        False,
        192 ** (-0.5)
    )
+    print(attn_ref.shape)
    
    torch.testing.assert_close(attn_output, attn_ref, rtol=1e-3, atol=1e-3)
    print("test past")
\ No newline at end of file
--- a/ktransformers/operators/gate.py
+++ b/ktransformers/operators/gate.py
@@ -67,7 +67,14 @@ class KMoEGateBase(ABC):

        for key in keys:
            key = ".".join(key.split(".")[:-1])
-            if key + ".ffn_gate_inp.weight" in self.gguf_loader.tensor_info:
+            if self.gguf_loader.safetensor_loader is not None:
+                targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
+                weight = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_inp.weight") 
+                e_score_correction_bias = self.gguf_loader.safetensor_loader.load_tensor(key + ".exp_probs_b.bias")
+                weight_type = weight.dtype
+                e_score_correction_bias_type = e_score_correction_bias.dtype
+                res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias,  "weight_type": weight_type, "e_score_correction_bias_type": e_score_correction_bias_type}
+            elif key + ".ffn_gate_inp.weight" in self.gguf_loader.tensor_info:
                targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
                tensors = self.load_multi(key, targets, device=device)
                weight = tensors[".ffn_gate_inp.weight"]
@@ -116,8 +123,8 @@ class KMoEGate(BaseInjectedModule, KMoEGateBase):
            self.orig_module.e_score_correction_bias = nn.Parameter(w["e_score_correction_bias"])
        else:
            raise ValueError("Invalid weight type")
-        self.orig_module.weight = self.orig_module.weight.to(device)
-        self.orig_module.e_score_correction_bias = self.orig_module.e_score_correction_bias.to(device)
+        self.orig_module.weight = nn.Parameter(self.orig_module.weight.to(device))
+        self.orig_module.e_score_correction_bias = nn.Parameter(self.orig_module.e_score_correction_bias.to(device))

    def unload(self):
        if self.weight is not None:

--- a/ktransformers/operators/linear.py
+++ b/ktransformers/operators/linear.py
@@ -26,6 +26,7 @@ from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.marl
 )
 from ktransformers.operators.base_operator import BaseInjectedModule
 from transformers.configuration_utils import PretrainedConfig
+from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
 from abc import ABC, abstractmethod
 import sys, os
 sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build"))
@@ -78,7 +79,13 @@ class KLinearBase(ABC):
            keys = [self.key]

        for key in keys:
-            if key + ".weight" in self.gguf_loader.tensor_file_map:
+            if self.gguf_loader.safetensor_loader is not None:
+                # using safetensor_loader
+                tensor = self.gguf_loader.safetensor_loader.load_tensor(key+'.weight')
+                weight_scale_inv = self.gguf_loader.safetensor_loader.load_tensor(key+'.weight_scale_inv')
+                return nn.Parameter(tensor), nn.Parameter(weight_scale_inv)
+                
+            elif key + ".weight" in self.gguf_loader.tensor_file_map:
                if key + ".bias" in self.gguf_loader.tensor_file_map:
                    tensors = self.load_multi(key, ["weight", "bias"], device=device)
                    tensor = tensors["weight"]
@@ -169,7 +176,61 @@ class KLinearTorch(KLinearBase):
        if self.has_bias:
            self.bias = None

-
+class KLinearFP8(KLinearBase):
+    # this kernel requires special handling for weight
+    # Please load the weight file downloaded from KVCache.AI
+    marlin_q_w: torch.Tensor
+    marlin_s: torch.Tensor
+    g_idx: torch.Tensor
+    sort_indices: torch.Tensor
+    has_bias: bool
+    weight: torch.Tensor
+    scale_w: torch.Tensor
+    bias: torch.Tensor
+    def __init__(
+        self,
+        key: str,
+        gguf_loader: GGUFLoader,
+        config: PretrainedConfig,
+        orig_module: nn.Module = None,
+        device: str = "cuda",
+        block_size: int = 128,
+        **kwargs,
+    ):
+        super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
+        self.has_bias = False
+        self.dtype = torch.get_default_dtype()
+        self.block_size = block_size
+    
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x.to(self.device)
+        orig_dtype = x.dtype        
+        x_quantized, scale_x = act_quant(x, self.block_size)
+        y = fp8_gemm(x_quantized, scale_x, self.weight, self.weight_scale_inv)
+        return y.to(dtype=orig_dtype)
+    
+    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
+        if device is None: device = self.device
+        if w is None: 
+            w = self.load_weight(device=device) 
+        ### TODO fit weight_inv format
+        if isinstance(w, tuple):
+            self.weight = w[0].to(device)
+            self.weight_scale_inv = w[1].to(device)
+            self.has_bias = False
+        else:
+            raise ValueError("Invalid weight type")
+        self.weight = self.weight.to(device)
+        if self.has_bias:
+            self.bias = self.bias.to(device)
+        
+    def unload(self):
+        if self.weight is not None:
+            self.weight = None
+        if self.has_bias:
+            self.bias = None
+        
+        
 class KLinearMarlin(KLinearBase):
    marlin_q_w: torch.Tensor
    marlin_s: torch.Tensor
@@ -404,7 +465,8 @@ class KLinearCPUInfer(KLinearBase):
 LINEAR_MAP = {
    "KLinearMarlin": KLinearMarlin,
    "KLinearTorch": KLinearTorch,
-    "KLinearCPUInfer": KLinearCPUInfer
+    "KLinearCPUInfer": KLinearCPUInfer,
+    "KLinearFP8": KLinearFP8,
 }

 class KTransformersLinear(BaseInjectedModule, KLinearBase):
@@ -440,10 +502,11 @@ class KTransformersLinear(BaseInjectedModule, KLinearBase):
    def forward(self, x):
        if self.mode == InferenceState.PREFILL:
            assert self.prefill_linear is not None, "cpu linear is not initialized"
-            return self.prefill_linear.forward(x)
+            y = self.prefill_linear.forward(x)
        else:
            assert self.generate_linear is not None, "gpu linear is not initialized"
-            return self.generate_linear.forward(x)
+            y = self.generate_linear.forward(x)
+        return y

    def load(self, w: dict | nn.Parameter | tuple | None = None, mode: InferenceState = InferenceState.GENERATE):
        if not mode:

--- a/ktransformers/operators/models.py
+++ b/ktransformers/operators/models.py
@@ -56,7 +56,7 @@ from ktransformers.models.modeling_deepseek import (
 from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig
 from ktransformers.models.configuration_llama import LlamaConfig
 from ktransformers.operators.base_operator import BaseInjectedModule
-from ktransformers.util.utils import InferenceState
+from ktransformers.util.utils import InferenceState, get_compute_capability
 from ktransformers.util.custom_gguf import GGUFLoader
 from transformers.configuration_utils import PretrainedConfig
 from ktransformers.models.modeling_llama import (
@@ -649,9 +649,14 @@ class KDeepseekV2Model(BaseInjectedModule):
        if per_layer_prefill_flag:
            causal_mask = None
        else:
-            causal_mask = self._update_causal_mask(
-                attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
-            )
+            if os.name == 'nt' or get_compute_capability()<8:
+                print("for Windows or GPU before ampere, use forward_windows")
+                # only use mask in forward windows or can't flash attn
+                causal_mask = self._update_causal_mask(
+                    attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+                )
+            else:
+                causal_mask = None

        # embed positions
        hidden_states = inputs_embeds

--- a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts.yaml
+- match:
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
+  replace:
+    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+- match:
+    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
+    class: torch.nn.Linear  # only match modules matching name and class simultaneously
+  replace:
+    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+      generate_op: "KLinearFP8"
+      prefill_op: "KLinearTorch"
+- match:
+    name: "^model\\.layers\\..*\\.mlp$"
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
+  replace:
+    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+- match:
+    class: ktransformers.models.modeling_deepseek_v3.MoEGate
+  replace:
+    class: ktransformers.operators.gate.KMoEGate
+    kwargs:
+      generate_device: "cuda:0"
+      prefill_device: "cuda:0"
+- match:
+    name: "^model\\.layers\\..*\\.mlp\\.experts$"
+  replace:
+    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
+    kwargs:
+      prefill_device: "cuda"
+      prefill_op: "KExpertsTorch"
+      generate_device: "cpu"
+      generate_op: "KExpertsCPU"
+      out_device: "cuda"
+  recursive: False # don't recursively inject submodules of this module
+- match:
+    name: "^model\\.layers\\..*\\.self_attn$"
+  replace:
+    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+- match:
+    name: "^model$"
+  replace:
+    class: "ktransformers.operators.models.KDeepseekV2Model"
+    kwargs:
+      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
+- match:
+    name: "^model.embed_tokens"
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cpu"
+      prefill_device: "cpu"
\ No newline at end of file
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml
@@ -293,6 +293,7 @@
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
+      absorb_for_prefill: False

 # GPU 1: layers 15–29
 - match:
@@ -302,6 +303,7 @@
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
+      absorb_for_prefill: False

 # GPU 2: layers 30–44
 - match:
@@ -311,6 +313,7 @@
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
+      absorb_for_prefill: False

 # GPU 3: layers 45–60
 - match:
@@ -320,6 +323,7 @@
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
+      absorb_for_prefill: False

 # === Overall Model Replacement with Transfer Map ===


--- a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-fp8-linear-ggml-experts.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-fp8-linear-ggml-experts.yaml
+- match:
+    name: "^model.embed_tokens"
+  replace:
+    class: "default"
+    kwargs:
+        generate_device: "cpu"
+        prefill_device: "cpu"
+
+- match:
+    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
+  replace:
+    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
+    kwargs:
+      generate_device: "cuda:0"
+      prefill_device: "cuda:0"
+- match:
+    name: "^model\\.layers\\.([3456][0-9])\\."
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
+  replace:
+    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
+    kwargs:
+      generate_device: "cuda:1"
+      prefill_device: "cuda:1"
+
+- match:
+    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
+    class: torch.nn.Linear  # only match modules matching name and class simultaneously
+  replace:
+    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
+    kwargs:
+      generate_device: "cuda:0"
+      prefill_device: "cuda:0"
+      generate_op: "KLinearFP8"
+      prefill_op: "KLinearTorch"
+
+- match:
+    name: "^model\\.layers\\.([3456][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
+    class: torch.nn.Linear  # only match modules matching name and class simultaneously
+  replace:
+    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
+    kwargs:
+      generate_device: "cuda:1"
+      prefill_device: "cuda:1"
+      generate_op: "KLinearFP8"
+      prefill_op: "KLinearTorch"
+  
+- match:
+    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp$"
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
+  replace:
+    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
+    kwargs:
+      generate_device: "cuda:0"
+      prefill_device: "cuda:0"
+- match:
+    name: "^model\\.layers\\.([3456][0-9])\\.mlp$"
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
+  replace:
+    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
+    kwargs:
+      generate_device: "cuda:1"
+      prefill_device: "cuda:1"
+
+- match:
+    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$"
+    class: ktransformers.models.modeling_deepseek_v3.MoEGate
+  replace:
+    class: ktransformers.operators.gate.KMoEGate
+    kwargs:
+      generate_device: "cuda:0"
+      prefill_device: "cuda:0"
+- match:
+    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$"
+    class: ktransformers.models.modeling_deepseek_v3.MoEGate
+  replace:
+    class: ktransformers.operators.gate.KMoEGate     # mlp module with custom forward function
+    kwargs:
+      generate_device: "cuda:1"
+      prefill_device: "cuda:1"
+
+- match:
+    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
+  replace:
+    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
+    kwargs:
+      prefill_device: "cuda:0"
+      prefill_op: "KExpertsTorch"
+      generate_device: "cpu"
+      generate_op:  "KExpertsCPU"
+      out_device: "cuda:0"
+  recursive: False # don't recursively inject submodules of this module
+
+- match:
+    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.experts$"
+  replace:
+    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
+    kwargs:
+      prefill_device: "cuda:1"
+      prefill_op: "KExpertsTorch"
+      generate_device: "cpu"
+      generate_op:  "KExpertsCPU"
+      out_device: "cuda:1"
+  recursive: False # don't recursively inject submodules of this module
+
+- match:
+    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.self_attn$"
+  replace:
+    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
+    kwargs:
+      generate_device: "cuda:0"
+      prefill_device: "cuda:0"
+      absorb_for_prefill: False # change this to True to enable long context(prefill may slower).
+
+- match:
+    name: "^model\\.layers\\.([3456][0-9])\\.self_attn$"
+  replace:
+    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
+    kwargs:
+      generate_device: "cuda:1"
+      prefill_device: "cuda:1"
+      absorb_for_prefill: False # change this to True to enable long context(prefill may slower).
+
+- match:
+    name: "^model$"
+  replace:
+    class: "ktransformers.operators.models.KDeepseekV2Model"
+    kwargs:
+      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
+      transfer_map: 
+        30: "cuda:1"
+
+- match:
+    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cuda:0"
+      prefill_device: "cuda:0"
+
+- match:
+    name: "^lm_head"
+    class: torch.nn.Linear
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cuda:1"
+      prefill_device: "cuda:1"
+
+
+- match:
+    name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)"
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cuda:1"
+      prefill_device: "cuda:1"
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-marlin.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-marlin.yaml
@@ -168,5 +168,5 @@
  replace:
    class: "default"
    kwargs:
-      generate_device: "cuda:0"
-      prefill_device: "cuda:0"
+      generate_device: "cuda:1"
+      prefill_device: "cuda:1"
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml
@@ -60,6 +60,7 @@
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
+      absorb_for_prefill: False # change this to True to enable long context(prefill may slower).
 - match:
    name: "^model$"
  replace:

--- a/ktransformers/server/api/ollama/completions.py
+++ b/ktransformers/server/api/ollama/completions.py
@@ -12,8 +12,8 @@ from ktransformers.server.config.config import Config
 from ktransformers.server.utils.create_interface import get_interface
 from ktransformers.server.schemas.assistants.streaming import check_link_response
 from ktransformers.server.backend.base import BackendInterfaceBase
-router = APIRouter(prefix='/api')

+router = APIRouter(prefix='/api')

 # https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-completion
 class OllamaGenerateCompletionRequest(BaseModel):
@@ -40,61 +40,121 @@ class OllamaGenerateCompletionRequest(BaseModel):
    keep_alive: Optional[str] = Field(
        "5m", description="Controls how long the model will stay loaded into memory following the request.")

-
 class OllamaGenerationStreamResponse(BaseModel):
    model: str
    created_at: str
    response: str
    done: bool = Field(...)

-
 class OllamaGenerationResponse(BaseModel):
    pass

-
 @router.post("/generate", tags=['ollama'])
 async def generate(request: Request, input: OllamaGenerateCompletionRequest):
    id = str(uuid4())
-
    interface: BackendInterfaceBase = get_interface()
    print(f'COMPLETION INPUT:----\n{input.prompt}\n----')
-
    config = Config()

    if input.stream:
        async def inner():
-            async for token in interface.inference(input.prompt,id): 
-                d = OllamaGenerationStreamResponse(model=config.model_name,created_at=str(datetime.now()),response=token,done=False)
-                yield d.model_dump_json()+'\n' 
-                # d = {'model':config.model_name,'created_at':"", 'response':token,'done':False}
-                # yield f"{json.dumps(d)}\n"
-            # d = {'model':config.model_name,'created_at':"", 'response':'','done':True}
-            # yield f"{json.dumps(d)}\n"
-            d = OllamaGenerationStreamResponse(model=config.model_name,created_at=str(datetime.now()),response='',done=True)   
-            yield d.model_dump_json()+'\n'
-        return check_link_response(request,inner())
+            async for token in interface.inference(input.prompt, id):
+                d = OllamaGenerationStreamResponse(
+                    model=config.model_name,
+                    created_at=str(datetime.now()),
+                    response=token,
+                    done=False
+                )
+                yield d.model_dump_json() + '\n'
+            d = OllamaGenerationStreamResponse(
+                model=config.model_name,
+                created_at=str(datetime.now()),
+                response='',
+                done=True
+            )
+            yield d.model_dump_json() + '\n'
+        return check_link_response(request, inner())
    else:
        raise NotImplementedError

 # https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-chat-completion
-
+class OllamaChatCompletionMessage(BaseModel):
+    role: str
+    content: str

 class OllamaChatCompletionRequest(BaseModel):
-    pass
-
+    model: str = Field(..., description="The model name, which is required.")
+    messages: List[OllamaChatCompletionMessage] = Field(
+        ..., description="A list of messages to generate a response for.")
+    stream: bool = Field(True, description="If true, the response will be streamed.")

 class OllamaChatCompletionStreamResponse(BaseModel):
-    pass
+    model: str
+    created_at: str
+    message: dict
+    done: bool = Field(...)
+    total_duration: Optional[int] = Field(None, description="Total time spent in nanoseconds")
+    load_duration: Optional[int] = Field(None, description="Time spent loading model in nanoseconds")
+    prompt_eval_count: Optional[int] = Field(None, description="Number of tokens in prompt")
+    prompt_eval_duration: Optional[int] = Field(None, description="Time spent evaluating prompt in nanoseconds")
+    eval_count: Optional[int] = Field(None, description="Number of tokens generated")
+    eval_duration: Optional[int] = Field(None, description="Time spent generating response in nanoseconds")
+


 class OllamaChatCompletionResponse(BaseModel):
    pass

-
 @router.post("/chat", tags=['ollama'])
 async def chat(request: Request, input: OllamaChatCompletionRequest):
-    raise NotImplementedError
+    id = str(uuid4())
+    interface: BackendInterfaceBase = get_interface()
+    config = Config()

+    # 将消息转换为提示字符串
+    prompt = ""
+    for msg in input.messages:
+        prompt += f"{msg.role}: {msg.content}\n"
+    prompt += "assistant:"
+
+    if input.stream:
+        async def inner():
+            start_time = time()  # 记录开始时间（秒）
+            eval_count = 0  # 统计生成的 token 数量
+            tokens = []
+
+            async for token in interface.inference(prompt, id):
+                d = OllamaChatCompletionStreamResponse(
+                    model=config.model_name,
+                    created_at=str(datetime.now()),
+                    message={"role": "assistant", "content": token}, 
+                    done=False
+                )
+                yield d.model_dump_json() + '\n'
+            # 计算性能数据
+            end_time = time()
+            total_duration = int((end_time - start_time) * 1_000_000_000)  # 转换为纳秒
+            prompt_eval_count = len(prompt.split())  # 简单估算提示词数量
+            eval_duration = total_duration  # 假设全部时间用于生成（简化）
+            prompt_eval_duration = 0  # 假设无单独提示评估时间
+            load_duration = 0  # 假设加载时间未知
+
+            d = OllamaChatCompletionStreamResponse(
+                model=config.model_name,
+                created_at=str(datetime.now()),
+                message={},
+                done=True,
+                total_duration=total_duration,
+                load_duration=load_duration,
+                prompt_eval_count=prompt_eval_count,
+                prompt_eval_duration=prompt_eval_duration,
+                eval_count=eval_count,
+                eval_duration=eval_duration
+            )
+            yield d.model_dump_json() + '\n'
+        return check_link_response(request, inner())
+    else:
+        raise NotImplementedError("Non-streaming chat is not implemented.")

 # https://github.com/ollama/ollama/blob/main/docs/api.md#list-local-models
 class OllamaModel(BaseModel):
@@ -103,9 +163,8 @@ class OllamaModel(BaseModel):
    size: int
    # TODO: fill the rest correctly

-
 # mock ollama
-@router.get("/tags",tags=['ollama'])
+@router.get("/tags", tags=['ollama'])
 async def tags():
    config = Config()
    # TODO: fill this correctly, although it does not effect Tabby
@@ -138,25 +197,21 @@ class OllamaShowResponse(BaseModel):
    class Config:
        protected_namespaces = ()

-
-
 @router.post("/show", tags=['ollama'])
 async def show(request: Request, input: OllamaShowRequest):
    config = Config()
    # TODO: Add more info in config to return, although it does not effect Tabby
    return OllamaShowResponse(
-        modelfile = "# Modelfile generated by ...",
-        parameters = " ",
-        template = " ",
-        details = OllamaShowDetial(
-            parent_model = " ",
-            format = "gguf",
-            family = " ",
-            families = [
-                " " 
-            ],
-            parameter_size = " ",
-            quantization_level = " "
+        modelfile="# Modelfile generated by ...",
+        parameters=" ",
+        template=" ",
+        details=OllamaShowDetial(
+            parent_model=" ",
+            format="gguf",
+            family=" ",
+            families=[" "],
+            parameter_size=" ",
+            quantization_level=" "
        ),
-        model_info = OllamaModelInfo()
+        model_info=OllamaModelInfo()
    )
\ No newline at end of file
--- a/ktransformers/server/api/openai/endpoints/chat.py
+++ b/ktransformers/server/api/openai/endpoints/chat.py
@@ -25,6 +25,9 @@ async def chat_completion(request:Request,create:ChatCompletionCreate):

    input_message = [json.loads(m.model_dump_json()) for m in create.messages]

+    if Config().api_key != '':
+        assert request.headers.get('Authorization', '').split()[-1] == Config().api_key
+
    if create.stream:
        async def inner():
            chunk = ChatCompletionChunk(id=id,object='chat.completion.chunk',created=int(time()))