Merge pull request #663 from kvcache-ai/develop-0.2.2

[release] Release 0.2.2rc.

Merge pull request #663 from kvcache-ai/develop-0.2.2
[release] Release 0.2.2rc.
8333a4d8 · Azure · GitHub · 7b2a6690 · c6e4e1c3 · 8333a4d8
Unverified Commit 8333a4d8 authored Feb 25, 2025 by Azure Committed by GitHub Feb 25, 2025
14 changed files
--- a/ktransformers/operators/models.py
+++ b/ktransformers/operators/models.py
@@ -56,7 +56,7 @@ from ktransformers.models.modeling_deepseek import (
 from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig
 from ktransformers.models.configuration_llama import LlamaConfig
 from ktransformers.operators.base_operator import BaseInjectedModule
-from ktransformers.util.utils import InferenceState
+from ktransformers.util.utils import InferenceState, get_compute_capability
 from ktransformers.util.custom_gguf import GGUFLoader
 from transformers.configuration_utils import PretrainedConfig
 from ktransformers.models.modeling_llama import (
@@ -649,9 +649,14 @@ class KDeepseekV2Model(BaseInjectedModule):
        if per_layer_prefill_flag:
            causal_mask = None
        else:
-            causal_mask = self._update_causal_mask(
-                attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
-            )
+            if os.name == 'nt' or get_compute_capability()<8:
+                print("for Windows or GPU before ampere, use forward_windows")
+                # only use mask in forward windows or can't flash attn
+                causal_mask = self._update_causal_mask(
+                    attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+                )
+            else:
+                causal_mask = None

        # embed positions
        hidden_states = inputs_embeds

--- a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts.yaml
+- match:
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
+  replace:
+    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+- match:
+    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
+    class: torch.nn.Linear  # only match modules matching name and class simultaneously
+  replace:
+    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+      generate_op: "KLinearFP8"
+      prefill_op: "KLinearTorch"
+- match:
+    name: "^model\\.layers\\..*\\.mlp$"
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
+  replace:
+    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+- match:
+    class: ktransformers.models.modeling_deepseek_v3.MoEGate
+  replace:
+    class: ktransformers.operators.gate.KMoEGate
+    kwargs:
+      generate_device: "cuda:0"
+      prefill_device: "cuda:0"
+- match:
+    name: "^model\\.layers\\..*\\.mlp\\.experts$"
+  replace:
+    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE kernel with expert parallelism
+    kwargs:
+      prefill_device: "cuda"
+      prefill_op: "KExpertsTorch"
+      generate_device: "cpu"
+      generate_op: "KExpertsCPU"
+      out_device: "cuda"
+  recursive: False # don't recursively inject submodules of this module
+- match:
+    name: "^model\\.layers\\..*\\.self_attn$"
+  replace:
+    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+- match:
+    name: "^model$"
+  replace:
+    class: "ktransformers.operators.models.KDeepseekV2Model"
+    kwargs:
+      per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
+- match:
+    name: "^model.embed_tokens"
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cpu"
+      prefill_device: "cpu"
\ No newline at end of file
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml
@@ -293,6 +293,7 @@
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
+      absorb_for_prefill: False

 # GPU 1: layers 15–29
 - match:
@@ -302,6 +303,7 @@
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
+      absorb_for_prefill: False

 # GPU 2: layers 30–44
 - match:
@@ -311,6 +313,7 @@
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
+      absorb_for_prefill: False

 # GPU 3: layers 45–60
 - match:
@@ -320,6 +323,7 @@
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
+      absorb_for_prefill: False

 # === Overall Model Replacement with Transfer Map ===


--- a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-fp8-linear-ggml-experts.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-fp8-linear-ggml-experts.yaml
+- match:
+    name: "^model.embed_tokens"
+  replace:
+    class: "default"
+    kwargs:
+        generate_device: "cpu"
+        prefill_device: "cpu"
+
+- match:
+    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
+  replace:
+    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
+    kwargs:
+      generate_device: "cuda:0"
+      prefill_device: "cuda:0"
+- match:
+    name: "^model\\.layers\\.([3456][0-9])\\."
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
+  replace:
+    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
+    kwargs:
+      generate_device: "cuda:1"
+      prefill_device: "cuda:1"
+
+- match:
+    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
+    class: torch.nn.Linear  # only match modules matching name and class simultaneously
+  replace:
+    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
+    kwargs:
+      generate_device: "cuda:0"
+      prefill_device: "cuda:0"
+      generate_op: "KLinearFP8"
+      prefill_op: "KLinearTorch"
+
+- match:
+    name: "^model\\.layers\\.([3456][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
+    class: torch.nn.Linear  # only match modules matching name and class simultaneously
+  replace:
+    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
+    kwargs:
+      generate_device: "cuda:1"
+      prefill_device: "cuda:1"
+      generate_op: "KLinearFP8"
+      prefill_op: "KLinearTorch"
+  
+- match:
+    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp$"
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
+  replace:
+    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
+    kwargs:
+      generate_device: "cuda:0"
+      prefill_device: "cuda:0"
+- match:
+    name: "^model\\.layers\\.([3456][0-9])\\.mlp$"
+    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
+  replace:
+    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
+    kwargs:
+      generate_device: "cuda:1"
+      prefill_device: "cuda:1"
+
+- match:
+    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$"
+    class: ktransformers.models.modeling_deepseek_v3.MoEGate
+  replace:
+    class: ktransformers.operators.gate.KMoEGate
+    kwargs:
+      generate_device: "cuda:0"
+      prefill_device: "cuda:0"
+- match:
+    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$"
+    class: ktransformers.models.modeling_deepseek_v3.MoEGate
+  replace:
+    class: ktransformers.operators.gate.KMoEGate     # mlp module with custom forward function
+    kwargs:
+      generate_device: "cuda:1"
+      prefill_device: "cuda:1"
+
+- match:
+    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
+  replace:
+    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE kernel with expert parallelism
+    kwargs:
+      prefill_device: "cuda:0"
+      prefill_op: "KExpertsTorch"
+      generate_device: "cpu"
+      generate_op:  "KExpertsCPU"
+      out_device: "cuda:0"
+  recursive: False # don't recursively inject submodules of this module
+
+- match:
+    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.experts$"
+  replace:
+    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE kernel with expert parallelism
+    kwargs:
+      prefill_device: "cuda:1"
+      prefill_op: "KExpertsTorch"
+      generate_device: "cpu"
+      generate_op:  "KExpertsCPU"
+      out_device: "cuda:1"
+  recursive: False # don't recursively inject submodules of this module
+
+- match:
+    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.self_attn$"
+  replace:
+    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
+    kwargs:
+      generate_device: "cuda:0"
+      prefill_device: "cuda:0"
+      absorb_for_prefill: False # set this to True to enable long context (prefill may be slower).
+
+- match:
+    name: "^model\\.layers\\.([3456][0-9])\\.self_attn$"
+  replace:
+    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
+    kwargs:
+      generate_device: "cuda:1"
+      prefill_device: "cuda:1"
+      absorb_for_prefill: False # set this to True to enable long context (prefill may be slower).
+
+- match:
+    name: "^model$"
+  replace:
+    class: "ktransformers.operators.models.KDeepseekV2Model"
+    kwargs:
+      per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
+      transfer_map: 
+        30: "cuda:1"
+
+- match:
+    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cuda:0"
+      prefill_device: "cuda:0"
+
+- match:
+    name: "^lm_head"
+    class: torch.nn.Linear
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cuda:1"
+      prefill_device: "cuda:1"
+
+
+- match:
+    name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)"
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cuda:1"
+      prefill_device: "cuda:1"
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml
@@ -60,6 +60,7 @@
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
+      absorb_for_prefill: False # set this to True to enable long context (prefill may be slower).
 - match:
    name: "^model$"
  replace:

--- a/ktransformers/server/backend/interfaces/ktransformers.py
+++ b/ktransformers/server/backend/interfaces/ktransformers.py
@@ -14,6 +14,7 @@ from ktransformers.models.custom_cache import StaticCache
 from ktransformers.util.cuda_graph_runner import CUDAGraphRunner
 from ktransformers.local_chat import custom_models, default_optimize_rules
 from ktransformers.util.utils import get_device
+from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled, MLAWrapperSingleton


 warm_uped = False
@@ -35,9 +36,9 @@ class KTransformersInterface(TransformersInterface):
        with torch.device("meta"):
            self.model = custom_models[config.architectures[0]](config)
        if default_args.optimize_config_path is None:
-            optimize_rule_path = default_optimize_rules[config.architectures[0]]
+            optimize_config_path = default_optimize_rules[config.architectures[0]]
        else:
-            optimize_rule_path = args.optimize_config_path
+            optimize_config_path = args.optimize_config_path

        # print(optimize_config)

@@ -47,7 +48,7 @@ class KTransformersInterface(TransformersInterface):
                "please input the path of your gguf file(gguf file in the dir containing input gguf file must all"
                " belong to current model):"
            )
-        optimize_and_load_gguf(self.model, optimize_rule_path, gguf_path, config)
+        optimize_and_load_gguf(self.model, optimize_config_path, gguf_path, config)

        self.device_map = self.model.gguf_loader.tensor_device_map
        # logger.info(f"{args.model_name} loaded from {args.model_dir} to {self.device_map}")
@@ -186,6 +187,8 @@ class KTransformersInterface(TransformersInterface):
            input_ids = input_ids.to("cpu")
        inputs_embeds = self.model.model.embed_tokens(input_ids).to(device)
        torch.cuda.set_device(device)
+        if flashinfer_enabled:
+            MLAWrapperSingleton.need_plan_all()
        if self.use_static_cache:
            logits = self.model(
                inputs_embeds=inputs_embeds,
@@ -198,6 +201,8 @@ class KTransformersInterface(TransformersInterface):
        else:
            logits = self.model(inputs_embeds=inputs_embeds, return_dict=False)[0]

+        if flashinfer_enabled:
+            MLAWrapperSingleton.reset_buffer()
        self.prepare_logits_wrapper(input_ids, device)
        next_token = self.logits_to_token(logits[0, -1, :])
        yield self.append_new_tokens(next_token)

--- a/ktransformers/server/backend/interfaces/transformers.py
+++ b/ktransformers/server/backend/interfaces/transformers.py
@@ -333,14 +333,14 @@ class TransformersInterface(BackendInterfaceBase):
        for i in range(1, self.args.max_new_tokens):
            
            with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True):
-                if i > 1 and flashinfer_enabled:
+                if flashinfer_enabled:
                    MLAWrapperSingleton.plan_all(None,None,None,self.active_cache_position.to(torch.int32)+1,
                                             num_heads=self.model.config.num_attention_heads, head_dim_ckv=self.model.config.kv_lora_rank, 
                                             head_dim_kpe=self.model.config.qk_rope_head_dim, page_size=self.cache.page_size,
                                             sm_scale=(self.model.config.qk_rope_head_dim + self.model.config.qk_nope_head_dim) ** (-0.5), q_data_type=torch.bfloat16, kv_data_type=torch.bfloat16)
                next_token = self.decode_one_tokens()
                self.profiler.inc("decode")
-                if next_token == self.tokenizer.eos_token_id:
+                if next_token == self.tokenizer.eos_token_id or "<|im_end|>" == self.tokenizer.decode(next_token):
                    assert self.args.batch_size == 1
                    break
                yield self.append_new_tokens(next_token)

--- a/ktransformers/tests/mmlu_pro_test.py
+++ b/ktransformers/tests/mmlu_pro_test.py
@@ -173,8 +173,8 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="API Generate Tester")
    parser.add_argument("--concurrent", type=int, default=1000, help="Number of concurrent evaluations")
    parser.add_argument("--file", type=str, default="TIGER-Lab/MMLU-Pro", help="Path to the mmlu.jsonl file")
-    parser.add_argument("--result", type=str, default="./mmlu_pro.json", help="Path to save the result JSON file")
-    parser.add_argument("--log", type=str, default="./mmlu_pro.log", help="Path to save the log file")
+    parser.add_argument("--result", type=str, default="./mmlu_result_pro.json", help="Path to save the result JSON file")
+    parser.add_argument("--log", type=str, default="./mmlu_result_pro.log", help="Path to save the log file")
    parser.add_argument("--model", type=str, default="Pro/deepseek-ai/DeepSeek-V3", help="Model name or path")
    parser.add_argument("--api_url", type=str, default="http://localhost:15488/v1/chat/completions", help="API URL")
    # parser.add_argument("--api_url", type=str, default="https://api.siliconflow.cn/v1/chat/completions", help="API URL")

--- a/ktransformers/tests/triton_fp8gemm_test.py
+++ b/ktransformers/tests/triton_fp8gemm_test.py
+import torch
+import torch.nn.functional as F
+from typing import Optional
+import pytest
+from typing import Tuple, Optional, Literal
+import time
+# use dir path
+import os
+import sys
+sys.path.insert(0, "/home/azure/ktransformers")
+print(sys.path)
+from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
+from safetensors import safe_open
+
+world_size = 1
+rank = 0
+block_size = 128
+gemm_impl: Literal["bf16", "fp8"] = "bf16"
+# Assuming `fp8_gemm`, `act_quant`, `weight_dequant` and other relevant functions are already defined
+
+def test_fp8_gemm_vs_torch_matmul():
+    """Print fp8_gemm on quantized random inputs next to a bf16 torch.matmul.
+
+    Nothing is asserted: the shapes and values of both results are printed
+    so they can be compared by eye.
+    """
+    rows, inner, cols = 64, 128, 256
+    activations = torch.randn(rows, inner, dtype=torch.bfloat16, device='cuda')
+    weights = torch.randn(cols, inner, dtype=torch.bfloat16, device='cuda')
+
+    # Both operands go through act_quant with the module-level block_size.
+    act_q, act_scale = act_quant(activations, block_size)
+    w_q, w_scale = act_quant(weights, block_size)
+
+    # Hand the kernel contiguous copies of every quantized tensor and scale.
+    fp8_out = fp8_gemm(
+        act_q.contiguous(),
+        act_scale.contiguous(),
+        w_q.contiguous(),
+        w_scale.contiguous(),
+    )
+
+    # Reference product on the original (unquantized) tensors.
+    reference = torch.matmul(activations, weights.T)
+
+    print(f'result_torch_matmul: {reference.shape}')
+    print(f'result_fp8_gemm: {fp8_out.shape}')
+
+    print(f"result_fp8_gemm:\n {fp8_out}")
+    print(f"result_torch_matmul:\n {reference}")
+    
+def test_fp8_gemm_vs_torch_matmul_load():
+    """Print fp8_gemm on a checkpoint weight next to matmul on its dequantized form.
+
+    Loads ``model.layers.0.mlp.down_proj`` (weight and ``weight_scale_inv``)
+    from a hard-coded safetensors shard onto device 0. Nothing is asserted.
+    """
+    checkpoint = "/mnt/data/model/DeepSeek-V3/model-00001-of-000163.safetensors"
+    with safe_open(checkpoint, framework="pt", device=0) as shard:
+        weight = shard.get_tensor("model.layers.0.mlp.down_proj.weight")
+        scale = shard.get_tensor("model.layers.0.mlp.down_proj.weight_scale_inv")
+
+    dequantized = weight_dequant(weight, scale)
+    print(f"weight_dequantized: {dequantized.shape}")
+
+    # Only the inner (second) dimension is needed to size the random input.
+    _, inner = dequantized.shape
+    tokens = 64
+    x = torch.randn(2, tokens, inner, dtype=torch.bfloat16, device='cuda')
+    x_q, x_scale = act_quant(x, block_size)
+
+    # Quantized activations against the weight exactly as stored in the shard.
+    fp8_out = fp8_gemm(x_q, x_scale, weight, scale)
+    print(f"result_fp8_gemm:\n {fp8_out}")
+    print(f"dtype {fp8_out.dtype}")
+
+    # Reference: original activations against the dequantized weight in bf16.
+    reference = torch.matmul(x, dequantized.to(torch.bfloat16).T)
+    print(f"result_torch_matmul:\n {reference}")
+
+def test_fp8_gemm_tplops():
+    """Benchmark act_quant + fp8_gemm on a checkpoint weight and print TFLOPS.
+
+    Loads ``model.layers.0.mlp.down_proj`` from a hard-coded safetensors
+    shard, runs two warm-up iterations, then times ``iterations`` rounds of
+    activation quantization followed by the FP8 GEMM. The timed region
+    includes ``act_quant``, so the reported figure is for the whole
+    quantize-and-multiply step, while the FLOP count covers the GEMM only.
+    """
+    file_path = "/mnt/data/model/DeepSeek-V3/model-00001-of-000163.safetensors"
+    with safe_open(file_path, framework="pt", device=0) as f:
+        weight = f.get_tensor("model.layers.0.mlp.down_proj.weight")
+        scale = f.get_tensor("model.layers.0.mlp.down_proj.weight_scale_inv")
+
+    # The dequantized weight is only used to read the (N, K) shape.
+    weight_dequantized = weight_dequant(weight, scale)
+    print(f"weight_dequantized: {weight_dequantized.shape}")
+    N, K = weight_dequantized.shape
+    M = 6400
+    x = torch.randn(2, M, K, dtype=torch.bfloat16, device='cuda')
+
+    iterations = 10
+    # x has a leading batch of 2, so each GEMM multiplies x.numel() // K rows
+    # (2 * M), not M; counting only M would halve the reported throughput.
+    # NOTE(review): assumes fp8_gemm processes every leading row of x - confirm.
+    rows = x.numel() // K
+    flops_per_gemm = 2 * rows * N * K
+    total_flops = iterations * flops_per_gemm
+
+    # Warm-up so one-time kernel compilation is kept out of the measurement.
+    for _ in range(2):
+        x_quantized, scale_x = act_quant(x, block_size)
+        result_fp8_gemm = fp8_gemm(x_quantized, scale_x, weight, scale)
+
+    # Drain pending GPU work *before* starting the clock, otherwise the
+    # warm-up kernels still in flight are billed to the timed loop.
+    torch.cuda.synchronize()
+    t0 = time.perf_counter()
+    for _ in range(iterations):
+        x_quantized, scale_x = act_quant(x, block_size)
+        result_fp8_gemm = fp8_gemm(x_quantized, scale_x, weight, scale)
+    torch.cuda.synchronize()
+    t1 = time.perf_counter()
+
+    total_time = t1 - t0
+    tflops = total_flops / total_time / 1e12
+    print(f"total_time: {total_time}")
+    print(f"tflops: {tflops}")
+    
+
+    
+    
+if __name__ == "__main__":
+    # Script entry point: run each check once, in definition order.
+    for check in (
+        test_fp8_gemm_vs_torch_matmul,
+        test_fp8_gemm_vs_torch_matmul_load,
+        test_fp8_gemm_tplops,
+    ):
+        check()
+    
\ No newline at end of file
--- a/ktransformers/util/custom_gguf.py
+++ b/ktransformers/util/custom_gguf.py
@@ -25,6 +25,7 @@ import os
 from enum import IntEnum
 import torch
 import KTransformersOps
+from .custom_loader import SafeTensorLoader
 import ctypes

 class GGMLQuantizationType(IntEnum):
@@ -128,6 +129,7 @@ GGML_BLOCK_SIZES = {
    "Q5_K": 2 + 2 + 12 + 256 // 8 + 256 // 2,
    "Q6_K": 256 // 2 + 256 // 4 + 256 // 16 + 2,
    "IQ4_XS": 2 + 2 + 256 // 2 + 256 // 64,
+    "FP8": 1,
 }

 GGML_ELEMENTS_PER_BLOCK = {
@@ -143,6 +145,7 @@ GGML_ELEMENTS_PER_BLOCK = {
    "Q5_K": 256,
    "Q6_K": 256,
    "IQ4_XS": 256,
+    "FP8": 1,
 }

 DATA_TYPES = {
@@ -159,6 +162,7 @@ DATA_TYPES = {
    "uint64": 10,
    "int64": 11,
    "float64": 12,
+    "FP8": 13,
 }

 class GGUFLoader:
@@ -166,12 +170,15 @@ class GGUFLoader:
    gguf_path: str
    tensor_file_map: dict # {tensor_name: tensor_file_path}
    gguf_file_meta: dict
+    safetensor_loader: SafeTensorLoader
    def __init__(self, gguf_path: str):
        # Check dir exist
        if not os.path.exists(gguf_path):
            raise FileNotFoundError(f"GGUF dir not found: {gguf_path}")
        if os.path.isfile(gguf_path):
            gguf_path = os.path.dirname(gguf_path)
+
+        self.safetensor_loader = None
        
        self.tensor_info = {}
        self.gguf_path = gguf_path
@@ -179,7 +186,13 @@ class GGUFLoader:
        self.file_data_map = {}
        self.gguf_file_meta = {}
        self.tensor_device_map = {}
-        
+
+        # I know this is ugly, but I don't want to change the original code too much
+        # TODO: merge gguf load and other loads.
+        safetensor_loader = SafeTensorLoader(gguf_path)
+        if safetensor_loader.tensor_file_map:
+            self.safetensor_loader = safetensor_loader
+            return
        # Walk through all the .gguf files in the directory
        found_gguf = False
        for root, dirs, files in os.walk(gguf_path):
@@ -286,6 +299,13 @@ class GGUFLoader:
        itemsize = int(np.empty([], dtype = item_type).itemsize)
        return mmap_data[offset : offset + itemsize * item_count]
    
+    def get_undequanted_tensor_and_ggml_type(self, name):
+        """Return the memory-mapped data of tensor `name` and its GGML type.
+
+        The data from `get_mmap_tensor` is wrapped with `torch.from_numpy`
+        and returned without any dequantization step, together with the
+        ``"ggml_type"`` entry recorded for the tensor in `tensor_info`.
+
+        Raises:
+            KeyError: if `name` has no entry in `tensor_info`.
+        """
+        info = self.tensor_info[name]
+        mapped = self.get_mmap_tensor(name)
+        kind = info["ggml_type"]
+        return torch.from_numpy(mapped), kind
+
    def load_expert_tensor(self, name, data, expert_id, elements_per_expert, device = "cuda", target_dtype = torch.get_default_dtype())->torch.Tensor:
        t = self.tensor_info[name]
        if device.lower() == "cpu":
@@ -420,6 +440,9 @@ def read_value(f, data_type):
        elem_type, count = struct.unpack("<IQ", f.read(4 + 8))
        return [read_value(f, elem_type) for _ in range(count)]

+    elif data_type == DATA_TYPES["FP8"]:
+        return struct.unpack("<B", f.read(1))[0]
+
    else:
        raise NotImplementedError(f"Data type {data_type} not implemented")


--- a/ktransformers/util/custom_loader.py
+++ b/ktransformers/util/custom_loader.py
+import struct
+import warnings
+import numpy as np
+import re
+import numpy.typing as npt
+from typing import Sequence
+import os
+from enum import IntEnum
+import torch
+import KTransformersOps
+from safetensors import safe_open
+from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
+from safetensors.torch import save_file
+
+class SafeTensorLoader:
+    """Index the ``.safetensors`` files under a directory and load tensors by key.
+
+    Attributes:
+        tensor_file_map: tensor key -> name of the file that contains it.
+        tensor_type_map: reserved; not populated by this class.
+        file_handle_map: file name -> open ``safe_open`` handle.
+
+    All three maps are created per instance. As class-level dicts they were
+    shared by every loader, so a loader built for a directory without any
+    safetensors files still reported the tensors indexed by an earlier one.
+    """
+
+    tensor_file_map: dict
+    tensor_type_map: dict
+    file_handle_map: dict
+
+    def __init__(self, file_path: str):
+        """Index every safetensors file found under `file_path`.
+
+        Args:
+            file_path: a directory, or a file whose parent directory is scanned.
+
+        Raises:
+            FileNotFoundError: if `file_path` does not exist.
+        """
+        self.tensor_file_map = {}
+        self.tensor_type_map = {}
+        self.file_handle_map = {}
+        self.__load_tensor_file_map(file_path)
+
+    def __load_tensor_file_map(self, file_path: str):
+        """Walk the directory, open each ``.safetensors`` file and record its keys."""
+        # Normalize the incoming path so that we always scan a directory.
+        if not os.path.exists(file_path):
+            raise FileNotFoundError(f"Path not found: {file_path}")
+        if os.path.isfile(file_path):
+            folder_path = os.path.dirname(file_path)
+        else:
+            folder_path = file_path
+
+        # Finding no safetensors file is deliberately not an error: the maps
+        # simply stay empty and the caller decides what that means.
+        for root, _, files in os.walk(folder_path):
+            for file in sorted(files):
+                if not file.endswith(".safetensors"):
+                    continue
+                tensor_path = os.path.join(root, file)
+                # Handles are keyed by bare file name, so a same-named file in
+                # another sub-directory reuses the first handle that was opened.
+                if file not in self.file_handle_map:
+                    try:
+                        self.file_handle_map[file] = safe_open(tensor_path, framework="pt")
+                    except Exception as e:
+                        # Best effort: report the unreadable file and keep scanning.
+                        print(f"Error opening Safetensor file {tensor_path}: {e}")
+                        continue
+
+                handle = self.file_handle_map.get(file)
+                if handle is None:
+                    continue
+                try:
+                    for key in handle.keys():
+                        self.tensor_file_map[key] = file
+                except Exception as e:
+                    print(f"Error reading Safetensor file {tensor_path}: {e}")
+
+    def _handle_for(self, key: str):
+        """Return the open file handle that contains tensor `key`.
+
+        Raises:
+            KeyError: if `key` was not indexed.
+            FileNotFoundError: if the file's handle is gone (for example after
+                `close_all_handles`).
+        """
+        if key not in self.tensor_file_map:
+            raise KeyError(f"Key {key} not found in Safetensor files")
+        file = self.tensor_file_map[key]
+        handle = self.file_handle_map.get(file)
+        if handle is None:
+            raise FileNotFoundError(f"File {file} not found in Safetensor files")
+        return handle
+
+    def load_tensor(self, key: str, device: str="cpu"):
+        """Load tensor `key` as stored and move it to `device`.
+
+        Raises:
+            KeyError: if `key` was not indexed.
+            FileNotFoundError: if the file handle for `key` is no longer open.
+        """
+        return self._handle_for(key).get_tensor(key).to(device)
+
+    def close_all_handles(self):
+        """Close every open file handle and forget them.
+
+        `tensor_file_map` is left untouched, so later loads raise
+        FileNotFoundError rather than KeyError.
+        """
+        for handle in self.file_handle_map.values():
+            handle.close()
+        self.file_handle_map.clear()
+
+    def load_dequantized_tensor(self, key:str, device: str="cpu"):
+        """Load tensor `key` on `device`, dequantizing it when a scale is indexed.
+
+        For a key ending in ``.weight`` whose sibling ``.weight_scale_inv`` is
+        indexed, the pair is passed through `weight_dequant`; any other tensor
+        is returned as stored.
+
+        Raises:
+            KeyError: if `key` was not indexed.
+            FileNotFoundError: if a required file handle is no longer open.
+        """
+        tensor = self._handle_for(key).get_tensor(key).to(device)
+        if key.endswith(".weight"):
+            scale_key = key[:-7] + ".weight_scale_inv"
+            if scale_key in self.tensor_file_map:
+                # Resolve the scale through the index: it is not guaranteed to
+                # live in the same file as the weight it belongs to.
+                weight_scale_inv = self._handle_for(scale_key).get_tensor(scale_key).to(device)
+                tensor = weight_dequant(tensor, weight_scale_inv)
+        return tensor.to(device)
\ No newline at end of file
--- a/ktransformers/util/utils.py
+++ b/ktransformers/util/utils.py
@@ -21,6 +21,18 @@ from ktransformers.operators.flashinfer_wrapper import MLAWrapperSingleton

 warm_uped = False

+def get_compute_capability(device:torch.device = None):
+    """Return the major CUDA compute capability as an int.
+
+    Args:
+        device: device to query. When None, every visible GPU is inspected
+            and the smallest major version among them is returned.
+
+    Returns:
+        The major compute capability, or 0 when CUDA is not available, so
+        that numeric comparisons such as ``get_compute_capability() < 8``
+        stay valid instead of failing on None.
+    """
+    if not torch.cuda.is_available():
+        return 0
+    if device is not None:
+        # Return the major version, not the whole device-properties object.
+        return torch.cuda.get_device_properties(device).major
+    return min(
+        (
+            torch.cuda.get_device_properties(gpu_id).major
+            for gpu_id in range(torch.cuda.device_count())
+        ),
+        default=100,  # same sentinel the original loop started from
+    )
+
+
 def set_module(model, submodule_key, module):
    tokens = submodule_key.split('.')
    sub_tokens = tokens[:-1]
@@ -66,12 +78,23 @@ def load_cur_state_dict(module: nn.Module, gguf_loader: GGUFLoader, prefix: str
    for name, param in local_state.items():
        key = prefix + name
        translated_key = translate_name_to_gguf(key)
-        if translated_key in gguf_loader.tensor_file_map:
+        
+        # TODO: Merge all loader.
+        # I know this is ugly but lets do it for now.
+        if gguf_loader.safetensor_loader is not None:
+            load_dequantized_tensor = gguf_loader.safetensor_loader.load_dequantized_tensor
+            tensor_file_map = gguf_loader.safetensor_loader.tensor_file_map
+        else:
+            load_dequantized_tensor = gguf_loader.load_gguf_tensor
+            tensor_file_map = gguf_loader.tensor_file_map
+        
+        if translated_key in tensor_file_map:
            target_dtype = torch.get_default_dtype()
            device = get_device(translated_key[:translated_key.rfind(".")], gguf_loader.tensor_device_map)
            print(f"loading {translated_key} to {device}")
-            # device = "cpu" if "embd" in translated_key else "cuda"
-            weights = gguf_loader.load_gguf_tensor(translated_key, device = device).to(dtype = target_dtype)
+            torch.cuda.empty_cache() # To fit in 16G VRAM. By "wkGCaSS - 知乎 https://zhuanlan.zhihu.com/p/25491611225"
+            # weights = gguf_loader.load_gguf_tensor(translated_key, device = device).to(dtype = target_dtype)
+            weights = load_dequantized_tensor(translated_key, device=device).to(dtype=target_dtype)
            set_param(module, name, weights)
            del weights
        else:
@@ -153,6 +176,10 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
            inputs_embeds = model.model.embed_tokens(inputs.to("cpu"))
        else:
            inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to(torch_device)
+        if use_flashinfer_mla:
+            MLAWrapperSingleton.update_buffer(past_key_values.max_pages)
+            MLAWrapperSingleton.need_plan_all()
+            
        logits = model(
            inputs_embeds = inputs_embeds, cache_position=cache_position, past_key_values=past_key_values, return_dict=False, use_cache=True
        )[0][:,-1,:].unsqueeze(0).clone().to(torch_device)
@@ -175,6 +202,9 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
        else:
            next_token = torch.argmax(next_token_scores, dim=-1)
        first_token_time = time.time() - start_time
+        
+        if use_flashinfer_mla:
+            MLAWrapperSingleton.reset_buffer()

        prefill_count = seq_length
        prefill_time = first_token_time
@@ -192,15 +222,15 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
            
        start_time = time.time()
        for i in range(1, max_new_tokens):
+            if use_flashinfer_mla:
+                MLAWrapperSingleton.plan_all(None,None,None,position_ids.squeeze(1)+1,
+                                             num_heads, head_dim_ckv, head_dim_kpe, past_key_values.page_size,
+                                             q_head_dim ** (-0.5), torch.bfloat16, torch.bfloat16)
            global warm_uped
            if use_cuda_graph and ( (warm_uped == True and int(i) == 1) or (warm_uped == False and int(i) == 2) ):
                warm_uped = True
                cuda_graph_runner = CUDAGraphRunner()
                cuda_graph_runner.capture(model, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, torch_device, return_dict=False, use_cache=True)
-            if i > 1 and use_flashinfer_mla:
-                MLAWrapperSingleton.plan_all(None,None,None,position_ids.squeeze(1)+1,
-                                             num_heads, head_dim_ckv, head_dim_kpe, past_key_values.page_size,
-                                             q_head_dim ** (-0.5), torch.bfloat16, torch.bfloat16)
            next_token = decode_one_tokens(cuda_graph_runner, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, use_cuda_graph).to(torch_device)
            inputs = torch.cat((inputs, next_token.unsqueeze(0)), dim=-1)
            generated_ids[:, cache_position] = next_token.int()

--- a/merge_tensors/merge_safetensor_gguf.py
+++ b/merge_tensors/merge_safetensor_gguf.py
+# This script merges the FP8 safetensors weights with the GGUF-quantized tensors.
+
+import os
+# insert the path of the project
+import sys
+sys.path.insert(0, "/home/azure/ktransformers")
+import argparse
+import torch
+from ktransformers.util.custom_gguf import GGUFLoader, translate_name_to_gguf
+from safetensors import safe_open
+from safetensors.torch import save_file
+import re
+from collections import defaultdict
+
+def read_safetensor_keys_from_folder(folder_path)->dict:
+    """Map every tensor key found under ``folder_path`` to the file that holds it.
+
+    The folder is walked recursively and every ``*.safetensors`` file is opened
+    to list its keys. Files are visited in sorted order within each directory;
+    when a key occurs in more than one file, the file visited last wins. Keys
+    containing ``"model.layers.61"`` (the MTP layer) are skipped.
+
+    :param folder_path: directory to scan; if a file path is given, its parent
+        directory is scanned instead
+    :return: key_to_file_map, a dict of tensor key -> path of the safetensors
+        file containing that key
+    :raises FileNotFoundError: if ``folder_path`` does not exist, or if no
+        ``.safetensors`` file is found beneath it
+    """
+    # This function scans the safetensors checkpoint, so name it in the error
+    # (the message previously referred to the GGUF directory).
+    if not os.path.exists(folder_path):
+        raise FileNotFoundError(f"Safetensor dir not found: {folder_path}")
+    if os.path.isfile(folder_path):
+        folder_path = os.path.dirname(folder_path)
+
+    key_to_file_map = {}
+    found_safetensor = False
+    for root, _dirs, files in os.walk(folder_path):
+        for file in sorted(files):
+            if not file.endswith(".safetensors"):
+                continue
+            found_safetensor = True
+            file_path = os.path.join(root, file)
+            try:
+                with safe_open(file_path, framework="pt") as f:
+                    for key in f.keys():
+                        if "model.layers.61" in key:
+                            # skip MTP layer
+                            continue
+                        key_to_file_map[key] = file_path
+            except Exception as e:
+                # Best effort: report the unreadable file and keep scanning the rest.
+                print(f"Error reading Safetensor file {file_path}: {e}")
+
+    if not found_safetensor:
+        raise FileNotFoundError(f"No Safetensor files found in {folder_path}")
+
+    return key_to_file_map
+
+# Substrings of tensor keys: any non-expert key containing one of these is mapped
+# to the GGUF file instead of the safetensors file by combine_tensor_sources.
+tensor_from_gguf = [] # todo: add keys in gguf that should be used in the final tensor
+
+def translate_name(name:str)->str:
+    """
+    Convert a tensor name to the name used on the GGUF side.
+
+    The name is first passed through ``translate_name_to_gguf``; the
+    substrings listed in ``substitutions`` are then rewritten, in order.
+
+    :param name: name of the tensor
+    :return: translated name
+    """
+    substitutions = (
+        (".up_proj.", ".ffn_up_exps."),
+        (".down_proj.", ".ffn_down_exps."),
+        (".gate_proj.", ".ffn_gate_exps."),
+        (".ffn_gate_inp.e_score_correction_bias", ".exp_probs_b.bias"),
+    )
+    translated = translate_name_to_gguf(name)
+    for old, new in substitutions:
+        translated = translated.replace(old, new)
+    return translated
+    
+
+def combine_tensor_sources(safetensor_path:str, gguf_path:str):
+    """
+    Decide, for every tensor key, which file it should be read from.
+
+    Keys are taken from the safetensors folder. Expert keys (containing
+    ``".mlp.experts."``) are mapped to the GGUF file, other keys matching an
+    entry of ``tensor_from_gguf`` are mapped to the GGUF file as well, and
+    everything else stays with its safetensors file.
+
+    :param safetensor_path: folder holding the safetensors files
+    :param gguf_path: path handed to ``GGUFLoader``
+    :return: tuple ``(target_tensor_map, gguf_loader)`` where
+        ``target_tensor_map`` maps tensor key -> source file path
+    """
+    gguf_loader = GGUFLoader(gguf_path)
+    gguf_files = gguf_loader.tensor_file_map
+    safetensor_files = read_safetensor_keys_from_folder(safetensor_path)
+
+    target_tensor_map = {}
+    for key, safetensor_file in safetensor_files.items():
+        if ".mlp.experts." in key:
+            # experts always come from GGUF; their scale tensors are dropped
+            if '.weight_scale_inv' in key:
+                continue
+            # keep the first five dotted components and the last two
+            # (presumably this drops the per-expert index -- verify against the key layout)
+            parts = key.split('.')
+            merged_key = '.'.join(parts[:5] + parts[-2:])
+            target_tensor_map[merged_key] = gguf_files[translate_name(merged_key)]
+        elif any(target_key in key for target_key in tensor_from_gguf):
+            target_tensor_map[key] = gguf_files[translate_name(key)]
+        else:
+            target_tensor_map[key] = safetensor_file
+
+    return target_tensor_map, gguf_loader
+
+def _load_shard_tensors(keys, target_tensor_map, gguf_loader, open_safetensor):
+    """Load the tensors of one output shard, keyed by their translated names."""
+    weight_suffix = ".weight"
+    tensors = {}
+    for key in keys:
+        file_path = target_tensor_map[key]
+        out_name = translate_name(key)
+        ggml_type = None
+        if file_path.endswith('.safetensors'):
+            tensor = open_safetensor(file_path).get_tensor(key)
+        elif file_path.endswith('.gguf'):
+            tensor, ggml_type = gguf_loader.get_undequanted_tensor_and_ggml_type(out_name)
+        else:
+            raise ValueError(f"Unsupported file format: {file_path}")
+        tensors[out_name] = tensor
+        # NOTE(review): a ggml_type of 0 is falsy and therefore gets no
+        # ".ggml_type" entry -- confirm that this is intended.
+        if ggml_type:
+            # "<name>.weight" stores its type under "<name>.ggml_type"
+            if out_name.endswith(weight_suffix):
+                base_name = out_name[:-len(weight_suffix)]
+            else:
+                base_name = out_name
+            tensors[base_name + ".ggml_type"] = torch.tensor(ggml_type)
+    return tensors
+
+def write_combined_tensor(target_tensor_map: dict, output_path: str, gguf_loader: GGUFLoader):
+    """
+    Write the combined tensors to ``output_path`` as safetensors shards.
+
+    Keys matching ``.layers.<n>.`` are grouped by layer number and each layer
+    goes to its own shard, in ascending layer order. Keys without a layer
+    number are written together as the first shard. Tensors whose source is a
+    ``.gguf`` file are stored undequantized, with an extra ``*.ggml_type``
+    entry next to them.
+
+    Shard files are named ``model-<idx>-of-<last>.safetensors`` where ``idx``
+    starts at 0 and ``last`` is the index of the final shard.
+
+    :param target_tensor_map: tensor key -> path of the file it is read from
+    :param output_path: output directory, created if missing
+    :param gguf_loader: loader used to read tensors that live in GGUF files
+    :raises ValueError: if ``target_tensor_map`` is empty, or a source file is
+        neither ``.safetensors`` nor ``.gguf``
+    """
+    from contextlib import ExitStack
+
+    # Group tensors by layer
+    layer_groups = defaultdict(list)
+    non_layer_keys = []
+    layer_pattern = re.compile(r'\.layers\.(\d+)\.')
+    for key in target_tensor_map:
+        match = layer_pattern.search(key)
+        if match:
+            layer_groups[int(match.group(1))].append(key)
+        else:
+            non_layer_keys.append(key)
+
+    # Validate on the real shard count: a single shard is valid, zero is not.
+    num_shards = len(layer_groups) + (1 if non_layer_keys else 0)
+    if num_shards == 0:
+        raise ValueError("No tensors to save")
+    # File names keep the existing scheme, which labels the total with the
+    # index of the last shard.
+    total_shards = num_shards - 1
+
+    # Ensure output directory exists
+    os.makedirs(output_path, exist_ok=True)
+
+    # The ExitStack closes every opened safetensors handle on exit, including on error.
+    with ExitStack() as stack:
+        safetensors_cache = {}
+
+        def open_safetensor(file_path):
+            # open each source file once and reuse the handle across shards
+            if file_path not in safetensors_cache:
+                safetensors_cache[file_path] = stack.enter_context(
+                    safe_open(file_path, framework='pt')
+                )
+            return safetensors_cache[file_path]
+
+        shard_idx = 0
+
+        # Save non-layer tensors to the first shard if they exist
+        if non_layer_keys:
+            tensors = _load_shard_tensors(non_layer_keys, target_tensor_map, gguf_loader, open_safetensor)
+            output_file = os.path.join(output_path, f"model-{shard_idx:05}-of-{total_shards:05}.safetensors")
+            print(f"Saving non-layer tensors to {output_file}")
+            save_file(tensors, output_file)
+            print(tensors.keys())
+            shard_idx += 1
+
+        # Save each layer's tensors to subsequent shards
+        for layer_num in sorted(layer_groups.keys()):
+            tensors = _load_shard_tensors(layer_groups[layer_num], target_tensor_map, gguf_loader, open_safetensor)
+            output_file = os.path.join(output_path, f"model-{shard_idx:05}-of-{total_shards:05}.safetensors")
+            print(f"Saving layer {layer_num} to {output_file}")
+            save_file(tensors, output_file)
+            shard_idx += 1
+
+    return
+    
+def main():
+    """Parse the command-line options, then build and write the combined tensors."""
+    # build the command-line argument parser
+    parser = argparse.ArgumentParser(description="Read parameters from Safetensor and GGUF files")
+    options = (
+        ("--safetensor_path", "Path to the Safetensor file", "/mnt/data/model/DeepSeek-V3"),
+        ("--gguf_path", "Path to the GGUF file", "/mnt/data/model/DeepseekV3-q4km-gguf"),
+        ("--output_path", "Path to the output file", "/mnt/data/model/ktrans-safetensors/DeepSeek-V3-q4km-fp8"),
+    )
+    for flag, help_text, default_value in options:
+        parser.add_argument(flag, type=str, help=help_text, default=default_value)
+
+    # parse the command-line arguments once and echo them
+    print("All the arguments:")
+    args = parser.parse_args()
+    print(args)
+
+    target_tensor_map, gguf_loader = combine_tensor_sources(args.safetensor_path, args.gguf_path)
+    write_combined_tensor(target_tensor_map, args.output_path, gguf_loader)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
--- a/requirements-local_chat.txt
+++ b/requirements-local_chat.txt
@@ -4,4 +4,6 @@ numpy
 torch>=2.3.0
 packaging
 cpufeature
-protobuf
\ No newline at end of file
+protobuf
+tiktoken
+blobfile
\ No newline at end of file