Merge branch 'main' into main

ca1dc1e7 · Atream · GitHub · d3b45d57 · 505f4e2c · ca1dc1e7
Unverified Commit ca1dc1e7 authored Mar 01, 2025 by Atream Committed by GitHub Mar 01, 2025
14 changed files
--- a/ktransformers/server/main.py
+++ b/ktransformers/server/main.py
@@ -105,6 +105,7 @@ def custom_openapi(app):

 def main():
    cfg = Config()
+
    arg_parser = ArgumentParser(cfg)

    # 初始化消息

--- a/ktransformers/server/schemas/assistants/streaming.py
+++ b/ktransformers/server/schemas/assistants/streaming.py
@@ -73,7 +73,7 @@ class RunStepDelta(Object):

 class Done():
    def to_stream_reply(self):
-        return f"event: done\ndata: [DONE]\n\n"
+        return f"data: [DONE]\n\n"


 async def check_client_link(request: Request, async_events: AsyncIterable):

--- a/ktransformers/server/schemas/endpoints/chat.py
+++ b/ktransformers/server/schemas/endpoints/chat.py
@@ -25,7 +25,9 @@ class ChatCompletionCreate(BaseModel):
    messages: List[Message]
    model : str
    stream : bool = False
-
+    temperature: Optional[float] = None
+    top_p: Optional[float] = None
+    
    def get_tokenizer_messages(self):
        return [m.to_tokenizer_message() for m in self.messages]

@@ -75,4 +77,4 @@ class ChatCompletionChunk(ChatCompletionBase):
        ]

    def to_stream_reply(self):
-        return f"data:{self.model_dump_json()}\n\n"
+        return f"data: {self.model_dump_json()}\n\n"
--- a/ktransformers/server/schemas/legacy/completions.py
+++ b/ktransformers/server/schemas/legacy/completions.py
@@ -9,6 +9,8 @@ class CompletionCreate(BaseModel):
    model: str
    prompt: str | List[str]
    stream: bool = False
+    temperature: Optional[float] = None
+    top_p: Optional[float] = None

    def get_tokenizer_messages(self):
        if isinstance(self.prompt,List):

--- a/ktransformers/tests/mmlu_pro_test.py
+++ b/ktransformers/tests/mmlu_pro_test.py
+import argparse
+import random
+import time
+import json
+import requests
+import pandas as pd
+from datasets import load_dataset
+
+import os
+os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
+os.environ['https_proxy'] = ''
+os.environ['http_proxy'] = ''
+hint = 'There is a single choice question. Answer the question by replying A, B, C, D, E, F, G, H, I, J. No other answers are accepted. Just the letter.'
+
+
+class DataEvaluator:
+    """Load MMLU-Pro questions, build prompts and grade single-letter answers."""
+
+    def __init__(self):
+        # One dict per question; filled by load_data().
+        self.data = []
+
+    def load_data(self, file_path):
+        """
+        Append every record of the dataset's ``test`` split to ``self.data``.
+
+        :param file_path: Dataset identifier handed to ``load_dataset``
+            (the script's ``--file`` default is ``TIGER-Lab/MMLU-Pro``).
+        """
+        # Honour the caller-supplied dataset instead of a hard-coded name.
+        ds = load_dataset(file_path)
+        df = pd.DataFrame(ds['test'])
+        # Store plain dicts so records can be indexed by column name later.
+        self.data.extend(row.to_dict() for _, row in df.iterrows())
+
+    def get_prompt(self, record):
+        """
+        Build the full prompt for one question.
+
+        :param record: Dictionary with at least ``question`` and ``options``.
+        :return: The hint, the question, the lettered options and an answer cue.
+        """
+        # Label the options A, B, C, ... in dataset order.
+        options_str = "\n".join([f"{chr(65+i)}. {opt}" for i, opt in enumerate(record['options'])])
+        prompt = hint + "\nQuestion: " + record['question'] + "\n" + options_str + "\nAnswer: '"
+        return prompt
+
+    def post_processing(self, text):
+        """
+        Reduce a raw model reply to a single candidate letter.
+
+        :param text: The raw prediction string.
+        :return: The first character of the first line (after dropping leading
+            newlines), or an empty string when there is none.
+        """
+        text = text.lstrip('\n').split('\n')[0]
+        return text[:1]
+
+    def score(self, pred, answers):
+        """
+        Exact-match scoring.
+
+        :param pred: The processed prediction (a single character).
+        :param answers: Iterable of accepted answers; a plain string is
+            iterated character by character.
+        :return: 1 if ``pred`` equals any element of ``answers``, else 0.
+        """
+        return int(any(pred == answer for answer in answers))
+
+
+# Function to generate text using API
+def generate_text(api_url, question, model_name, stream=False):
+    """
+    Send one chat-completion request and return the reply text.
+
+    :param api_url: URL of the chat-completions endpoint.
+    :param question: Prompt sent as a single ``user`` message.
+    :param model_name: Value of the ``model`` field in the request.
+    :param stream: Forwarded as the ``stream`` field; the reply is still
+        parsed as one JSON document.
+    :return: The stripped content of the first choice ('' when the reply has
+        no usable content), or None when the HTTP status is not 200.
+    """
+    headers = {
+        'accept': 'application/json',
+        'Content-Type': 'application/json',
+        # Append the API key after "Bearer " when the server requires one.
+        'Authorization' : 'Bearer '
+    }
+    data = {
+        "messages": [{"content": question, "role": "user"}],
+        "model": model_name,
+        "stream": stream,
+        # "temperature": 0.0
+    }
+
+    print("POST data:", data)
+    response = requests.post(api_url, headers=headers, json=data)
+
+    if response.status_code != 200:
+        print(f"API Request failed with status code {response.status_code}")
+        return None
+
+    result = response.json()
+    # `choices` may be absent or empty and `message`/`content` may be null;
+    # fall back to empty values instead of raising IndexError/AttributeError.
+    choices = result.get('choices') or [{}]
+    message = choices[0].get('message') or {}
+    content = message.get('content') or ''
+    return content.strip()
+
+
+# Main function to handle multiple evaluations
+def main(concurrent_requests, data_evaluator: DataEvaluator, result_file, log_file, api_url, model_name):
+    """
+    Evaluate questions one after another and record per-question results.
+
+    :param concurrent_requests: Maximum number of questions to evaluate
+        (requests are issued sequentially, not in parallel).
+    :param data_evaluator: Evaluator whose ``data`` list is already loaded;
+        the list is shuffled in place with a fixed seed.
+    :param result_file: File that receives one JSON object per answered question.
+    :param log_file: File that receives the timing / score summary.
+    :param api_url: Chat-completions endpoint passed to ``generate_text``.
+    :param model_name: Model name passed to ``generate_text``.
+    """
+    start_total_time = time.time()
+
+    total_score = 0
+
+    # Fixed seed so every run evaluates the same subset in the same order.
+    random.seed(42)
+    random.shuffle(data_evaluator.data)
+    # The dataset may hold fewer records than requested; all statistics below
+    # are based on the number of questions actually attempted.
+    num_requests = max(0, min(concurrent_requests, len(data_evaluator.data)))
+    for i in range(num_requests):
+        data_item = data_evaluator.data[i]
+        question = data_evaluator.get_prompt(data_item)
+
+        # Start the timer for this evaluation
+        start_time = time.time()
+        try:
+            # Generate prediction using the API
+            prediction = generate_text(api_url, question, model_name)
+
+            if prediction is None:
+                raise RuntimeError(f"Failed to get prediction for {question}")
+
+            answer = data_item['answer']
+            # Post-process once and reuse for both scoring and reporting.
+            processed = data_evaluator.post_processing(prediction)
+            score = data_evaluator.score(processed, answer)
+
+            # Calculate the time taken
+            elapsed_time = time.time() - start_time
+
+            # Collect the result data
+            result_data = {
+                "question_id": data_item['question_id'],
+                "answer": answer,
+                "prediction": processed,
+                "score": score,
+                "time": elapsed_time
+            }
+
+            # Append the result as an indented JSON object followed by a newline.
+            with open(result_file, 'a', encoding='utf-8') as f:
+                json.dump(result_data, f, ensure_ascii=False, indent=4)
+                f.write("\n")
+
+            # Aggregate scores
+            total_score += score
+
+        except Exception as e:
+            # Best effort: a failed question is reported and counts as score 0.
+            print(f"Error processing request {i}: {e}")
+
+    # Calculate total time and throughput over the attempted questions.
+    total_time = time.time() - start_total_time
+    throughput = num_requests / total_time if total_time > 0 else 0.0
+    average_score = total_score / num_requests if num_requests else 0.0
+
+    # Log the total time, throughput, and average exact-match score.
+    with open(log_file, 'a', encoding='utf-8') as log_f:
+        log_f.write(f"Total Time: {total_time:.2f} seconds\n")
+        log_f.write(f"Throughput: {throughput:.2f} requests per second\n")
+        log_f.write(f"Average Scores: {average_score}\n")
+        log_f.write('-' * 40 + '\n')
+
+    print(f"Results saved to {result_file}")
+    print(f"Log saved to {log_file}")
+
+
+# Command-line entry point: parse options, load the dataset, run the evaluation.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="API Generate Tester")
+    # --concurrent caps how many questions are evaluated; main() sends them one at a time.
+    parser.add_argument("--concurrent", type=int, default=1000, help="Number of concurrent evaluations")
+    parser.add_argument("--file", type=str, default="TIGER-Lab/MMLU-Pro", help="Path to the mmlu.jsonl file")
+    parser.add_argument("--result", type=str, default="./mmlu_result_pro.json", help="Path to save the result JSON file")
+    parser.add_argument("--log", type=str, default="./mmlu_result_pro.log", help="Path to save the log file")
+    parser.add_argument("--model", type=str, default="Pro/deepseek-ai/DeepSeek-V3", help="Model name or path")
+    parser.add_argument("--api_url", type=str, default="http://localhost:15488/v1/chat/completions", help="API URL")
+    # Alternative hosted endpoint, kept for reference:
+    # parser.add_argument("--api_url", type=str, default="https://api.siliconflow.cn/v1/chat/completions", help="API URL")
+
+    args = parser.parse_args()
+
+    # Load the dataset records into the evaluator.
+    data_evaluator = DataEvaluator()
+    data_evaluator.load_data(args.file)
+
+    # Evaluate up to args.concurrent questions and write the result and log files.
+    main(args.concurrent, data_evaluator, args.result, args.log, args.api_url, args.model)
\ No newline at end of file
--- a/ktransformers/tests/mmlu_test.py
+++ b/ktransformers/tests/mmlu_test.py
+import argparse
+import random
+import time
+import json
+import requests
+import pandas as pd
+from datasets import load_dataset
+
+import os
+os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
+os.environ['https_proxy'] = ''
+os.environ['http_proxy'] = ''
+hint = 'There is a single choice question. Answer the question by replying A, B, C, D. No other answers are accepted. Just the letter.'
+
+
+class DataEvaluator:
+    """Holds the MMLU test records plus the prompt and grading helpers."""
+
+    def __init__(self):
+        # One dict per question; filled by load_data().
+        self.data = []
+
+    def load_data(self, file_path):
+        """
+        Append every row of the dataset's ``test`` split to ``self.data``.
+
+        :param file_path: Dataset name handed to ``load_dataset`` together
+            with the ``"all"`` configuration.
+        """
+        test_split = load_dataset(file_path, "all")['test']
+        frame = pd.DataFrame(test_split)
+        # Keep plain dicts so records can be indexed by column name later.
+        self.data.extend(row.to_dict() for _, row in frame.iterrows())
+
+    def get_prompt(self, record):
+        """
+        Assemble the prompt for one question.
+
+        :param record: Dictionary with at least ``question`` and ``choices``.
+        :return: The hint, the question, the lettered choices and an answer cue.
+        """
+        # Label the choices A, B, C, ... in dataset order.
+        lettered = [f"{chr(ord('A') + idx)}. {choice}" for idx, choice in enumerate(record['choices'])]
+        pieces = [hint, "\nQuestion: ", record['question'], "\n", "\n".join(lettered), "\nAnswer: '"]
+        return "".join(pieces)
+
+    def post_processing(self, text):
+        """
+        Reduce a raw model reply to a single candidate letter.
+
+        :param text: The raw prediction string.
+        :return: First character of the first line once leading newlines are
+            dropped; an empty string when nothing is left.
+        """
+        first_line = text.lstrip('\n').partition('\n')[0]
+        return first_line[:1]
+
+    def score(self, pred, answers):
+        """
+        Exact-match scoring.
+
+        :param pred: The processed prediction (a single character).
+        :param answers: Iterable of accepted answers; a plain string is
+            iterated character by character.
+        :return: 1 if ``pred`` equals any element of ``answers``, else 0.
+        """
+        return 1 if any(pred == candidate for candidate in answers) else 0
+
+
+# Function to generate text using API
+def generate_text(api_url, question, model_name, stream=False):
+    """
+    Send one chat-completion request and return the reply text.
+
+    :param api_url: URL of the chat-completions endpoint.
+    :param question: Prompt sent as a single ``user`` message.
+    :param model_name: Value of the ``model`` field in the request.
+    :param stream: Forwarded as the ``stream`` field; the reply is still
+        parsed as one JSON document.
+    :return: The stripped content of the first choice ('' when the reply has
+        no usable content), or None when the HTTP status is not 200.
+    """
+    headers = {
+        'accept': 'application/json',
+        'Content-Type': 'application/json',
+        # Append the API key after "Bearer " when the server requires one.
+        'Authorization' : 'Bearer '
+    }
+    data = {
+        "messages": [{"content": question, "role": "user"}],
+        "model": model_name,
+        "stream": stream,
+        # "temperature": 0.0
+    }
+
+    print("POST data:", data)
+    response = requests.post(api_url, headers=headers, json=data)
+
+    if response.status_code != 200:
+        print(f"API Request failed with status code {response.status_code}")
+        return None
+
+    result = response.json()
+    # `choices` may be absent or empty and `message`/`content` may be null;
+    # fall back to empty values instead of raising IndexError/AttributeError.
+    choices = result.get('choices') or [{}]
+    message = choices[0].get('message') or {}
+    content = message.get('content') or ''
+    return content.strip()
+
+
+# Main function to handle multiple evaluations
+def main(concurrent_requests, data_evaluator: DataEvaluator, result_file, log_file, api_url, model_name):
+    """
+    Evaluate questions one after another and record per-question results.
+
+    :param concurrent_requests: Maximum number of questions to evaluate
+        (requests are issued sequentially, not in parallel).
+    :param data_evaluator: Evaluator whose ``data`` list is already loaded;
+        the list is shuffled in place with a fixed seed.
+    :param result_file: File that receives one JSON object per answered question.
+    :param log_file: File that receives the timing / score summary.
+    :param api_url: Chat-completions endpoint passed to ``generate_text``.
+    :param model_name: Model name passed to ``generate_text``.
+    """
+    start_total_time = time.time()
+
+    total_score = 0
+
+    # Fixed seed so every run evaluates the same subset in the same order.
+    random.seed(42)
+    random.shuffle(data_evaluator.data)
+    # The dataset may hold fewer records than requested; all statistics below
+    # are based on the number of questions actually attempted.
+    num_requests = max(0, min(concurrent_requests, len(data_evaluator.data)))
+    for i in range(num_requests):
+        data_item = data_evaluator.data[i]
+        question = data_evaluator.get_prompt(data_item)
+
+        # Start the timer for this evaluation
+        start_time = time.time()
+        try:
+            # Generate prediction using the API
+            prediction = generate_text(api_url, question, model_name)
+
+            if prediction is None:
+                raise RuntimeError(f"Failed to get prediction for {question}")
+
+            # The dataset stores the answer as a 0-based index; map it to A, B, C, ...
+            answer = chr(data_item['answer'] + 65)
+            # Post-process once and reuse for both scoring and reporting.
+            processed = data_evaluator.post_processing(prediction)
+            score = data_evaluator.score(processed, answer)
+
+            # Calculate the time taken
+            elapsed_time = time.time() - start_time
+
+            # Collect the result data; the id is the position in the shuffled list.
+            result_data = {
+                "question_id": i,
+                "answer": answer,
+                "prediction": processed,
+                "score": score,
+                "time": elapsed_time
+            }
+
+            # Append the result as an indented JSON object followed by a newline.
+            with open(result_file, 'a', encoding='utf-8') as f:
+                json.dump(result_data, f, ensure_ascii=False, indent=4)
+                f.write("\n")
+
+            # Aggregate scores
+            total_score += score
+
+        except Exception as e:
+            # Best effort: a failed question is reported and counts as score 0.
+            print(f"Error processing request {i}: {e}")
+
+    # Calculate total time and throughput over the attempted questions.
+    total_time = time.time() - start_total_time
+    throughput = num_requests / total_time if total_time > 0 else 0.0
+    average_score = total_score / num_requests if num_requests else 0.0
+
+    # Log the total time, throughput, and average exact-match score.
+    with open(log_file, 'a', encoding='utf-8') as log_f:
+        log_f.write(f"Total Time: {total_time:.2f} seconds\n")
+        log_f.write(f"Throughput: {throughput:.2f} requests per second\n")
+        log_f.write(f"Average Scores: {average_score}\n")
+        log_f.write('-' * 40 + '\n')
+
+    print(f"Results saved to {result_file}")
+    print(f"Log saved to {log_file}")
+
+
+# Command-line entry point: parse options, load the dataset, run the evaluation.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="API Generate Tester")
+    # --concurrent caps how many questions are evaluated; main() sends them one at a time.
+    parser.add_argument("--concurrent", type=int, default=1000, help="Number of concurrent evaluations")
+    parser.add_argument("--file", type=str, default="cais/mmlu", help="Path to the mmlu.jsonl file")
+    parser.add_argument("--result", type=str, default="./mmlu_result_silicon.json", help="Path to save the result JSON file")
+    parser.add_argument("--log", type=str, default="./mmlu_result_silicon.log", help="Path to save the log file")
+    parser.add_argument("--model", type=str, default="Pro/deepseek-ai/DeepSeek-V3", help="Model name or path")
+    parser.add_argument("--api_url", type=str, default="http://localhost:10003/v1/chat/completions", help="API URL")
+    # Alternative hosted endpoint, kept for reference:
+    # parser.add_argument("--api_url", type=str, default="https://api.siliconflow.cn/v1/chat/completions", help="API URL")
+
+    args = parser.parse_args()
+
+    # Load the dataset named by --file into the evaluator.
+    data_evaluator = DataEvaluator()
+    data_evaluator.load_data(args.file)
+
+    # Evaluate up to args.concurrent questions and write the result and log files.
+    main(args.concurrent, data_evaluator, args.result, args.log, args.api_url, args.model)
\ No newline at end of file
--- a/ktransformers/tests/triton_fp8gemm_test.py
+++ b/ktransformers/tests/triton_fp8gemm_test.py
+import torch
+import torch.nn.functional as F
+from typing import Optional
+import pytest
+from typing import Tuple, Optional, Literal
+import time
+# use dir path
+import os
+import sys
+sys.path.insert(0, "/home/azure/ktransformers")
+print(sys.path)
+from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
+from safetensors import safe_open
+
+world_size = 1
+rank = 0
+block_size = 128
+gemm_impl: Literal["bf16", "fp8"] = "bf16"
+# Assuming `fp8_gemm`, `act_quant`, `weight_dequant` and other relevant functions are already defined
+
+def test_fp8_gemm_vs_torch_matmul():
+    """Print fp8_gemm on quantized random inputs next to a bf16 torch.matmul.
+
+    Nothing is asserted; the two results are printed for manual comparison.
+    Requires a CUDA device.
+    """
+    rows, inner, cols = 64, 128, 256  # (M, K, N) of the product
+    x = torch.randn(rows, inner, dtype=torch.bfloat16, device='cuda')
+    weight = torch.randn(cols, inner, dtype=torch.bfloat16, device='cuda')
+
+    # Quantize both operands with act_quant, then force contiguous layouts.
+    x_q, x_scale = act_quant(x, block_size)
+    w_q, w_scale = act_quant(weight, block_size)
+    x_q, w_q = x_q.contiguous(), w_q.contiguous()
+    x_scale, w_scale = x_scale.contiguous(), w_scale.contiguous()
+
+    # FP8 path on the quantized tensors.
+    result_fp8_gemm = fp8_gemm(x_q, x_scale, w_q, w_scale)
+
+    # Reference path on the original bf16 tensors.
+    result_torch_matmul = torch.matmul(x, weight.T)
+
+    print(f'result_torch_matmul: {result_torch_matmul.shape}')
+    print(f'result_fp8_gemm: {result_fp8_gemm.shape}')
+
+    print(f"result_fp8_gemm:\n {result_fp8_gemm}")
+    print(f"result_torch_matmul:\n {result_torch_matmul}")
+
+    
+def test_fp8_gemm_vs_torch_matmul_load():
+    """Print fp8_gemm on a stored checkpoint weight next to a bf16 matmul.
+
+    Reads one down_proj weight and its scale from a local DeepSeek-V3 shard,
+    so it needs that file and a CUDA device. Nothing is asserted.
+    """
+    checkpoint = "/mnt/data/model/DeepSeek-V3/model-00001-of-000163.safetensors"
+    with safe_open(checkpoint, framework="pt", device=0) as handle:
+        weight = handle.get_tensor("model.layers.0.mlp.down_proj.weight")
+        scale = handle.get_tensor("model.layers.0.mlp.down_proj.weight_scale_inv")
+
+    # Dequantized copy of the weight, used for the reference matmul.
+    dense_weight = weight_dequant(weight, scale)
+    print(f"weight_dequantized: {dense_weight.shape}")
+    _, in_features = dense_weight.shape
+    rows = 64
+    x = torch.randn(2, rows, in_features, dtype=torch.bfloat16, device='cuda')
+    x_q, x_scale = act_quant(x, block_size)
+
+    # Quantized activations against the weight exactly as stored in the shard.
+    result_fp8_gemm = fp8_gemm(x_q, x_scale, weight, scale)
+    print(f"result_fp8_gemm:\n {result_fp8_gemm}")
+    print(f"dtype {result_fp8_gemm.dtype}")
+
+    # Reference: bf16 activations against the dequantized weight.
+    result_torch_matmul = torch.matmul(x, dense_weight.to(torch.bfloat16).T)
+    print(f"result_torch_matmul:\n {result_torch_matmul}")
+
+
+def test_fp8_gemm_tplops():
+    """Benchmark act_quant + fp8_gemm on a stored checkpoint weight.
+
+    Prints the wall-clock time of the timed iterations and the resulting
+    TFLOP/s. The timed region includes act_quant, so the figure is a lower
+    bound for the GEMM alone. Requires the local DeepSeek-V3 shard and a
+    CUDA device.
+    """
+    file_path = "/mnt/data/model/DeepSeek-V3/model-00001-of-000163.safetensors"
+    with safe_open(file_path, framework="pt", device=0) as f:
+        weight = f.get_tensor("model.layers.0.mlp.down_proj.weight")
+        scale = f.get_tensor("model.layers.0.mlp.down_proj.weight_scale_inv")
+
+    # Dequantize only to learn the logical (N, K) shape of the weight.
+    weight_dequantized = weight_dequant(weight, scale)
+    print(f"weight_dequantized: {weight_dequantized.shape}")
+    N, K = weight_dequantized.shape
+    M = 6400
+    x = torch.randn(2, M, K, dtype=torch.bfloat16, device='cuda')
+
+    num_iters = 10
+    # x has a leading batch dimension, so every GEMM multiplies
+    # x.numel() // K rows (not just M) by the (N, K) weight.
+    rows = x.numel() // K
+    flops_per_gemm = 2 * rows * N * K
+    total_flops = num_iters * flops_per_gemm
+
+    # Two untimed warm-up rounds (presumably to exclude one-time kernel setup).
+    for _ in range(2):
+        x_quantized, scale_x = act_quant(x, block_size)
+        result_fp8_gemm = fp8_gemm(x_quantized, scale_x, weight, scale)
+
+    # Drain the warm-up work BEFORE starting the clock; CUDA launches are
+    # asynchronous, so otherwise pending kernels would be billed to the loop.
+    torch.cuda.synchronize()
+    t0 = time.perf_counter()
+    for _ in range(num_iters):
+        x_quantized, scale_x = act_quant(x, block_size)
+        result_fp8_gemm = fp8_gemm(x_quantized, scale_x, weight, scale)
+    torch.cuda.synchronize()
+    t1 = time.perf_counter()
+
+    total_time = t1 - t0
+    tflops = total_flops / total_time / 1e12
+    print(f"total_time: {total_time}")
+    print(f"tflops: {tflops}")
+
+    
+
+    
+    
+# Run all three checks when executed as a script. Each one allocates tensors
+# on 'cuda'; the last two also read a hard-coded local DeepSeek-V3 shard.
+if __name__ == "__main__":
+    test_fp8_gemm_vs_torch_matmul()
+    test_fp8_gemm_vs_torch_matmul_load()
+    test_fp8_gemm_tplops()
+    
\ No newline at end of file
--- a/ktransformers/util/custom_gguf.py
+++ b/ktransformers/util/custom_gguf.py
@@ -25,6 +25,9 @@ import os
 from enum import IntEnum
 import torch
 import KTransformersOps
+from .custom_loader import SafeTensorLoader
+import ctypes
+import math

 class GGMLQuantizationType(IntEnum):
    F32     = 0
@@ -109,6 +112,7 @@ GGML_TYPES = {
    "Q5_K": 13,
    "Q6_K": 14,
    "IQ4_XS": 23,
+    "BF16": 30,
 }

 GGML_NAMES = {ggml_type: name for name, ggml_type in GGML_TYPES.items()}
@@ -116,6 +120,7 @@ GGML_NAMES = {ggml_type: name for name, ggml_type in GGML_TYPES.items()}
 GGML_BLOCK_SIZES = {
    "F32": 4,
    "F16": 2,
+    "BF16": 2,
    "Q4_0": 2 + 16,
    "Q5_0": 2 + 4 + 16,
    "Q8_0": 2 + 32,
@@ -125,11 +130,13 @@ GGML_BLOCK_SIZES = {
    "Q5_K": 2 + 2 + 12 + 256 // 8 + 256 // 2,
    "Q6_K": 256 // 2 + 256 // 4 + 256 // 16 + 2,
    "IQ4_XS": 2 + 2 + 256 // 2 + 256 // 64,
+    "FP8": 1,
 }

 GGML_ELEMENTS_PER_BLOCK = {
    "F32": 1,
    "F16": 1,
+    "BF16": 1,
    "Q4_0": 32,
    "Q5_0": 32,
    "Q8_0": 32,
@@ -139,6 +146,7 @@ GGML_ELEMENTS_PER_BLOCK = {
    "Q5_K": 256,
    "Q6_K": 256,
    "IQ4_XS": 256,
+    "FP8": 1,
 }

 DATA_TYPES = {
@@ -155,6 +163,7 @@ DATA_TYPES = {
    "uint64": 10,
    "int64": 11,
    "float64": 12,
+    "FP8": 13,
 }

 class GGUFLoader:
@@ -162,10 +171,15 @@ class GGUFLoader:
    gguf_path: str
    tensor_file_map: dict # {tensor_name: tensor_file_path}
    gguf_file_meta: dict
+    safetensor_loader: SafeTensorLoader
    def __init__(self, gguf_path: str):
        # Check dir exist
        if not os.path.exists(gguf_path):
            raise FileNotFoundError(f"GGUF dir not found: {gguf_path}")
+        if os.path.isfile(gguf_path):
+            gguf_path = os.path.dirname(gguf_path)
+
+        self.safetensor_loader = None
        
        self.tensor_info = {}
        self.gguf_path = gguf_path
@@ -173,16 +187,26 @@ class GGUFLoader:
        self.file_data_map = {}
        self.gguf_file_meta = {}
        self.tensor_device_map = {}
-        
+
+        # I know this is ugly, but I don't want to change the original code too much
+        # TODO: merge gguf load and other loads.
+        safetensor_loader = SafeTensorLoader(gguf_path)
+        if safetensor_loader.tensor_file_map:
+            self.safetensor_loader = safetensor_loader
+            return
        # Walk through all the .gguf files in the directory
+        found_gguf = False
        for root, dirs, files in os.walk(gguf_path):
            for file in files:
                if file.endswith(".gguf"):
+                    found_gguf = True
                    file_name = os.path.join(root, file)
                    with open(file_name, "rb") as f:
                        self.load_gguf(f)
                        if file_name not in self.file_data_map:
                            self.file_data_map[file_name] = np.memmap(file_name, mode = 'r')
+        if not found_gguf:
+            raise FileNotFoundError(f"Cannot find any .gguf files in: {gguf_path}")
                            
    def load_gguf(self, f):
        f.seek(0)
@@ -207,7 +231,7 @@ class GGUFLoader:
            shape = [read_value(f, DATA_TYPES["uint64"]) for _ in range(shape_len)]
            ggml_type = read_value(f, DATA_TYPES["uint32"])
            bad_offset = read_value(f, DATA_TYPES["uint64"])
-            n_elems = int(np.prod(shape))
+            n_elems = int(math.prod(shape))
            block_size, type_size = GGML_QUANT_SIZES[ggml_type]
            n_bytes = n_elems * type_size // block_size
            np_dims = tuple(reversed(shape))
@@ -276,8 +300,49 @@ class GGUFLoader:
        itemsize = int(np.empty([], dtype = item_type).itemsize)
        return mmap_data[offset : offset + itemsize * item_count]
    
-    def load_gguf_tensor(self, name: str, device:str = "cpu")->torch.Tensor:
+    def get_undequanted_tensor_and_ggml_type(self, name):
+        """Return the raw (still quantized) data of ``name`` and its ggml type.
+
+        :param name: Key into ``self.tensor_info``.
+        :return: ``(data, ggml_type)`` where ``data`` is the memory-mapped
+            array from ``get_mmap_tensor`` wrapped by ``torch.from_numpy``.
+        """
+        info = self.tensor_info[name]
+        raw = self.get_mmap_tensor(name)
+        quant_type = info["ggml_type"]
+        return torch.from_numpy(raw), quant_type
+
+    def load_expert_tensor(self, name, data, expert_id, elements_per_expert, device = "cuda", target_dtype = torch.get_default_dtype())->torch.Tensor:
+        """Dequantize the slice of ``data`` that belongs to one expert.
+
+        :param name: Key into ``self.tensor_info``; supplies shape and ggml type.
+        :param data: Raw quantized data of the whole tensor; sliced in units
+            of the quantization block size.
+        :param expert_id: Index of the expert whose slice is extracted.
+        :param elements_per_expert: Number of elements per expert; must be a
+            multiple of the elements-per-block of the tensor's ggml type.
+        :param device: Uses the GPU dequantizer when the string contains
+            "cuda", otherwise the CPU dequantizer.
+        :param target_dtype: Passed to the GPU dequantizer only.
+            NOTE(review): the default is evaluated once, when the function is
+            defined, not on each call.
+        :return: The dequantized values viewed as ``shape[-2::-1]``.
+        :raises NotImplementedError: if the ggml type is not in ``GGML_NAMES``.
+        """
         t = self.tensor_info[name]
+        if device.lower() == "cpu":
+            print(f"loading expert {expert_id} of {name} with CPU")
+        shape = t["shape"]
+        ggml_type = t["ggml_type"]
+        if ggml_type not in GGML_NAMES:
+            raise NotImplementedError(f"ggml_type {ggml_type} not implemented")
+        ggml_name = GGML_NAMES[ggml_type]
+
+        # TODO: experts may fused in quant block, split it
+        assert elements_per_expert % GGML_ELEMENTS_PER_BLOCK[ggml_name] == 0, "experts may fused in quant block, please use CPU dequant"
+
+        # Locate this expert's run of blocks inside the raw data.
+        blocks_per_experts = elements_per_expert // GGML_ELEMENTS_PER_BLOCK[ggml_name]
+        block_size = GGML_BLOCK_SIZES[ggml_name]
+        offset = expert_id * block_size * blocks_per_experts
+        data = data[offset: offset + block_size * blocks_per_experts]
+        
+        if "cuda" in device.lower():
+            values = GGML_DEQUANTIZE_GPU[ggml_name](data, device, target_dtype)
+        else:
+            values = GGML_DEQUANTIZE[ggml_name](data)
+            values = torch.from_numpy(values.copy())
+
+        # BF16 data is reinterpreted as bfloat16 rather than converted.
+        if ggml_name == "BF16":
+            values = values.view(torch.bfloat16)
+        # Drop the last stored dimension and reverse the rest
+        # (presumably the last dimension is the expert count; confirm).
+        values = values.view(shape[-2::-1])
+
+        return values
+
+    def load_gguf_tensor(self, name: str, device:str = "cpu", target_dtype = None)->torch.Tensor:
+        t = self.tensor_info[name]
+        if device.lower() == "cpu":
+            print(f"loading {name} with CPU")
+        if target_dtype == None:
+            target_dtype = torch.get_default_dtype()
        
        shape = t["shape"]
        ggml_type = t["ggml_type"]
@@ -289,14 +354,38 @@ class GGUFLoader:

        data = self.get_mmap_tensor(name)

-        if "cuda" in device.lower():
-            values = GGML_DEQUANTIZE_GPU[ggml_name](data, device)
-            #values = GGML_DEQUANTIZE[ggml_name](data)
-            #print("load_gguf_tensor")
-            #values = torch.from_numpy(values).to(device = device)
+        block_size = GGML_BLOCK_SIZES[ggml_name]
+        elements_per_block = GGML_ELEMENTS_PER_BLOCK[ggml_name]
+        num_elements = int(np.prod(shape))
+        num_blocks = num_elements // elements_per_block
+        
+        blocks_per_iter = 16384
+        if num_blocks > blocks_per_iter: # dequant large tensor
+            values = torch.empty((num_blocks, elements_per_block), dtype=target_dtype, device=device)
+            for i in range( (num_blocks + blocks_per_iter - 1) // blocks_per_iter):
+                blocks_begin = i * blocks_per_iter
+                blocks_end = min(blocks_begin + blocks_per_iter, num_blocks)
+                if "cuda" in device.lower():
+                    cur_values = GGML_DEQUANTIZE_GPU[ggml_name](data[blocks_begin*block_size : blocks_end*block_size], device, target_dtype)
+                else:
+                    cur_values = GGML_DEQUANTIZE[ggml_name](data[blocks_begin*block_size : blocks_end*block_size])
+                    cur_values = torch.from_numpy(cur_values.copy())
+                
+                cur_values = cur_values.view(-1, elements_per_block)
+                if ggml_name == "BF16":
+                    cur_values = cur_values.view(torch.bfloat16)
+                values[blocks_begin : blocks_end] = cur_values
        else:
-            values = GGML_DEQUANTIZE[ggml_name](data)
-            values = torch.from_numpy(values)
+            if "cuda" in device.lower():
+                values = GGML_DEQUANTIZE_GPU[ggml_name](data, device)
+            else:
+                values = GGML_DEQUANTIZE[ggml_name](data)
+                values = torch.from_numpy(values)
+                
+        if ggml_name == "BF16":
+            values = values.view(torch.bfloat16)
+            
+
        values = values.view(shape[::-1])
        if "attn_q" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
            n_head = self.gguf_file_meta['llama.attention.head_count']
@@ -352,6 +441,9 @@ def read_value(f, data_type):
        elem_type, count = struct.unpack("<IQ", f.read(4 + 8))
        return [read_value(f, elem_type) for _ in range(count)]

+    elif data_type == DATA_TYPES["FP8"]:
+        return struct.unpack("<B", f.read(1))[0]
+
    else:
        raise NotImplementedError(f"Data type {data_type} not implemented")

@@ -392,14 +484,15 @@ def dequantize_q2_k(data):

    return d * (scales & 15) * (tmp & 3) - dmin * (scales >> 4)

-def dequantize_q2_k_gpu(data, device:str ="cuda"):
+def dequantize_q2_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()):
    block_size = GGML_BLOCK_SIZES["Q2_K"]
+    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q2_K"]
    data = np.frombuffer(data, dtype=data.dtype)
    device = torch.device(device)
    # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable, 
    # the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
-    data = torch.from_numpy(data)
-    return KTransformersOps.dequantize_q2_k(data, block_size, device)
+    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
+    return KTransformersOps.dequantize_q2_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)

 def dequantize_q3_k(data):
    # C implementation
@@ -443,14 +536,15 @@ def dequantize_q3_k(data):
        (((qs[:, 48:64] >> 6) & 3) - bits[:, 16:, 7])
    ], axis=1)

-def dequantize_q3_k_gpu(data, device:str ="cuda"):
+def dequantize_q3_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()):
    block_size = GGML_BLOCK_SIZES["Q3_K"]
+    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q3_K"]
    data = np.frombuffer(data, dtype=data.dtype)
    device = torch.device(device)
    # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable, 
    # the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
-    data = torch.from_numpy(data)
-    return KTransformersOps.dequantize_q3_k(data, block_size, device)
+    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
+    return KTransformersOps.dequantize_q3_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)

 def dequantize_q4_k(data):
    # C implementation
@@ -474,13 +568,15 @@ def dequantize_q4_k(data):
    # Dequantize final weights using scales and offsets
    return factors * qs2 - offsets

-def dequantize_q4_k_gpu(data, device:str ="cuda"):
+def dequantize_q4_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()):
+    block_size = GGML_BLOCK_SIZES["Q4_K"]
+    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q4_K"]
    data = np.frombuffer(data, dtype=data.dtype)
    device = torch.device(device)
    # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable, 
    # the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
-    data = torch.from_numpy(data)
-    return KTransformersOps.dequantize_q4_k(data, 144, device)
+    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
+    return KTransformersOps.dequantize_q4_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)

 def dequantize_q5_k(data):
    # C implementation
@@ -538,14 +634,15 @@ def dequantize_q5_k(data):
        d8 * (qs_hi_4[:, 3] + (bits[:, :, 7] << 4)) - m8,
    ], axis=1)

-def dequantize_q5_k_gpu(data, device:str ="cuda"):
+def dequantize_q5_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()):
    block_size = GGML_BLOCK_SIZES["Q5_K"]
+    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q5_K"]
    data = np.frombuffer(data, dtype=data.dtype)
    device = torch.device(device)
    # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable, 
    # the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
-    data = torch.from_numpy(data)
-    return KTransformersOps.dequantize_q5_k(data, block_size, device)
+    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
+    return KTransformersOps.dequantize_q5_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)

 def dequantize_q6_k(data):
    # C implementation
@@ -596,13 +693,14 @@ def dequantize_q6_k(data):
    ], axis=1) 

 # @torch.jit.script
-def dequantize_q6_k_gpu(data: np.ndarray, device:str = "cuda"):
+def dequantize_q6_k_gpu(data: np.ndarray, device:str = "cuda", target_dtype = torch.get_default_dtype()):
    block_size = GGML_BLOCK_SIZES["Q6_K"]
+    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q6_K"]
    device = torch.device(device)
    num_blocks = len(data) // block_size
    data = np.frombuffer(data, dtype=data.dtype)
-    data = torch.from_numpy(data)
-    return KTransformersOps.dequantize_q6_k(data, block_size, device)
+    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
+    return KTransformersOps.dequantize_q6_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)

 kvalues_iq4nl = np.array([-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113], dtype=np.int8)

@@ -636,13 +734,14 @@ def dequantize_iq4_xs(data):

    return y.flatten()

-def dequantize_iq4_xs_gpu(data: np.ndarray, device:str = "cuda"):
+def dequantize_iq4_xs_gpu(data: np.ndarray, device:str = "cuda", target_dtype = torch.get_default_dtype()):
    block_size = GGML_BLOCK_SIZES["IQ4_XS"]
+    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["IQ4_XS"]
    device = torch.device(device)
    num_blocks = len(data) // block_size
    data = np.frombuffer(data, dtype=data.dtype)
-    data = torch.from_numpy(data)
-    return KTransformersOps.dequantize_iq4_xs(data, block_size, device)
+    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
+    return KTransformersOps.dequantize_iq4_xs(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)

 def dequantize_q4_0(data):
    # C implementation
@@ -659,7 +758,7 @@ def dequantize_q4_0(data):
        scales * ((qs >> 4).astype(np.int8) - 8),
    ], axis=1)

-def dequantize_q4_0_gpu(data):
+def dequantize_q4_0_gpu(data, device:str = "cuda", target_dtype = torch.get_default_dtype()):
    raise NotImplementedError()

 def dequantize_q5_0(data):
@@ -683,7 +782,7 @@ def dequantize_q5_0(data):
        scales * x1,
    ], axis=1)

-def dequantize_q5_0_gpu(data):
+def dequantize_q5_0_gpu(data, device:str = "cuda", target_dtype = torch.get_default_dtype()):
    raise NotImplementedError()

 def dequantize_q8_0(data):
@@ -695,32 +794,41 @@ def dequantize_q8_0(data):
    qs = np.frombuffer(data, dtype=np.int8).reshape(num_blocks, 2 + 32)[:, 2:]
    return scales * qs

-def dequantize_q8_0_gpu(data, device:str = "cuda"):
+def dequantize_q8_0_gpu(data, device:str = "cuda", target_dtype = torch.get_default_dtype()):
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L43
-    num_blocks = len(data) // GGML_BLOCK_SIZES["Q8_0"]
+    
+    block_size = GGML_BLOCK_SIZES["Q8_0"]
+    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q8_0"]
    device = torch.device(device)
    data = np.frombuffer(data, dtype=data.dtype)
-    data = torch.from_numpy(data)
-    return KTransformersOps.dequantize_q8_0(data, 34, device)
+    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
+    return KTransformersOps.dequantize_q8_0(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)


 def dequantize_f32(data):
    return np.frombuffer(data, dtype=np.float32)

-def dequantize_f32_gpu(data, device):
+def dequantize_f32_gpu(data, device, target_dtype = torch.get_default_dtype()):
    data = np.frombuffer(data, dtype=np.float32)
-    res = torch.from_numpy(data)
-    res_gpu = torch.empty_like(res, device=device)
+    res = torch.from_numpy(data.copy())
+    res_gpu = torch.empty_like(res, device=device, dtype=target_dtype)
    res_gpu.copy_(res)
    return res_gpu

 def dequantize_f16(data):
    return np.frombuffer(data, dtype=np.float16)

-def dequantize_f16_gpu(data, device):
+def dequantize_f16_gpu(data, device, target_dtype = torch.get_default_dtype()):
+    data = np.frombuffer(data, dtype=np.float16)
+    res = torch.from_numpy(data.copy())
+    res_gpu = torch.empty_like(res, device=device, dtype=target_dtype)
+    res_gpu.copy_(res)
+    return res_gpu
+
+def dequantize_bf16_gpu(data, device, target_dtype = torch.get_default_dtype()):
    data = np.frombuffer(data, dtype=np.float16)
-    res = torch.from_numpy(data)
+    res = torch.from_numpy(data.copy())
    res_gpu = torch.empty_like(res, device=device)
    res_gpu.copy_(res)
    return res_gpu
@@ -728,6 +836,7 @@ def dequantize_f16_gpu(data, device):
 GGML_DEQUANTIZE = {
    "F32": dequantize_f32,
    "F16": dequantize_f16,
+    "BF16": dequantize_f16,
    "Q4_0": dequantize_q4_0,
    "Q5_0": dequantize_q5_0,
    "Q8_0": dequantize_q8_0,
@@ -742,6 +851,7 @@ GGML_DEQUANTIZE = {
 GGML_DEQUANTIZE_GPU = {
    "F32": dequantize_f32_gpu,
    "F16": dequantize_f16_gpu,
+    "BF16": dequantize_bf16_gpu,
    "Q4_0": dequantize_q4_0_gpu,
    "Q5_0": dequantize_q5_0_gpu,
    "Q8_0": dequantize_q8_0_gpu,

--- a/ktransformers/util/custom_loader.py
+++ b/ktransformers/util/custom_loader.py
+import struct
+import warnings
+import numpy as np
+import re
+import numpy.typing as npt
+from typing import Sequence
+import os
+from enum import IntEnum
+import torch
+import KTransformersOps
+from safetensors import safe_open
+from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
+from safetensors.torch import save_file
+
+class SafeTensorLoader:
+    """Index the ``.safetensors`` files under a path and load tensors by key.
+
+    The lookup tables are created per instance in ``__init__``:
+
+    * ``tensor_file_map``: tensor key -> name of the file that holds it.
+    * ``tensor_type_map``: kept for interface compatibility; nothing in this
+      class fills it.
+    * ``file_handle_map``: file name -> handle returned by ``safe_open``.
+    """
+    tensor_file_map: dict
+    tensor_type_map: dict
+    file_handle_map: dict
+
+    def __init__(self, file_path: str):
+        # Build fresh maps for every loader. Class-level dicts would be shared
+        # by all instances and leak keys/handles from one model into another.
+        self.tensor_file_map = {}
+        self.tensor_type_map = {}
+        self.file_handle_map = {}
+        self.__load_tensor_file_map(file_path)
+
+    def __load_tensor_file_map(self, file_path: str):
+        """Open every ``.safetensors`` file below ``file_path`` and index its keys.
+
+        :param file_path: a directory, or a file whose parent directory is scanned
+        :raises FileNotFoundError: if ``file_path`` does not exist
+        """
+        # Normalize the incoming path so that a directory is always scanned.
+        if not os.path.exists(file_path):
+            raise FileNotFoundError(f"Path not found: {file_path}")
+        if os.path.isfile(file_path):
+            folder_path = os.path.dirname(file_path)
+        else:
+            folder_path = file_path
+
+        # A folder without any .safetensors file is tolerated: the maps simply
+        # stay empty and later lookups raise KeyError.
+        for root, _, files in os.walk(folder_path):
+            for file in sorted(files):
+                if not file.endswith(".safetensors"):
+                    continue
+                full_path = os.path.join(root, file)
+                # NOTE(review): handles are keyed by base name, so two files with
+                # the same name in different sub-folders share one entry.
+                if file not in self.file_handle_map:
+                    try:
+                        self.file_handle_map[file] = safe_open(full_path, framework="pt")
+                    except Exception as e:
+                        # Best effort: report the unreadable file and keep scanning.
+                        print(f"Error opening Safetensor file {full_path}: {e}")
+                        continue
+
+                f = self.file_handle_map.get(file)
+                if f is None:
+                    continue
+                try:
+                    for key in f.keys():
+                        self.tensor_file_map[key] = file
+                except Exception as e:
+                    print(f"Error reading Safetensor file {full_path}: {e}")
+
+    def _handle_for(self, key: str):
+        """Return the open file handle that contains ``key``.
+
+        :raises KeyError: if ``key`` was not indexed
+        :raises FileNotFoundError: if the indexed file has no open handle
+        """
+        if key not in self.tensor_file_map:
+            raise KeyError(f"Key {key} not found in Safetensor files")
+        file = self.tensor_file_map[key]
+        f = self.file_handle_map.get(file)
+        if f is None:
+            raise FileNotFoundError(f"File {file} not found in Safetensor files")
+        return f
+
+    def load_tensor(self, key: str, device: str="cpu"):
+        """Load the tensor stored under ``key`` and move it to ``device``.
+
+        :raises KeyError: if ``key`` was not indexed
+        :raises FileNotFoundError: if the file holding ``key`` has no open handle
+        """
+        tensor = self._handle_for(key).get_tensor(key)
+        return tensor.to(device)
+
+    def close_all_handles(self):
+        """Close every open file handle and forget it."""
+        for handle in self.file_handle_map.values():
+            handle.close()
+        self.file_handle_map.clear()
+
+    def load_dequantized_tensor(self, key:str, device: str="cpu"):
+        """Load ``key`` on ``device``, dequantizing it when a scale tensor exists.
+
+        For a key ending in ``.weight`` whose sibling ``.weight_scale_inv`` key
+        was indexed, the weight is passed through ``weight_dequant`` together
+        with that scale; every other tensor is returned as stored.
+
+        :raises KeyError: if ``key`` was not indexed
+        :raises FileNotFoundError: if a required file has no open handle
+        """
+        tensor = self._handle_for(key).get_tensor(key).to(device)
+        if key.endswith(".weight"):
+            # key[:-7] strips the trailing ".weight".
+            scale_key = key[:-7] + ".weight_scale_inv"
+            if scale_key in self.tensor_file_map:
+                # The scale may live in a different shard than the weight, so
+                # resolve its own handle instead of reusing the weight's.
+                weight_scale_inv = self._handle_for(scale_key).get_tensor(scale_key).to(device)
+                tensor = weight_dequant(tensor, weight_scale_inv)
+        return tensor.to(device)
\ No newline at end of file
--- a/ktransformers/util/utils.py
+++ b/ktransformers/util/utils.py
@@ -17,9 +17,22 @@ from ktransformers.operators import base_operator
 from ktransformers.models.custom_cache import StaticCache
 from ktransformers.util.cuda_graph_runner import CUDAGraphRunner
 from ktransformers.util.textstream import TextStreamer
+from ktransformers.operators.flashinfer_wrapper import MLAWrapperSingleton

 warm_uped = False

+def get_compute_capability(device:torch.device = None):
+    """Return the major CUDA compute-capability version.
+
+    :param device: the device to query; when ``None`` the minimum major version
+        over all visible GPUs is returned
+    :return: the major version as an int, or ``None`` when CUDA is unavailable
+    """
+    if not torch.cuda.is_available():
+        return None
+    if device is not None:
+        # Return the major version, matching the ``device is None`` branch,
+        # rather than the whole device-properties object.
+        return torch.cuda.get_device_properties(device).major
+
+    # 100 is a sentinel larger than any real major version.
+    min_compute_capability_major = 100
+    for gpu_id in range(torch.cuda.device_count()):
+        gpu_props = torch.cuda.get_device_properties(gpu_id)
+        min_compute_capability_major = min(min_compute_capability_major, gpu_props.major)
+    return min_compute_capability_major
+
 def set_module(model, submodule_key, module):
    tokens = submodule_key.split('.')
    sub_tokens = tokens[:-1]
@@ -65,12 +78,22 @@ def load_cur_state_dict(module: nn.Module, gguf_loader: GGUFLoader, prefix: str
    for name, param in local_state.items():
        key = prefix + name
        translated_key = translate_name_to_gguf(key)
-        if translated_key in gguf_loader.tensor_file_map:
+        
+        # TODO: Merge all loader.
+        # I know this is ugly but lets do it for now.
+        if gguf_loader.safetensor_loader is not None:
+            load_dequantized_tensor = gguf_loader.safetensor_loader.load_dequantized_tensor
+            tensor_file_map = gguf_loader.safetensor_loader.tensor_file_map
+        else:
+            load_dequantized_tensor = gguf_loader.load_gguf_tensor
+            tensor_file_map = gguf_loader.tensor_file_map
+        
+        if translated_key in tensor_file_map:
            target_dtype = torch.get_default_dtype()
            device = get_device(translated_key[:translated_key.rfind(".")], gguf_loader.tensor_device_map)
            print(f"loading {translated_key} to {device}")
-            # device = "cpu" if "embd" in translated_key else "cuda"
-            weights = gguf_loader.load_gguf_tensor(translated_key, device = device).to(dtype = target_dtype)
+            torch.cuda.empty_cache()
+            weights = load_dequantized_tensor(translated_key, device=device).to(dtype=target_dtype)
            set_param(module, name, weights)
            del weights
        else:
@@ -78,7 +101,7 @@ def load_cur_state_dict(module: nn.Module, gguf_loader: GGUFLoader, prefix: str
            raise Exception(f"can't find {translated_key} in GGUF file!")
        
 def load_weights(module:nn.Module, gguf_loader:GGUFLoader, prefix=''):
-    # print(f"recursively loading weights {prefix},{return_when_injected=}, {only_load_injected=}")
+    #print(f"recursively loading weights {prefix}")
    if not isinstance(module, base_operator.BaseInjectedModule):
        load_cur_state_dict(module, gguf_loader, prefix)
        for name, child in module._modules.items():
@@ -87,7 +110,8 @@ def load_weights(module:nn.Module, gguf_loader:GGUFLoader, prefix=''):
        module.load()

 def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cuda_graph: bool = True,
-                         mode = 'normal', force_think: bool = False):
+                         mode = 'normal', force_think: bool = False, chunk_prefill_size = 16384, use_flashinfer_mla = False,
+                         num_heads = None, head_dim_ckv = None, head_dim_kpe = None, q_head_dim = None):
    import os
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    torch._dynamo.config.suppress_errors = True
@@ -100,7 +124,7 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud

    tokens = []
    
-    def decode_one_tokens(cuda_graph_runner, cur_token, position_ids, cache_position, past_key_values, use_cuda_graph: bool = True):
+    def decode_one_tokens(cuda_graph_runner, cur_token, position_ids, cache_position, past_key_values, logits_warper, generation_config, use_cuda_graph: bool = True):
        if cuda_graph_runner is None:
            use_cuda_graph = False
        if use_cuda_graph:
@@ -128,8 +152,25 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
            next_token = torch.argmax(next_token_scores, dim=-1)
        return next_token
    
+    # TODO: use CUDA Graph for chunk prefill, may get small improvement
+    def chunk_prefill(inputs, cache_position, past_key_values):
+        """Run one prefill chunk through the model and return its last-position logits.
+
+        Reads ``mode``, ``model``, ``torch_device`` and ``use_flashinfer_mla``
+        from the enclosing ``prefill_and_generate`` scope.
+        """
+        # The token ids are moved to the CPU before the embedding lookup; only
+        # outside "long_context" mode are the embeddings moved to torch_device.
+        if mode == "long_context":
+            inputs_embeds = model.model.embed_tokens(inputs.to("cpu"))
+        else:
+            inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to(torch_device)
+        if use_flashinfer_mla:
+            # NOTE(review): presumably resizes the FlashInfer MLA buffers to the
+            # cache's page count and marks the wrappers for re-planning —
+            # confirm against MLAWrapperSingleton.
+            MLAWrapperSingleton.update_buffer(past_key_values.max_pages)
+            MLAWrapperSingleton.need_plan_all()
+            
+        # Take the first model output, keep only the last position along dim 1,
+        # add a leading dimension and copy the result onto torch_device.
+        logits = model(
+            inputs_embeds = inputs_embeds, cache_position=cache_position, past_key_values=past_key_values, return_dict=False, use_cache=True
+        )[0][:,-1,:].unsqueeze(0).clone().to(torch_device)
+        
+        return logits
+    
    torch.cuda.set_device(torch_device)
    with torch.no_grad():
+        
        stream = TextStreamer(tokenizer)
        if mode != 'long_context':
            past_key_values = StaticCache(
@@ -137,26 +178,11 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
            )
        else:
            past_key_values = None
-        cache_position = torch.arange(seq_length, device=torch_device, dtype=torch.long)
-        generated_ids = torch.zeros(
-            batch_size, seq_length + max_new_tokens + 1, dtype=torch.int, device=torch_device
-        )
-        generated_ids[:, cache_position] = inputs.to(torch_device).to(torch.int)
-        if past_key_values != None:
-            past_key_values.cur_idx=cache_position
-        start_time = time.time()
-
-        inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to(torch_device)
-        if mode == "long_context":
-            inputs_embeds = model.model.embed_tokens(inputs.to("cpu"))
-        else:
-            inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to(torch_device)
-        logits = model(
-            inputs_embeds = inputs_embeds, cache_position=cache_position, past_key_values=past_key_values, return_dict=False, use_cache=True
-        )[0][:,-1,:].unsqueeze(0).clone().to(torch_device)
+        
        generation_config, model_kwargs = model._prepare_generation_config(
-            None, max_length=max_new_tokens,
-            do_sample=True, top_k=5, top_p=0.85, temperature=0.1 # change this to modify generate config
+            None, do_sample=True
+            # change this to modify generate config
+            #top_k=5, top_p=0.85, temperature=0.1
        )
        try: # transformers==4.43
            logits_warper = (
@@ -166,23 +192,43 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
            logits_warper = (
                model._get_logits_warper(generation_config)
            )
+
+        cache_position = torch.arange(seq_length, device=torch_device, dtype=torch.int32)
+        generated_ids = torch.zeros(
+            batch_size, seq_length + max_new_tokens + 1, dtype=torch.int, device=torch_device
+        )
+        generated_ids[:, cache_position] = inputs.to(torch_device).to(torch.int)
+        start_time = time.time()
+
+        chunk_start = 0
+        while chunk_start < seq_length:
+            chunk_end = min(chunk_start + chunk_prefill_size, seq_length)
+            if past_key_values != None:
+                past_key_values.cur_idx=cache_position[chunk_start:chunk_end]
+            logits = chunk_prefill(inputs[:, chunk_start:chunk_end], cache_position[chunk_start:chunk_end], past_key_values)
+            chunk_start += chunk_prefill_size
+
        next_token_scores = logits_warper(inputs, logits[:, -1, :])
        if generation_config.do_sample:
            probs = nn.functional.softmax(next_token_scores, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1).squeeze(1)
        else:
            next_token = torch.argmax(next_token_scores, dim=-1)
+
        first_token_time = time.time() - start_time
+        
+        if use_flashinfer_mla:
+            MLAWrapperSingleton.reset_buffer()

        prefill_count = seq_length
        prefill_time = first_token_time
        if force_think:
-            print("<think>\n")
+            print("<think>")
        print(stream.put(next_token.item()), end="", flush=True)
        generated_ids[:, seq_length] = next_token
        tokens.append(int(next_token))
        inputs = torch.cat((inputs, next_token.unsqueeze(0)), dim=-1)
-        cache_position = torch.tensor([seq_length], device=torch_device, dtype=torch.long)
+        cache_position = torch.tensor([seq_length], device=torch_device, dtype=torch.int32)
        position_ids = cache_position.unsqueeze(0)
        seq_length += 1
        
@@ -190,19 +236,22 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
            
        start_time = time.time()
        for i in range(1, max_new_tokens):
+            if use_flashinfer_mla:
+                MLAWrapperSingleton.plan_all(None,None,None,position_ids.squeeze(1)+1,
+                                             num_heads, head_dim_ckv, head_dim_kpe, past_key_values.page_size,
+                                             q_head_dim ** (-0.5), torch.bfloat16, torch.bfloat16)
            global warm_uped
            if use_cuda_graph and ( (warm_uped == True and int(i) == 1) or (warm_uped == False and int(i) == 2) ):
                warm_uped = True
                cuda_graph_runner = CUDAGraphRunner()
                cuda_graph_runner.capture(model, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, torch_device, return_dict=False, use_cache=True)
-                
-            next_token = decode_one_tokens(cuda_graph_runner, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, use_cuda_graph).to(torch_device)
+            next_token = decode_one_tokens(cuda_graph_runner, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, logits_warper, generation_config, use_cuda_graph).to(torch_device)
            inputs = torch.cat((inputs, next_token.unsqueeze(0)), dim=-1)
            generated_ids[:, cache_position] = next_token.int()
            tokens.append(int(next_token))
            seq_length += 1
            
-            if next_token[0].item() == tokenizer.eos_token_id or tokenizer.decode(next_token) == '<|im_end|>':
+            if next_token[0].item() == tokenizer.eos_token_id or tokenizer.decode(next_token.tolist()) == '<|im_end|>':
                print(stream.end(), end="", flush=True)
                break
            else:

--- a/merge_tensors/merge_safetensor_gguf.py
+++ b/merge_tensors/merge_safetensor_gguf.py
+# This script merges the FP8 safetensors weights with the GGUF-quantized tensors.
+
+import os
+# insert the path of the project
+import sys
+sys.path.insert(0, "/home/azure/ktransformers")
+import argparse
+import torch
+from ktransformers.util.custom_gguf import GGUFLoader, translate_name_to_gguf
+from safetensors import safe_open
+from safetensors.torch import save_file
+import re
+from collections import defaultdict
+
+def read_safetensor_keys_from_folder(folder_path)->dict:
+    """
+    Map every tensor key found in the ``.safetensors`` files under a folder
+    to the path of the file that contains it.
+
+    Keys containing ``model.layers.61`` (the MTP layer) are skipped. Files that
+    cannot be read are reported on stdout and ignored.
+
+    :param folder_path: folder path; a file path is replaced by its parent folder
+    :return: key_to_file_map
+    :raises FileNotFoundError: if the path does not exist or holds no
+        ``.safetensors`` file
+    """
+    # The path refers to the safetensors checkpoint, so the message names it
+    # as such rather than as a GGUF directory.
+    if not os.path.exists(folder_path):
+        raise FileNotFoundError(f"Safetensor path not found: {folder_path}")
+    if os.path.isfile(folder_path):
+        folder_path = os.path.dirname(folder_path)
+
+    key_to_file_map = {}
+
+    found_safetensor = False
+    for root, _, files in os.walk(folder_path):
+        # Sorted so that the scan order is deterministic.
+        for file in sorted(files):
+            if not file.endswith(".safetensors"):
+                continue
+            found_safetensor = True
+            file_path = os.path.join(root, file)
+            try:
+                with safe_open(file_path, framework="pt") as f:
+                    for key in f.keys():
+                        if "model.layers.61" in key:
+                            # skip MTP layer
+                            continue
+                        key_to_file_map[key] = file_path
+            except Exception as e:
+                print(f"Error reading Safetensor file {file_path}: {e}")
+
+    if not found_safetensor:
+        raise FileNotFoundError(f"No Safetensor files found in {folder_path}")
+
+    return key_to_file_map
+
+# TODO: list the key substrings whose tensors should be taken from the GGUF file
+# instead of the safetensors checkpoint (combine_tensor_sources matches them with `in`).
+tensor_from_gguf = []
+
+def translate_name(name:str)->str:
+    """
+    Translate a tensor name with ``translate_name_to_gguf`` and then apply the
+    expert-projection and bias renames listed below, in order.
+
+    :param name: name of the tensor
+    :return: translated name
+    """
+    renames = (
+        (".up_proj.", ".ffn_up_exps."),
+        (".down_proj.", ".ffn_down_exps."),
+        (".gate_proj.", ".ffn_gate_exps."),
+        (".ffn_gate_inp.e_score_correction_bias", ".exp_probs_b.bias"),
+    )
+    translated = translate_name_to_gguf(name)
+    for old, new in renames:
+        translated = translated.replace(old, new)
+    return translated
+    
+
+def combine_tensor_sources(safetensor_path:str, gguf_path:str):
+    """
+    Decide, for every tensor key, which file it will be read from.
+
+    :param safetensor_path: folder holding the safetensors checkpoint
+    :param gguf_path: path handed to ``GGUFLoader``
+    :return: ``(target_tensor_map, gguf_loader)`` where ``target_tensor_map``
+        maps a tensor key to the path of its source file
+    """
+    gguf_loader = GGUFLoader(gguf_path)
+    gguf_tensor_file_map = gguf_loader.tensor_file_map
+    safetensor_tensor_file_map = read_safetensor_keys_from_folder(safetensor_path)
+
+    target_tensor_map = {}
+    for key, safetensor_file in safetensor_tensor_file_map.items():
+        if ".mlp.experts." not in key:
+            # Non-expert tensors come from the safetensors file unless the key
+            # matches one of the substrings in tensor_from_gguf.
+            wants_gguf = any(target_key in key for target_key in tensor_from_gguf)
+            if wants_gguf:
+                target_tensor_map[key] = gguf_tensor_file_map[translate_name(key)]
+            else:
+                target_tensor_map[key] = safetensor_file
+            continue
+
+        # All expert tensors are taken from the GGUF file; their scale tensors
+        # are not carried over.
+        if '.weight_scale_inv' in key:
+            continue
+        # Keep the first five and the last two dot-separated components, which
+        # drops whatever sits in between (presumably the expert index).
+        parts = key.split('.')
+        merged_key = '.'.join(parts[:5] + parts[-2:])
+        target_tensor_map[merged_key] = gguf_tensor_file_map[translate_name(merged_key)]
+
+    return target_tensor_map, gguf_loader
+
+def _collect_shard_tensors(keys, target_tensor_map: dict, gguf_loader: GGUFLoader, safetensors_cache: dict) -> dict:
+    """Load every tensor named in ``keys`` and return the dict for one shard.
+
+    Tensors read from a GGUF file are stored undequantized, accompanied by a
+    ``<name>.ggml_type`` entry. ``safetensors_cache`` maps a file path to its
+    open handle and is filled on demand.
+
+    :raises ValueError: if a source file is neither ``.safetensors`` nor ``.gguf``
+    """
+    tensors = {}
+    for key in keys:
+        file_path = target_tensor_map[key]
+        output_key = translate_name(key)
+        ggml_type = None
+        if file_path.endswith('.safetensors'):
+            if file_path not in safetensors_cache:
+                safetensors_cache[file_path] = safe_open(file_path, framework='pt')
+            tensor = safetensors_cache[file_path].get_tensor(key)
+        elif file_path.endswith('.gguf'):
+            tensor, ggml_type = gguf_loader.get_undequanted_tensor_and_ggml_type(output_key)
+        else:
+            raise ValueError(f"Unsupported file format: {file_path}")
+        tensors[output_key] = tensor
+        # NOTE(review): a ggml_type of 0 is falsy and gets no marker here —
+        # confirm whether get_undequanted_tensor_and_ggml_type can return 0.
+        if ggml_type:
+            if output_key.endswith(".weight"):
+                ggml_key = output_key[:-7] + ".ggml_type"
+            else:
+                ggml_key = output_key + ".ggml_type"
+            tensors[ggml_key] = torch.tensor(ggml_type)
+    return tensors
+
+def write_combined_tensor(target_tensor_map: dict, output_path: str, gguf_loader: GGUFLoader):
+    """Write the merged tensors to ``output_path``, one safetensors shard per layer.
+
+    Keys without a ``.layers.<n>.`` component go to the first shard; every
+    layer then gets its own shard in ascending layer order.
+
+    :param target_tensor_map: tensor key -> path of the file it is read from
+    :param output_path: output directory, created if missing
+    :param gguf_loader: loader used for tensors that come from GGUF files
+    :raises ValueError: if there is nothing to save or a source file has an
+        unsupported extension
+    """
+    # Ensure output directory exists
+    os.makedirs(output_path, exist_ok=True)
+
+    # Cache for safetensor file handles
+    safetensors_cache = {}
+
+    # Group tensors by layer
+    layer_groups = defaultdict(list)
+    non_layer_keys = []
+    layer_pattern = re.compile(r'\.layers\.(\d+)\.')
+
+    for key in target_tensor_map:
+        match = layer_pattern.search(key)
+        if match:
+            layer_groups[int(match.group(1))].append(key)
+        else:
+            non_layer_keys.append(key)
+
+    # Test the real shard count: the "minus one" value used in the file names
+    # is 0 for a single shard and -1 for none, so it cannot serve as the
+    # emptiness check.
+    shard_count = len(layer_groups) + (1 if non_layer_keys else 0)
+    if shard_count == 0:
+        raise ValueError("No tensors to save")
+    # File names keep the existing "<index>-of-<last index>" numbering.
+    total_shards = shard_count - 1
+
+    shard_idx = 0
+
+    # Save non-layer tensors to the first shard if they exist
+    if non_layer_keys:
+        tensors = _collect_shard_tensors(non_layer_keys, target_tensor_map, gguf_loader, safetensors_cache)
+        output_file = os.path.join(output_path, f"model-{shard_idx:05}-of-{total_shards:05}.safetensors")
+        print(f"Saving non-layer tensors to {output_file}")
+        save_file(tensors, output_file)
+        print(tensors.keys())
+        shard_idx += 1
+
+    # Save each layer's tensors to subsequent shards
+    for layer_num in sorted(layer_groups.keys()):
+        tensors = _collect_shard_tensors(layer_groups[layer_num], target_tensor_map, gguf_loader, safetensors_cache)
+        output_file = os.path.join(output_path, f"model-{shard_idx:05}-of-{total_shards:05}.safetensors")
+        print(f"Saving layer {layer_num} to {output_file}")
+        save_file(tensors, output_file)
+        shard_idx += 1
+
+    return
+    
+def main():
+    """Parse the command-line arguments and write the merged checkpoint."""
+    # Build the command-line argument parser.
+    parser = argparse.ArgumentParser(description="Read parameters from Safetensor and GGUF files")
+    parser.add_argument("--safetensor_path", type=str, help="Path to the Safetensor file", default="/mnt/data/model/DeepSeek-V3")
+    parser.add_argument("--gguf_path", type=str, help="Path to the GGUF file", default="/mnt/data/model/DeepseekV3-q4km-gguf")
+    parser.add_argument("--output_path", type=str, help="Path to the output file", default="/mnt/data/model/ktrans-safetensors/DeepSeek-V3-q4km-fp8")
+
+    # Parse the arguments once and echo them.
+    print("All the arguments:")
+    args = parser.parse_args()
+    print(args)
+
+    target_tensor_map, gguf_loader = combine_tensor_sources(args.safetensor_path, args.gguf_path)
+    write_combined_tensor(target_tensor_map, args.output_path, gguf_loader)
+
+    return
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
--- a/requirements-local_chat.txt
+++ b/requirements-local_chat.txt
@@ -4,4 +4,6 @@ numpy
 torch>=2.3.0
 packaging
 cpufeature
-protobuf
\ No newline at end of file
+protobuf
+tiktoken
+blobfile
\ No newline at end of file
--- a/setup.py
+++ b/setup.py
 #!/usr/bin/env python
 # coding=utf-8
 '''
-Description  :  
+Description  :
 Author       : chenxl
 Date         : 2024-07-27 16:15:27
 Version      : 1.0.0
-LastEditors  : chenxl 
+LastEditors  : chenxl
 LastEditTime : 2024-08-14 16:36:19
 Adapted from:
 https://github.com/Dao-AILab/flash-attention/blob/v2.6.3/setup.py
 Copyright (c) 2023, Tri Dao.
-Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
+Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 '''

 import os
@@ -30,6 +30,11 @@ from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
 from setuptools import setup, Extension
 from cpufeature.extension import CPUFeature
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME
+try:
+    from torch_musa.utils.simple_porting import SimplePorting
+    from torch_musa.utils.musa_extension import BuildExtension, MUSAExtension, MUSA_HOME
+except ImportError:
+    MUSA_HOME=None

 class CpuInstructInfo:
    CPU_INSTRUCT = os.getenv("CPU_INSTRUCT", "NATIVE")
@@ -40,7 +45,7 @@ class CpuInstructInfo:
    CMAKE_FANCY = "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_AVX512=ON -DLLAMA_AVX512_FANCY_SIMD=ON"
    CMAKE_AVX512 = "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_AVX512=ON"
    CMAKE_AVX2 = "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON"
-    
+
 class VersionInfo:
    THIS_DIR = os.path.dirname(os.path.abspath(__file__))
    PACKAGE_NAME = "ktransformers"
@@ -49,6 +54,16 @@ class VersionInfo:
    )
    FORCE_BUILD = os.getenv("KTRANSFORMERS_FORCE_BUILD", "FALSE") == "TRUE"

+    def get_musa_bare_metal_version(self, musa_dir):
+        """Return the MUSA toolkit version as a ``<major><minor>`` string.
+
+        Runs ``<musa_dir>/bin/mcc -v`` with stderr merged into stdout and reads
+        the whitespace-separated token that follows the word "version".
+
+        Raises ``subprocess.CalledProcessError`` when ``mcc`` exits non-zero
+        (``check=True``) and ``ValueError`` when the output contains no
+        "version" token.
+        """
+        completed = subprocess.run(
+            [musa_dir + "/bin/mcc", "-v"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            check=True,
+        )
+        tokens = completed.stdout.decode("utf-8").split()
+        # The version is the token right after "version"; anything from the
+        # first comma onwards is discarded before parsing.
+        version_token = tokens[tokens.index("version") + 1]
+        parsed_version = parse(version_token.split(",")[0])
+        return f"{parsed_version.major}{parsed_version.minor}"
+
    def get_cuda_bare_metal_version(self, cuda_dir):
        raw_output = subprocess.check_output(
            [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
@@ -58,7 +73,7 @@ class VersionInfo:
        cuda_version = f"{bare_metal_version.major}{bare_metal_version.minor}"
        return cuda_version

-    def get_cuda_version_of_torch(self,):
+    def get_cuda_version_of_torch(self):
        torch_cuda_version = parse(torch.version.cuda)
        cuda_version = f"{torch_cuda_version.major}{torch_cuda_version.minor}"
        return cuda_version
@@ -117,7 +132,7 @@ class VersionInfo:
        torch_version_raw = parse(torch.__version__)
        torch_version = f"{torch_version_raw.major}{torch_version_raw.minor}"
        return torch_version
-    
+
    def get_flash_version(self,):
        version_file = os.path.join(
            Path(VersionInfo.THIS_DIR), VersionInfo.PACKAGE_NAME, "__init__.py")
@@ -128,12 +143,21 @@ class VersionInfo:
        return flash_version

    def get_package_version(self, full_version=False):
-        flash_version = self.get_flash_version()
-        package_version = f"{str(flash_version)}+cu{self.get_cuda_bare_metal_version(CUDA_HOME)}torch{self.get_torch_version()}{self.get_cpu_instruct()}"
+        # Returns the bare package version unless full_version is True or
+        # KTRANSFORMERS_FORCE_BUILD is "TRUE"; in those cases returns
+        # "<version>+<backend>torch<torch_version><cpu_instruct>".
+        flash_version = str(self.get_flash_version())
+        torch_version = self.get_torch_version()
+        cpu_instruct = self.get_cpu_instruct()
+        backend_version = ""
+        # CUDA is checked first, so it wins when both CUDA_HOME and MUSA_HOME
+        # are set. The tag is "cu<ver>" for CUDA and "mu<ver>" for MUSA.
+        if CUDA_HOME is not None:
+            backend_version = f"cu{self.get_cuda_bare_metal_version(CUDA_HOME)}"
+        elif MUSA_HOME is not None:
+            backend_version = f"mu{self.get_musa_bare_metal_version(MUSA_HOME)}"
+        else:
+            # The backend is resolved before any return, so this is raised even
+            # when only the bare version would have been returned.
+            raise ValueError("Unsupported backend: CUDA_HOME and MUSA_HOME are not set.")
+        package_version = f"{flash_version}+{backend_version}torch{torch_version}{cpu_instruct}"
        if full_version:
            return package_version
        if not VersionInfo.FORCE_BUILD:
-            return str(flash_version)
+            return flash_version
        return package_version


@@ -218,11 +242,19 @@ class CMakeBuild(BuildExtension):
            f"-DPYTHON_EXECUTABLE={sys.executable}",
            f"-DCMAKE_BUILD_TYPE={cfg}",  # not used on MSVC, but no harm
        ]
+
+        if CUDA_HOME is not None:
+            cmake_args += ["-DKTRANSFORMERS_USE_CUDA=ON"]
+        elif MUSA_HOME is not None:
+            cmake_args += ["-DKTRANSFORMERS_USE_MUSA=ON"]
+        else:
+            raise ValueError("Unsupported backend: CUDA_HOME and MUSA_HOME are not set.")
+
        build_args = []
        if "CMAKE_ARGS" in os.environ:
            cmake_args += [
                item for item in os.environ["CMAKE_ARGS"].split(" ") if item]
-            
+
        if CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.FANCY:
            cpu_args = CpuInstructInfo.CMAKE_FANCY
        elif CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.AVX512:
@@ -231,7 +263,7 @@ class CMakeBuild(BuildExtension):
            cpu_args = CpuInstructInfo.CMAKE_AVX2
        else:
            cpu_args = CpuInstructInfo.CMAKE_NATIVE
-        
+
        cmake_args += [
            item for item in cpu_args.split(" ") if item
        ]
@@ -258,7 +290,7 @@ class CMakeBuild(BuildExtension):

            # CMake allows an arch-in-generator style for backward compatibility
            contains_arch = any(x in cmake_generator for x in {"ARM", "Win64"})
-            if not single_config and not contains_arch:
+            if not single_config and not contains_arch and cmake_generator:
                cmake_args += ["-A", PLAT_TO_CMAKE[self.plat_name]]

            # Multi-config generators have a different way to specify configs
@@ -276,8 +308,13 @@ class CMakeBuild(BuildExtension):
                    "-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))]

        if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
+            cpu_count = os.cpu_count()
+            if cpu_count is None:
+                cpu_count = 1
            if hasattr(self, "parallel") and self.parallel:
-                build_args += [f"-j{self.parallel}"]
+                build_args += [f"--parallel={self.parallel}"]
+            else:
+                build_args += [f"--parallel={cpu_count}"]
        print("CMake args:", cmake_args)
        build_temp = Path(ext.sourcedir) / "build"
        if not build_temp.exists():
@@ -288,28 +325,56 @@ class CMakeBuild(BuildExtension):
        print("Standard output:", result.stdout)
        print("Standard error:", result.stderr)
        subprocess.run(
-            ["cmake", "--build", ".", *build_args], cwd=build_temp, check=True
+            ["cmake", "--build", ".", "--verbose", *build_args], cwd=build_temp, check=True
        )

+if CUDA_HOME is not None:
+    ops_module = CUDAExtension('KTransformersOps', [
+        'ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu',
+        'ktransformers/ktransformers_ext/cuda/binding.cpp',
+        'ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu'
+    ],
+    extra_compile_args={
+            'cxx': ['-O3', '-DKTRANSFORMERS_USE_CUDA'],
+            'nvcc': [
+                '-O3',
+                '--use_fast_math',
+                '-Xcompiler', '-fPIC',
+                '-DKTRANSFORMERS_USE_CUDA',
+            ]
+        }
+    )
+elif MUSA_HOME is not None:
+    SimplePorting(cuda_dir_path="ktransformers/ktransformers_ext/cuda", mapping_rule={
+        # Common rules
+        "at::cuda": "at::musa",
+        "#include <ATen/cuda/CUDAContext.h>": "#include \"torch_musa/csrc/aten/musa/MUSAContext.h\"",
+        "#include <c10/cuda/CUDAGuard.h>": "#include \"torch_musa/csrc/core/MUSAGuard.h\"",
+        "nv_bfloat16": "mt_bfloat16",
+        }).run()
+    ops_module = MUSAExtension('KTransformersOps', [
+        'ktransformers/ktransformers_ext/cuda_musa/custom_gguf/dequant.mu',
+        'ktransformers/ktransformers_ext/cuda_musa/binding.cpp',
+        # TODO: Add Marlin support for MUSA.
+        # 'ktransformers/ktransformers_ext/cuda_musa/gptq_marlin/gptq_marlin.mu'
+    ],
+    extra_compile_args={
+            'cxx': ['force_mcc'],
+            'mcc': [
+                '-O3',
+                '-DKTRANSFORMERS_USE_MUSA',
+                '-DTHRUST_IGNORE_CUB_VERSION_CHECK',
+            ]
+        }
+    )
+else:
+    raise ValueError("Unsupported backend: CUDA_HOME and MUSA_HOME are not set.")

 setup(
    version=VersionInfo().get_package_version(),
    cmdclass={"bdist_wheel":BuildWheelsCommand ,"build_ext": CMakeBuild},
    ext_modules=[
        CMakeExtension("cpuinfer_ext"),
-        CUDAExtension('KTransformersOps', [
-            'ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu',
-            'ktransformers/ktransformers_ext/cuda/binding.cpp',
-            'ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu'
-        ],
-        extra_compile_args={
-                'cxx': ['-O3'],
-                'nvcc': [
-                    '-O3',
-                    '--use_fast_math',
-                    '-Xcompiler', '-fPIC',
-                ]
-            }
-        )
+        ops_module,
    ]
 )
--- a/test_prompt.txt
+++ b/test_prompt.txt
-Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense. Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere. The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn't think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursley's sister, but they hadn't met for several years; in fact, Mrs. Dursley pretended she didn't have a sister, because her sister and her good-for-nothing husband were as unDursleyish as it was possible to be. The Dursleys shuddered to think what the neighbors would say if the Potters arrived in the street. The Dursleys knew that the Potters had a small son, too, but they had never even seen him. This boy was another good reason for keeping the Potters away; they didn't want Dudley mixing with a child like that.When Mr. and Mrs. Dursley woke up on the dull, gray Tuesday our story starts, there was nothing about the cloudy sky outside to suggest that strange and mysterious things would soon be happening all over the country. Mr. Dursley hummed as he picked out his most boring tie for work, and Mrs. Dursley gossiped away happily as she wrestled a screaming Dudley into his high chair. None of them noticed a large, tawny owl flutter past the window. At half past eight, Mr. Dursley picked up his briefcase, pecked Mrs. 
Dursley on the cheek, and tried to kiss Dudley good-bye but missed, because Dudley was now having a tantrum and throwing his cereal at the walls. “Little tyke,” chortled Mr. Dursley as he left the house. He got into his car and backed out of number four's drive. It was on the corner of the street that he noticed the first sign of something peculiar — a cat reading a map. For a second, Mr. Dursley didn't realize what he had seen — then he jerked his head around to look again. There was a tabby cat standing on the corner of Privet Drive, but there wasn't a map in sight. What could he have been thinking of? It must have been a trick of the light. Mr. Dursley blinked and stared at the cat. It stared back. As Mr. Dursley drove around the corner and up the road, he watched the cat in his mirror. It was now reading the sign that said Privet Drive — no, looking at the sign; cats couldn't read maps or signs. Mr. Dursley gave himself a little shake and put the cat out of his mind. As he drove toward town he thought of nothing except a large order of drills he was hoping to get that day. But on the edge of town, drills were driven out of his mind by something else. As he sat in the usual morning traffic jam, he couldn't help noticing that there seemed to be a lot of strangely dressed people about. People in cloaks. Mr. Dursley couldn't bear people who dressed in funny clothes — the getups you saw on young people! He supposed this was some stupid new fashion. He drummed his fingers on the steering wheel and his eyes fell on a huddle of these weirdos standing quite close by. They were whispering excitedly together. Mr. Dursley was enraged to see that a couple of them weren't young at all; why, that man had to be older than he was, and wearing an emerald-green cloak! The nerve of him! But then it struck Mr. Dursley that this was probably some silly stunt — these people were obviously collecting for something… yes, that would be it. 
The traffic moved on and a few minutes later, Mr. Dursley arrived in the Grunnings parking lot, his mind back on drills.
-Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense. Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere. The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn't think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursley's sister, but they hadn't met for several years; in fact, Mrs. Dursley pretended she didn't have a sister, because her sister and her good-for-nothing husband were as unDursleyish as it was possible to be. The Dursleys shuddered to think what the neighbors would say if the Potters arrived in the street. The Dursleys knew that the Potters had a small son, too, but they had never even seen him. This boy was another good reason for keeping the Potters away; they didn't want Dudley mixing with a child like that.When Mr. and Mrs. Dursley woke up on the dull, gray Tuesday our story starts, there was nothing about the cloudy sky outside to suggest that strange and mysterious things would soon be happening all over the country. Mr. Dursley hummed as he picked out his most boring tie for work, and Mrs. Dursley gossiped away happily as she wrestled a screaming Dudley into his high chair. None of them noticed a large, tawny owl flutter past the window. At half past eight, Mr. Dursley picked up his briefcase, pecked Mrs. 
Dursley on the cheek, and tried to kiss Dudley good-bye but missed, because Dudley was now having a tantrum and throwing his cereal at the walls. “Little tyke,” chortled Mr. Dursley as he left the house. He got into his car and backed out of number four's drive. It was on the corner of the street that he noticed the first sign of something peculiar — a cat reading a map. For a second, Mr. Dursley didn't realize what he had seen — then he jerked his head around to look again. There was a tabby cat standing on the corner of Privet Drive, but there wasn't a map in sight. What could he have been thinking of? It must have been a trick of the light. Mr. Dursley blinked and stared at the cat. It stared back. As Mr. Dursley drove around the corner and up the road, he watched the cat in his mirror. It was now reading the sign that said Privet Drive — no, looking at the sign; cats couldn't read maps or signs. Mr. Dursley gave himself a little shake and put the cat out of his mind. As he drove toward town he thought of nothing except a large order of drills he was hoping to get that day. But on the edge of town, drills were driven out of his mind by something else. As he sat in the usual morning traffic jam, he couldn't help noticing that there seemed to be a lot of strangely dressed people about. People in cloaks. Mr. Dursley couldn't bear people who dressed in funny clothes — the getups you saw on young people! He supposed this was some stupid new fashion. He drummed his fingers on the steering wheel and his eyes fell on a huddle of these weirdos standing quite close by. They were whispering excitedly together. Mr. Dursley was enraged to see that a couple of them weren't young at all; why, that man had to be older than he was, and wearing an emerald-green cloak! The nerve of him! But then it struck Mr. Dursley that this was probably some silly stunt — these people were obviously collecting for something… yes, that would be it. 
The traffic moved on and a few minutes later, Mr. Dursley arrived in the Grunnings parking lot, his mind back on drills.
-Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense. Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere. The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn't think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursley's sister, but they hadn't met for several years; in fact, Mrs. Dursley pretended she didn't have a sister, because her sister and her good-for-nothing husband were as unDursleyish as it was possible to be. The Dursleys shuddered to think what the neighbors would say if the Potters arrived in the street. The Dursleys knew that the Potters had a small son, too, but they had never even seen him. This boy was another good reason for keeping the Potters away; they didn't want Dudley mixing with a child like that.When Mr. and Mrs. Dursley woke up on the dull, gray Tuesday our story starts, there was nothing about the cloudy sky outside to suggest that strange and mysterious things would soon be happening all over the country. Mr. Dursley hummed as he picked out his most boring tie for work, and Mrs. Dursley gossiped away happily as she wrestled a screaming Dudley into his high chair. None of them noticed a large, tawny owl flutter past the window. At half past eight, Mr. Dursley picked up his briefcase, pecked Mrs. 
Dursley on the cheek, and tried to kiss Dudley good-bye but missed, because Dudley was now having a tantrum and throwing his cereal at the walls. “Little tyke,” chortled Mr. Dursley as he left the house. He got into his car and backed out of number four's drive. It was on the corner of the street that he noticed the first sign of something peculiar — a cat reading a map. For a second, Mr. Dursley didn't realize what he had seen — then he jerked his head around to look again. There was a tabby cat standing on the corner of Privet Drive, but there wasn't a map in sight. What could he have been thinking of? It must have been a trick of the light. Mr. Dursley blinked and stared at the cat. It stared back. As Mr. Dursley drove around the corner and up the road, he watched the cat in his mirror. It was now reading the sign that said Privet Drive — no, looking at the sign; cats couldn't read maps or signs. Mr. Dursley gave himself a little shake and put the cat out of his mind. As he drove toward town he thought of nothing except a large order of drills he was hoping to get that day. But on the edge of town, drills were driven out of his mind by something else. As he sat in the usual morning traffic jam, he couldn't help noticing that there seemed to be a lot of strangely dressed people about. People in cloaks. Mr. Dursley couldn't bear people who dressed in funny clothes — the getups you saw on young people! He supposed this was some stupid new fashion. He drummed his fingers on the steering wheel and his eyes fell on a huddle of these weirdos standing quite close by. They were whispering excitedly together. Mr. Dursley was enraged to see that a couple of them weren't young at all; why, that man had to be older than he was, and wearing an emerald-green cloak! The nerve of him! But then it struck Mr. Dursley that this was probably some silly stunt — these people were obviously collecting for something… yes, that would be it. 
The traffic moved on and a few minutes later, Mr. Dursley arrived in the Grunnings parking lot, his mind back on drills.
-Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense. Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere. The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn't think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursley's sister, but they hadn't met for several years; in fact, Mrs. Dursley pretended she didn't have a sister, because her sister and her good-for-nothing husband were as unDursleyish as it was possible to be. The Dursleys shuddered to think what the neighbors would say if the Potters arrived in the street. The Dursleys knew that the Potters had a small son, too, but they had never even seen him. This boy was another good reason for keeping the Potters away; they didn't want Dudley mixing with a child like that.When Mr. and Mrs. Dursley woke up on the dull, gray Tuesday our story starts, there was nothing about the cloudy sky outside to suggest that strange and mysterious things would soon be happening all over the country. Mr. Dursley hummed as he picked out his most boring tie for work, and Mrs. Dursley gossiped away happily as she wrestled a screaming Dudley into his high chair. None of them noticed a large, tawny owl flutter past the window. At half past eight, Mr. Dursley picked up his briefcase, pecked Mrs. 
Dursley on the cheek, and tried to kiss Dudley good-bye but missed, because Dudley was now having a tantrum and throwing his cereal at the walls. “Little tyke,” chortled Mr. Dursley as he left the house. He got into his car and backed out of number four's drive. It was on the corner of the street that he noticed the first sign of something peculiar — a cat reading a map. For a second, Mr. Dursley didn't realize what he had seen — then he jerked his head around to look again. There was a tabby cat standing on the corner of Privet Drive, but there wasn't a map in sight. What could he have been thinking of? It must have been a trick of the light. Mr. Dursley blinked and stared at the cat. It stared back. As Mr. Dursley drove around the corner and up the road, he watched the cat in his mirror. It was now reading the sign that said Privet Drive — no, looking at the sign; cats couldn't read maps or signs. Mr. Dursley gave himself a little shake and put the cat out of his mind. As he drove toward town he thought of nothing except a large order of drills he was hoping to get that day. But on the edge of town, drills were driven out of his mind by something else. As he sat in the usual morning traffic jam, he couldn't help noticing that there seemed to be a lot of strangely dressed people about. People in cloaks. Mr. Dursley couldn't bear people who dressed in funny clothes — the getups you saw on young people! He supposed this was some stupid new fashion. He drummed his fingers on the steering wheel and his eyes fell on a huddle of these weirdos standing quite close by. They were whispering excitedly together. Mr. Dursley was enraged to see that a couple of them weren't young at all; why, that man had to be older than he was, and wearing an emerald-green cloak! The nerve of him! But then it struck Mr. Dursley that this was probably some silly stunt — these people were obviously collecting for something… yes, that would be it. 
The traffic moved on and a few minutes later, Mr. Dursley arrived in the Grunnings parking lot, his mind back on drills.
-Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense. Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere. The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn't think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursley's sister, but they hadn't met for several years; in fact, Mrs. Dursley pretended she didn't have a sister, because her sister and her good-for-nothing husband were as unDursleyish as it was possible to be. The Dursleys shuddered to think what the neighbors would say if the Potters arrived in the street. The Dursleys knew that the Potters had a small son, too, but they had never even seen him. This boy was another good reason for keeping the Potters away; they didn't want Dudley mixing with a child like that.When Mr. and Mrs. Dursley woke up on the dull, gray Tuesday our story starts, there was nothing about the cloudy sky outside to suggest that strange and mysterious things would soon be happening all over the country. Mr. Dursley hummed as he picked out his most boring tie for work, and Mrs. Dursley gossiped away happily as she wrestled a screaming Dudley into his high chair. None of them noticed a large, tawny owl flutter past the window. At half past eight, Mr. Dursley picked up his briefcase, pecked Mrs. 
Dursley on the cheek, and tried to kiss Dudley good-bye but missed, because Dudley was now having a tantrum and throwing his cereal at the walls. “Little tyke,” chortled Mr. Dursley as he left the house. He got into his car and backed out of number four's drive. It was on the corner of the street that he noticed the first sign of something peculiar — a cat reading a map. For a second, Mr. Dursley didn't realize what he had seen — then he jerked his head around to look again. There was a tabby cat standing on the corner of Privet Drive, but there wasn't a map in sight. What could he have been thinking of? It must have been a trick of the light. Mr. Dursley blinked and stared at the cat. It stared back. As Mr. Dursley drove around the corner and up the road, he watched the cat in his mirror. It was now reading the sign that said Privet Drive — no, looking at the sign; cats couldn't read maps or signs. Mr. Dursley gave himself a little shake and put the cat out of his mind. As he drove toward town he thought of nothing except a large order of drills he was hoping to get that day. But on the edge of town, drills were driven out of his mind by something else. As he sat in the usual morning traffic jam, he couldn't help noticing that there seemed to be a lot of strangely dressed people about. People in cloaks. Mr. Dursley couldn't bear people who dressed in funny clothes — the getups you saw on young people! He supposed this was some stupid new fashion. He drummed his fingers on the steering wheel and his eyes fell on a huddle of these weirdos standing quite close by. They were whispering excitedly together. Mr. Dursley was enraged to see that a couple of them weren't young at all; why, that man had to be older than he was, and wearing an emerald-green cloak! The nerve of him! But then it struck Mr. Dursley that this was probably some silly stunt — these people were obviously collecting for something… yes, that would be it. 
The traffic moved on and a few minutes later, Mr. Dursley arrived in the Grunnings parking lot, his mind back on drills.
-Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense. Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere. The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn't think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursley's sister, but they hadn't met for several years; in fact, Mrs. Dursley pretended she didn't have a sister, because her sister and her good-for-nothing husband were as unDursleyish as it was possible to be. The Dursleys shuddered to think what the neighbors would say if the Potters arrived in the street. The Dursleys knew that the Potters had a small son, too, but they had never even seen him. This boy was another good reason for keeping the Potters away; they didn't want Dudley mixing with a child like that.When Mr. and Mrs. Dursley woke up on the dull, gray Tuesday our story starts, there was nothing about the cloudy sky outside to suggest that strange and mysterious things would soon be happening all over the country. Mr. Dursley hummed as he picked out his most boring tie for work, and Mrs. Dursley gossiped away happily as she wrestled a screaming Dudley into his high chair. None of them noticed a large, tawny owl flutter past the window. At half past eight, Mr. Dursley picked up his briefcase, pecked Mrs. 
Dursley on the cheek, and tried to kiss Dudley good-bye but missed, because Dudley was now having a tantrum and throwing his cereal at the walls. “Little tyke,” chortled Mr. Dursley as he left the house. He got into his car and backed out of number four's drive. It was on the corner of the street that he noticed the first sign of something peculiar — a cat reading a map. For a second, Mr. Dursley didn't realize what he had seen — then he jerked his head around to look again. There was a tabby cat standing on the corner of Privet Drive, but there wasn't a map in sight. What could he have been thinking of? It must have been a trick of the light. Mr. Dursley blinked and stared at the cat. It stared back. As Mr. Dursley drove around the corner and up the road, he watched the cat in his mirror. It was now reading the sign that said Privet Drive — no, looking at the sign; cats couldn't read maps or signs. Mr. Dursley gave himself a little shake and put the cat out of his mind. As he drove toward town he thought of nothing except a large order of drills he was hoping to get that day. But on the edge of town, drills were driven out of his mind by something else. As he sat in the usual morning traffic jam, he couldn't help noticing that there seemed to be a lot of strangely dressed people about. People in cloaks. Mr. Dursley couldn't bear people who dressed in funny clothes — the getups you saw on young people! He supposed this was some stupid new fashion. He drummed his fingers on the steering wheel and his eyes fell on a huddle of these weirdos standing quite close by. They were whispering excitedly together. Mr. Dursley was enraged to see that a couple of them weren't young at all; why, that man had to be older than he was, and wearing an emerald-green cloak! The nerve of him! But then it struck Mr. Dursley that this was probably some silly stunt — these people were obviously collecting for something… yes, that would be it. 
The traffic moved on and a few minutes later, Mr. Dursley arrived in the Grunnings parking lot, his mind back on drills.
-Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense. Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere. The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn't think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursley's sister, but they hadn't met for several years; in fact, Mrs. Dursley pretended she didn't have a sister, because her sister and her good-for-nothing husband were as unDursleyish as it was possible to be. The Dursleys shuddered to think what the neighbors would say if the Potters arrived in the street. The Dursleys knew that the Potters had a small son, too, but they had never even seen him. This boy was another good reason for keeping the Potters away; they didn't want Dudley mixing with a child like that.When Mr. and Mrs. Dursley woke up on the dull, gray Tuesday our story starts, there was nothing about the cloudy sky outside to suggest that strange and mysterious things would soon be happening all over the country. Mr. Dursley hummed as he picked out his most boring tie for work, and Mrs. Dursley gossiped away happily as she wrestled a screaming Dudley into his high chair. None of them noticed a large, tawny owl flutter past the window. At half past eight, Mr. Dursley picked up his briefcase, pecked Mrs. 
Dursley on the cheek, and tried to kiss Dudley good-bye but missed, because Dudley was now having a tantrum and throwing his cereal at the walls. “Little tyke,” chortled Mr. Dursley as he left the house. He got into his car and backed out of number four's drive. It was on the corner of the street that he noticed the first sign of something peculiar — a cat reading a map. For a second, Mr. Dursley didn't realize what he had seen — then he jerked his head around to look again. There was a tabby cat standing on the corner of Privet Drive, but there wasn't a map in sight. What could he have been thinking of? It must have been a trick of the light. Mr. Dursley blinked and stared at the cat. It stared back. As Mr. Dursley drove around the corner and up the road, he watched the cat in his mirror. It was now reading the sign that said Privet Drive — no, looking at the sign; cats couldn't read maps or signs. Mr. Dursley gave himself a little shake and put the cat out of his mind. As he drove toward town he thought of nothing except a large order of drills he was hoping to get that day. But on the edge of town, drills were driven out of his mind by something else. As he sat in the usual morning traffic jam, he couldn't help noticing that there seemed to be a lot of strangely dressed people about. People in cloaks. Mr. Dursley couldn't bear people who dressed in funny clothes — the getups you saw on young people! He supposed this was some stupid new fashion. He drummed his fingers on the steering wheel and his eyes fell on a huddle of these weirdos standing quite close by. They were whispering excitedly together. Mr. Dursley was enraged to see that a couple of them weren't young at all; why, that man had to be older than he was, and wearing an emerald-green cloak! The nerve of him! But then it struck Mr. Dursley that this was probably some silly stunt — these people were obviously collecting for something… yes, that would be it. 
The traffic moved on and a few minutes later, Mr. Dursley arrived in the Grunnings parking lot, his mind back on drills.
-Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense. Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere. The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn't think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursley's sister, but they hadn't met for several years; in fact, Mrs. Dursley pretended she didn't have a sister, because her sister and her good-for-nothing husband were as unDursleyish as it was possible to be. The Dursleys shuddered to think what the neighbors would say if the Potters arrived in the street. The Dursleys knew that the Potters had a small son, too, but they had never even seen him. This boy was another good reason for keeping the Potters away; they didn't want Dudley mixing with a child like that.When Mr. and Mrs. Dursley woke up on the dull, gray Tuesday our story starts, there was nothing about the cloudy sky outside to suggest that strange and mysterious things would soon be happening all over the country. Mr. Dursley hummed as he picked out his most boring tie for work, and Mrs. Dursley gossiped away happily as she wrestled a screaming Dudley into his high chair. None of them noticed a large, tawny owl flutter past the window. At half past eight, Mr. Dursley picked up his briefcase, pecked Mrs. 
Dursley on the cheek, and tried to kiss Dudley good-bye but missed, because Dudley was now having a tantrum and throwing his cereal at the walls. “Little tyke,” chortled Mr. Dursley as he left the house. He got into his car and backed out of number four's drive. It was on the corner of the street that he noticed the first sign of something peculiar — a cat reading a map. For a second, Mr. Dursley didn't realize what he had seen — then he jerked his head around to look again. There was a tabby cat standing on the corner of Privet Drive, but there wasn't a map in sight. What could he have been thinking of? It must have been a trick of the light. Mr. Dursley blinked and stared at the cat. It stared back. As Mr. Dursley drove around the corner and up the road, he watched the cat in his mirror. It was now reading the sign that said Privet Drive — no, looking at the sign; cats couldn't read maps or signs. Mr. Dursley gave himself a little shake and put the cat out of his mind. As he drove toward town he thought of nothing except a large order of drills he was hoping to get that day. But on the edge of town, drills were driven out of his mind by something else. As he sat in the usual morning traffic jam, he couldn't help noticing that there seemed to be a lot of strangely dressed people about. People in cloaks. Mr. Dursley couldn't bear people who dressed in funny clothes — the getups you saw on young people! He supposed this was some stupid new fashion. He drummed his fingers on the steering wheel and his eyes fell on a huddle of these weirdos standing quite close by. They were whispering excitedly together. Mr. Dursley was enraged to see that a couple of them weren't young at all; why, that man had to be older than he was, and wearing an emerald-green cloak! The nerve of him! But then it struck Mr. Dursley that this was probably some silly stunt — these people were obviously collecting for something… yes, that would be it. 
The traffic moved on and a few minutes later, Mr. Dursley arrived in the Grunnings parking lot, his mind back on drills.
-阅读以上文字，并概括大意