OpenDAS / ktransformers · Commits · 52fa671c

Unverified commit 52fa671c, authored Mar 26, 2025 by Yuhao Tsui, committed by GitHub on Mar 26, 2025.

    Merge branch 'kvcache-ai:main' into main

Parents: e5694f91, f142f4df

Changes: 52 files in the commit; this page shows 12 changed files with 580 additions and 27 deletions (+580 / -27).
Files shown on this page:

    ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml                        +4   -4
    ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-8.yaml                        +8   -8
    ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-fp8-linear-ggml-experts.yaml  +2   -2
    ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-marlin.yaml                   +2   -2
    ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml                          +2   -2
    ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml                                    +1   -1
    ktransformers/optimize/optimize_rules/rocm/DeepSeek-V3-Chat.yaml (new)                         +76  -0
    ktransformers/tests/score.py (new)                                                             +137 -0
    ktransformers/tests/test_pytorch_q8.py (new)                                                   +46  -0
    ktransformers/util/vendors.py (new)                                                            +202 -0
    setup.py                                                                                       +77  -6
    third_party/llamafile/iqk_mul_mat.inc                                                          +23  -2
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml  (view file @ 52fa671c)

@@ -147,7 +147,7 @@
     name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
@@ -157,7 +157,7 @@
     name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
@@ -167,7 +167,7 @@
     name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:2"
       prefill_device: "cuda:2"
@@ -177,7 +177,7 @@
     name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:3"
       prefill_device: "cuda:3"
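Every hunk in this file makes the same substitution: the MoE gate operator becomes ktransformers.operators.gate.KMoEGateDeepSeekV3, while the layer-to-GPU split is carried entirely by the name regex. Those ranges are ordinary Python-style regular expressions over module names; a minimal sketch (standalone re usage, not the project's injection code) of how the first rule's pattern selects layers 0-14 for cuda:0:

import re

# Hypothetical illustration: the first rule above routes gate modules of layers 0-14 to cuda:0.
pattern = re.compile(r"^model\.layers\.([0-9]|1[0-4])\.mlp\.gate$")

names = [f"model.layers.{i}.mlp.gate" for i in (0, 9, 14, 15, 30)]
for name in names:
    device = "cuda:0" if pattern.match(name) else "handled by a later rule"
    print(f"{name:30s} -> {device}")
# Layers 0, 9 and 14 match the first rule; 15 and 30 fall through to the later ranges.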
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-8.yaml  (view file @ 52fa671c)

@@ -278,7 +278,7 @@
     name: "^model\\.layers\\.([0-7])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
@@ -288,7 +288,7 @@
     name: "^model\\.layers\\.(8|9|1[0-5])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
@@ -298,7 +298,7 @@
     name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:2"
       prefill_device: "cuda:2"
@@ -308,7 +308,7 @@
     name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:3"
       prefill_device: "cuda:3"
@@ -318,7 +318,7 @@
     name: "^model\\.layers\\.(3[2-9])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:4"
       prefill_device: "cuda:4"
@@ -328,7 +328,7 @@
     name: "^model\\.layers\\.(4[0-7])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:5"
       prefill_device: "cuda:5"
@@ -338,7 +338,7 @@
     name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:6"
       prefill_device: "cuda:6"
@@ -348,7 +348,7 @@
     name: "^model\\.layers\\.(5[6-9]|60)\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:7"
       prefill_device: "cuda:7"
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-fp8-linear-ggml-experts.yaml  (view file @ 52fa671c)

@@ -66,7 +66,7 @@
     name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
@@ -74,7 +74,7 @@
     name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate # mlp module with custom forward function
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3 # mlp module with custom forward function
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-marlin.yaml  (view file @ 52fa671c)

@@ -66,7 +66,7 @@
     name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
@@ -74,7 +74,7 @@
     name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate # mlp module with custom forward function
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3 # mlp module with custom forward function
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml  (view file @ 52fa671c)

@@ -66,7 +66,7 @@
     name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
@@ -74,7 +74,7 @@
     name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate # mlp module with custom forward function
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3 # mlp module with custom forward function
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml  (view file @ 52fa671c)

@@ -38,7 +38,7 @@
 - match:
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
ktransformers/optimize/optimize_rules/rocm/DeepSeek-V3-Chat.yaml  (new file, 0 → 100644, view file @ 52fa671c)

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^lm_head$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cpu"
      prefill_device: "cuda"
      generate_op: "KLinearCPUInfer"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearQ8"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      absorb_for_prefill: False  # change this to True to enable long context(prefill may slower).
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
(no newline at end of file)
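This new ROCm rule file has the same shape as the CUDA rule files above: a list of match/replace entries keyed by a name regex and/or a class. A minimal sketch (assuming PyYAML is available; it ignores the class constraint and is not the actual ktransformers injection logic) of reading such a file and checking which name-based rule a module would hit:

import re
import yaml  # PyYAML, assumed installed

# Hypothetical path; any of the optimize_rules files above has the same structure.
with open("ktransformers/optimize/optimize_rules/rocm/DeepSeek-V3-Chat.yaml") as f:
    rules = yaml.safe_load(f)

# Example module names one might find in a DeepSeek-V3 model.
module_names = ["lm_head", "model.layers.3.mlp", "model.layers.3.self_attn.kv_b_proj"]

for name in module_names:
    for rule in rules:
        name_pattern = rule.get("match", {}).get("name")
        if name_pattern and re.match(name_pattern, name):
            print(f"{name} -> {rule['replace']['class']}")
            break
    else:
        print(f"{name} -> no name-based rule (class-only rules not checked in this sketch)")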
ktransformers/tests/score.py  (new file, 0 → 100644, view file @ 52fa671c)

import subprocess
import time
import requests
import sys
import os


def wait_for_server(base_url: str, timeout: int = None) -> None:
    start_time = time.time()
    while True:
        try:
            response = requests.get(
                f"{base_url}/v1/models",
                headers={"Authorization": "Bearer None"},
            )
            if response.status_code == 200:
                print("Server is ready.")
                break
        except requests.exceptions.RequestException:
            time.sleep(1)
        if timeout and time.time() - start_time > timeout:
            raise TimeoutError("Server did not become ready within timeout period")


server_cmd = [
    "numactl", "-N", "1", "-m", "1",
    "/home/qujing3/anaconda3/envs/ktransformers-dev/bin/ktransformers",
    "--model_path", "/home/qujing3/models/DeepSeek-R1-Q4_K_M/config",
    "--gguf_path", "/home/qujing3/models/DeepSeek-V3-GGUF/DeepSeek-V3-Q4_K_M",
    "--port", "10002",
    "--cpu_infer", "48",
    "--optimize_config_path", "ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml",
    "--max_new_tokens", "3000",
    "--cache_lens", "6000",
]

print("Starting ktransformers server...")
print(" ".join(server_cmd))

with open("/tmp/server_log.txt", "w") as f:
    server_process = subprocess.Popen(server_cmd, stdout=f, stderr=f, text=True)

try:
    wait_for_server("http://localhost:10002", timeout=600)

    eval_cmd = ["python", "ktransformers/tests/humaneval/eval_api.py"]
    print("Running eval_api.py...")
    print(f"Command: {' '.join(eval_cmd)}")

    env = os.environ.copy()
    env["PYTHONUNBUFFERED"] = "1"

    eval_process = subprocess.Popen(
        eval_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        bufsize=1,
        env=env,
        universal_newlines=True,
    )

    import threading
    import queue

    def enqueue_output(out, queue):
        for line in iter(out.readline, ''):
            queue.put(line)
        out.close()

    stdout_queue = queue.Queue()
    stderr_queue = queue.Queue()

    stdout_thread = threading.Thread(target=enqueue_output, args=(eval_process.stdout, stdout_queue))
    stderr_thread = threading.Thread(target=enqueue_output, args=(eval_process.stderr, stderr_queue))
    stdout_thread.daemon = True
    stderr_thread.daemon = True
    stdout_thread.start()
    stderr_thread.start()

    while eval_process.poll() is None:
        try:
            line = stdout_queue.get_nowait()
            print(line, end='', flush=True)
        except queue.Empty:
            pass
        try:
            line = stderr_queue.get_nowait()
            print(line, end='', file=sys.stderr, flush=True)
        except queue.Empty:
            pass
        time.sleep(1)

    while not stdout_queue.empty():
        print(stdout_queue.get(), end='', flush=True)
    while not stderr_queue.empty():
        print(stderr_queue.get(), end='', file=sys.stderr, flush=True)

    eval_process.wait()
    print(f"eval_api.py completed with exit code: {eval_process.returncode}")

    evaluate_cmd = ["evaluate_functional_correctness", "ktransformers/tests/humaneval/results/api/eval_b.jsonl"]
    print("Running evaluate_functional_correctness...")
    print(f"Command: {' '.join(evaluate_cmd)}")

    evaluate_process = subprocess.Popen(
        evaluate_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        bufsize=1,
        universal_newlines=True,
    )

    for line in evaluate_process.stdout:
        print(line, end='', flush=True)
    for line in evaluate_process.stderr:
        print(line, end='', file=sys.stderr, flush=True)

    evaluate_process.wait()
    print(f"evaluate_functional_correctness completed with exit code: {evaluate_process.returncode}")

    if evaluate_process.returncode != 0:
        print(f"evaluate_functional_correctness exited with code {evaluate_process.returncode}")
        sys.exit(evaluate_process.returncode)
finally:
    print("Stopping ktransformers server...")
    server_process.terminate()
    try:
        server_process.wait(timeout=30)
    except subprocess.TimeoutExpired:
        print("Server did not terminate gracefully, forcing...")
        server_process.kill()
(no newline at end of file)
ktransformers/tests/test_pytorch_q8.py  (new file, 0 → 100644, view file @ 52fa671c; comments translated from Chinese)

import torch

# Define a floating-point model containing a linear layer
class LinearModel(torch.nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.linear = torch.nn.Linear(in_features, out_features)

    def forward(self, x):
        return self.linear(x)

# Create the floating-point model instance
in_features = 64
out_features = 128
model_fp32 = LinearModel(in_features, out_features)

# Create the quantized model instance
model_int8 = torch.ao.quantization.quantize_dynamic(
    model_fp32,              # the original floating-point model
    {torch.nn.Linear},       # the set of layer types to quantize
    dtype=torch.qint8        # the target data type of the quantization
)

# Test the model
batch_size = 32
input_fp32 = torch.randn(1, batch_size, in_features)  # generate random input data
output_int8 = model_int8(input_fp32)                   # run the data through the quantized model

# Print the output shapes for verification
print(f"Input shape: {input_fp32.shape}")
print(f"Output shape: {output_int8.shape}")

# Compare the outputs of the original and quantized models
with torch.no_grad():
    output_fp32 = model_fp32(input_fp32)
    print(f"First few values of the FP32 output: {output_fp32[0, :5]}")
    print(f"First few values of the INT8 output: {output_int8[0, :5]}")

    # Compute the mean absolute error
    error = torch.abs(output_fp32 - output_int8).mean().item()
    print(f"Mean absolute error: {error}")

# Print model type information
print(f"Module type before quantization: {type(model_fp32.linear)}")
print(f"Module type after quantization: {type(model_int8.linear)}")
(no newline at end of file)
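The script above checks numerical error and module types. A related check one might add (a sketch, not part of this commit) is comparing serialized state_dict sizes, since dynamic int8 quantization mainly shrinks the Linear weights:

import io
import torch

def state_dict_size_bytes(model: torch.nn.Module) -> int:
    """Serialize a model's state_dict to an in-memory buffer and return its size."""
    buf = io.BytesIO()
    torch.save(model.state_dict(), buf)
    return buf.getbuffer().nbytes

fp32_model = torch.nn.Sequential(torch.nn.Linear(64, 128))
int8_model = torch.ao.quantization.quantize_dynamic(fp32_model, {torch.nn.Linear}, dtype=torch.qint8)

print(f"FP32 state_dict: {state_dict_size_bytes(fp32_model)} bytes")
print(f"INT8 state_dict: {state_dict_size_bytes(int8_model)} bytes")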
ktransformers/util/vendors.py  (new file, 0 → 100644, view file @ 52fa671c)

from __future__ import annotations

from enum import IntEnum, auto
from typing import Optional, Union, List
import torch


class GPUVendor(IntEnum):
    NVIDIA = auto()
    AMD = auto()
    MooreThreads = auto()
    MetaX = auto()
    MUSA = auto()
    Unknown = auto()


class DeviceManager:
    """
    Device manager that provides a unified interface for handling different GPU vendors
    """
    def __init__(self):
        self.gpu_vendor = self._detect_gpu_vendor()
        self.available_devices = self._get_available_devices()

    def _detect_gpu_vendor(self) -> GPUVendor:
        """Detect GPU vendor type"""
        if not torch.cuda.is_available():
            # Check MUSA availability (assuming a musa module exists)
            try:
                import musa
                if musa.is_available():
                    return GPUVendor.MUSA
            except (ImportError, AttributeError):
                pass
            return GPUVendor.Unknown

        device_name = torch.cuda.get_device_name(0).lower()

        if any(name in device_name for name in ["nvidia", "geforce", "quadro", "tesla", "titan", "rtx", "gtx"]):
            return GPUVendor.NVIDIA
        elif any(name in device_name for name in ["amd", "radeon", "rx", "vega", "instinct", "firepro", "mi"]):
            return GPUVendor.AMD
        elif any(name in device_name for name in ["mthreads", "moore", "mtt"]):
            return GPUVendor.MooreThreads
        elif any(name in device_name for name in ["metax", "meta"]):
            return GPUVendor.MetaX
        elif "musa" in device_name:
            return GPUVendor.MUSA

        # Backend check
        try:
            if hasattr(torch.version, 'hip') and torch.version.hip is not None:
                return GPUVendor.AMD
            elif hasattr(torch.version, 'cuda') and torch.version.cuda is not None:
                return GPUVendor.NVIDIA
        except:
            pass

        return GPUVendor.Unknown

    def _get_available_devices(self) -> List[int]:
        """Get list of available device indices"""
        devices = []

        if self.gpu_vendor == GPUVendor.NVIDIA or self.gpu_vendor == GPUVendor.AMD:
            devices = list(range(torch.cuda.device_count()))
        elif self.gpu_vendor == GPUVendor.MUSA:
            try:
                import musa
                devices = list(range(musa.device_count()))
            except (ImportError, AttributeError):
                pass

        return devices

    def get_device_str(self, device_id: Union[int, str]) -> str:
        """
        Get device string for the given device ID

        Args:
            device_id: Device index (0, 1, 2, etc.), -1 for CPU, or "cpu" string

        Returns:
            Device string representation (e.g., "cuda:0", "musa:1", "cpu")
        """
        if device_id == -1 or device_id == "cpu":
            return "cpu"

        if isinstance(device_id, int):
            if self.gpu_vendor == GPUVendor.NVIDIA or self.gpu_vendor == GPUVendor.AMD:
                if device_id < torch.cuda.device_count():
                    return f"cuda:{device_id}"
            elif self.gpu_vendor == GPUVendor.MUSA:
                try:
                    import musa
                    if device_id < musa.device_count():
                        return f"musa:{device_id}"
                except (ImportError, AttributeError):
                    pass

        return "cpu"

    def to_torch_device(self, device_id: Union[int, str] = 0) -> torch.device:
        """
        Convert device ID to torch.device object

        Args:
            device_id: Device index (0, 1, 2, etc.), -1 for CPU, or "cpu" string

        Returns:
            torch.device object
        """
        device_str = self.get_device_str(device_id)

        # Handle MUSA device
        if device_str.startswith("musa:"):
            try:
                import musa
                index = int(device_str.split(":")[-1])
                return musa.device(index)
            except (ImportError, ValueError, AttributeError):
                return torch.device("cpu")

        # Standard PyTorch device
        return torch.device(device_str)

    def move_tensor_to_device(self, tensor: torch.Tensor, device_id: Union[int, str] = 0) -> torch.Tensor:
        """
        Move tensor to specified device

        Args:
            tensor: PyTorch tensor to move
            device_id: Device index (0, 1, 2, etc.), -1 for CPU, or "cpu" string

        Returns:
            Tensor moved to the specified device
        """
        device = self.to_torch_device(device_id)
        return tensor.to(device)

    def is_available(self, index: int = 0) -> bool:
        """
        Check if device at specified index is available

        Args:
            index: Device index to check

        Returns:
            True if the device is available, False otherwise
        """
        if index < 0:
            return True  # CPU is always available
        return index in self.available_devices

    def get_all_devices(self) -> List[int]:
        """
        Get all available device indices

        Returns:
            List of available device indices (0, 1, 2, etc.)
        """
        return self.available_devices


# Create global device manager instance
device_manager = DeviceManager()


# Convenience functions
def get_device(device_id: Union[int, str] = 0) -> torch.device:
    """
    Get torch.device object for the specified device ID

    Args:
        device_id: Device index (0, 1, 2, etc.), -1 for CPU, or "cpu" string

    Returns:
        torch.device object
    """
    return device_manager.to_torch_device(device_id)


def to_device(tensor: torch.Tensor, device_id: Union[int, str] = 0) -> torch.Tensor:
    """
    Move tensor to specified device

    Args:
        tensor: PyTorch tensor to move
        device_id: Device index (0, 1, 2, etc.), -1 for CPU, or "cpu" string

    Returns:
        Tensor moved to the specified device
    """
    return device_manager.move_tensor_to_device(tensor, device_id)


# Get devices
cpu_device = get_device(-1)       # CPU using index -1
cpu_device2 = get_device("cpu")   # CPU using string "cpu"
gpu0 = get_device(0)              # First GPU

# Move tensors
x = torch.randn(3, 3)
x_gpu = to_device(x, 0)       # Move to first GPU
x_cpu1 = to_device(x, -1)     # Move to CPU using index -1
x_cpu2 = to_device(x, "cpu")  # Move to CPU using string "cpu"
(no newline at end of file)
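A short usage sketch for the module above (assuming it is importable as ktransformers.util.vendors; all names below come from the file itself), showing how calling code might resolve per-rank device strings regardless of vendor:

from ktransformers.util.vendors import device_manager, GPUVendor

print(f"Detected vendor: {device_manager.gpu_vendor.name}")
print(f"Available devices: {device_manager.get_all_devices()}")

# Resolve a device string per worker rank; get_device_str falls back to "cpu"
# when the index is out of range or no supported GPU is present.
for rank in range(4):
    print(f"rank {rank} -> {device_manager.get_device_str(rank)}")

if device_manager.gpu_vendor == GPUVendor.AMD:
    print("ROCm build: torch.cuda APIs map onto HIP devices")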
setup.py  (view file @ 52fa671c)

@@ -29,7 +29,7 @@ import torch.version
 from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
 from setuptools import setup, Extension
 from cpufeature.extension import CPUFeature
-from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME
 try:
     from torch_musa.utils.simple_porting import SimplePorting
     from torch_musa.utils.musa_extension import BuildExtension, MUSAExtension, MUSA_HOME
@@ -64,6 +64,70 @@ class VersionInfo:
         musa_version = f"{bare_metal_version.major}{bare_metal_version.minor}"
         return musa_version

+    def get_rocm_bare_metal_version(self, rocm_dir):
+        """
+        Get the ROCm version from the ROCm installation directory.
+
+        Args:
+            rocm_dir: Path to the ROCm installation directory
+
+        Returns:
+            A string representation of the ROCm version (e.g., "63" for ROCm 6.3)
+        """
+        try:
+            # Try using rocm_agent_enumerator to get version info
+            raw_output = subprocess.check_output(
+                [rocm_dir + "/bin/rocminfo", "--version"],
+                universal_newlines=True, stderr=subprocess.STDOUT)
+            # Extract version number from output
+            match = re.search(r'(\d+\.\d+)', raw_output)
+            if match:
+                version_str = match.group(1)
+                version = parse(version_str)
+                rocm_version = f"{version.major}{version.minor}"
+                return rocm_version
+        except (subprocess.CalledProcessError, FileNotFoundError):
+            # If rocminfo --version fails, try alternative methods
+            pass
+
+        try:
+            # Try reading version from release file
+            with open(os.path.join(rocm_dir, "share/doc/hip/version.txt"), "r") as f:
+                version_str = f.read().strip()
+                version = parse(version_str)
+                rocm_version = f"{version.major}{version.minor}"
+                return rocm_version
+        except (FileNotFoundError, IOError):
+            pass
+
+        # If all else fails, try to extract from directory name
+        dir_name = os.path.basename(os.path.normpath(rocm_dir))
+        match = re.search(r'rocm-(\d+\.\d+)', dir_name)
+        if match:
+            version_str = match.group(1)
+            version = parse(version_str)
+            rocm_version = f"{version.major}{version.minor}"
+            return rocm_version
+
+        # Fallback to extracting from hipcc version
+        try:
+            raw_output = subprocess.check_output(
+                [rocm_dir + "/bin/hipcc", "--version"],
+                universal_newlines=True, stderr=subprocess.STDOUT)
+            match = re.search(r'HIP version: (\d+\.\d+)', raw_output)
+            if match:
+                version_str = match.group(1)
+                version = parse(version_str)
+                rocm_version = f"{version.major}{version.minor}"
+                return rocm_version
+        except (subprocess.CalledProcessError, FileNotFoundError):
+            pass
+
+        # If we still can't determine the version, raise an error
+        raise ValueError(f"Could not determine ROCm version from directory: {rocm_dir}")
+
     def get_cuda_bare_metal_version(self, cuda_dir):
         raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
@@ -148,11 +212,13 @@ class VersionInfo:
         cpu_instruct = self.get_cpu_instruct()
         backend_version = ""
         if CUDA_HOME is not None:
-            backend_version = f"cu{self.get_cuda_bare_metal_version(CUDA_HOME)}"
+            backend_version = f""
         elif MUSA_HOME is not None:
             backend_version = f"mu{self.get_musa_bare_metal_version(MUSA_HOME)}"
+        elif ROCM_HOME is not None:
+            backend_version = f"rocm{self.get_rocm_bare_metal_version(ROCM_HOME)}"
         else:
-            raise ValueError("Unsupported backend: CUDA_HOME and MUSA_HOME are not set.")
+            raise ValueError("Unsupported backend: CUDA_HOME MUSA_HOME ROCM_HOME all not set.")
         package_version = f"{flash_version}+{backend_version}torch{torch_version}{cpu_instruct}"
         if full_version:
             return package_version
@@ -247,9 +313,13 @@ class CMakeBuild(BuildExtension):
             cmake_args += ["-DKTRANSFORMERS_USE_CUDA=ON"]
         elif MUSA_HOME is not None:
             cmake_args += ["-DKTRANSFORMERS_USE_MUSA=ON"]
+        elif ROCM_HOME is not None:
+            cmake_args += ["-DKTRANSFORMERS_USE_ROCM=ON"]
         else:
             raise ValueError("Unsupported backend: CUDA_HOME and MUSA_HOME are not set.")
+        # log cmake_args
+        print("CMake args:", cmake_args)

         build_args = []
         if "CMAKE_ARGS" in os.environ:
             cmake_args += [
@@ -328,7 +398,7 @@ class CMakeBuild(BuildExtension):
             ["cmake", "--build", ".", "--verbose", *build_args], cwd=build_temp, check=True
         )

-if CUDA_HOME is not None:
+if CUDA_HOME is not None or ROCM_HOME is not None:
     ops_module = CUDAExtension('KTransformersOps', [
         'ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu',
         'ktransformers/ktransformers_ext/cuda/binding.cpp',
@@ -338,7 +408,7 @@ if CUDA_HOME is not None:
             'cxx': ['-O3', '-DKTRANSFORMERS_USE_CUDA'],
             'nvcc': [
                 '-O3',
-                '--use_fast_math',
+                # '--use_fast_math',
                 '-Xcompiler', '-fPIC',
                 '-DKTRANSFORMERS_USE_CUDA',
             ]
@@ -371,6 +441,7 @@ else:
     raise ValueError("Unsupported backend: CUDA_HOME and MUSA_HOME are not set.")

 setup(
     name=VersionInfo.PACKAGE_NAME,
     version=VersionInfo().get_package_version(),
     cmdclass={"bdist_wheel": BuildWheelsCommand, "build_ext": CMakeBuild},
     ext_modules=[
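The new ROCm branch tags the wheel with the same major+minor scheme used for the CUDA and MUSA backends (for example "rocm63" for ROCm 6.3). A minimal sketch of that string handling, assuming the parse used above is packaging.version.parse:

from packaging.version import parse

for raw in ["6.3", "6.3.1", "5.7"]:
    v = parse(raw)
    tag = f"{v.major}{v.minor}"        # e.g. "63" for ROCm 6.3
    print(raw, "->", "rocm" + tag)     # package suffix such as "rocm63"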
third_party/llamafile/iqk_mul_mat.inc  (view file @ 52fa671c; comments translated from Chinese)

@@ -2385,7 +2385,12 @@ struct SimpleBits {
     __m256i values[4];
 };

+// fix for #829: add detection of AVX512VPOPCNTDQ
+#if defined(HAVE_FANCY_SIMD) && defined(__AVX512VPOPCNTDQ__)
+#define HAVE_AVX512_POPCNT 1
+#else
+#define HAVE_AVX512_POPCNT 0
+#endif

 struct EvenSignHelper {
 #if defined HAVE_FANCY_SIMD
@@ -2396,7 +2401,23 @@ struct EvenSignHelper {
     };

     IQK_ALWAYS_INLINE void sign_2_values(__m256i aux, __m256i * values) const {
         aux = _mm256_and_si256(_mm256_srlv_epi32(aux, shifts), mask);
-        auto pcnt = _mm256_popcnt_epi32(aux);
+        // fix for #829: compatibility with Intel Cascade Lake CPUs; if the AVX512VPOPCNTDQ
+        // extension is not supported, use an alternative implementation
+#if HAVE_AVX512_POPCNT
+        auto pcnt = _mm256_popcnt_epi32(aux);
+#else
+        // Alternative implementation using a standard bit-counting approach
+        __m256i pcnt;
+        int* pcnt_ptr = reinterpret_cast<int*>(&pcnt);
+        int* aux_ptr = reinterpret_cast<int*>(&aux);  // take aux's address directly to avoid an unnecessary copy
+#pragma unroll 8  // hint the compiler to unroll the loop to improve SIMD throughput
+        for (int i = 0; i < 8; i++) {
+            pcnt_ptr[i] = __builtin_popcount(aux_ptr[i]);  // use the compiler builtin popcount
+        }
+#endif
         sbits_t sbits;
         sbits.vec = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7)));
         values[0] = _mm256_mask_sub_epi8(values[0], sbits.mask[0], _mm256_setzero_si256(), values[0]);
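For reference, a small Python sketch (illustration only, not part of the patch) of what the scalar fallback computes: one population count per 32-bit lane, which is the result _mm256_popcnt_epi32 produces in a single instruction on CPUs with AVX512VPOPCNTDQ:

# Eight 32-bit lanes, as in an __m256i treated as packed epi32 values.
lanes = [0x00000000, 0x00000001, 0xFFFFFFFF, 0x80000001,
         0x0000FFFF, 0xAAAAAAAA, 0x12345678, 0x7FFFFFFF]

# Per-lane population count, matching the __builtin_popcount loop in the fallback.
popcounts = [bin(x & 0xFFFFFFFF).count("1") for x in lanes]
print(popcounts)  # [0, 1, 32, 2, 16, 16, 13, 31]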