"tests/nn/vscode:/vscode.git/clone" did not exist on "7202115ebbc64edb4000bdd7eed8f276a556304e"
Unverified commit 52fa671c authored by Yuhao Tsui, committed by GitHub

Merge branch 'kvcache-ai:main' into main

parents e5694f91 f142f4df
@@ -147,7 +147,7 @@
     name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
@@ -157,7 +157,7 @@
     name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
@@ -167,7 +167,7 @@
     name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:2"
       prefill_device: "cuda:2"
@@ -177,7 +177,7 @@
     name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:3"
       prefill_device: "cuda:3"
......
@@ -278,7 +278,7 @@
     name: "^model\\.layers\\.([0-7])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
@@ -288,7 +288,7 @@
     name: "^model\\.layers\\.(8|9|1[0-5])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
@@ -298,7 +298,7 @@
     name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:2"
       prefill_device: "cuda:2"
@@ -308,7 +308,7 @@
     name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:3"
       prefill_device: "cuda:3"
@@ -318,7 +318,7 @@
     name: "^model\\.layers\\.(3[2-9])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:4"
       prefill_device: "cuda:4"
@@ -328,7 +328,7 @@
     name: "^model\\.layers\\.(4[0-7])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:5"
       prefill_device: "cuda:5"
@@ -338,7 +338,7 @@
     name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:6"
       prefill_device: "cuda:6"
@@ -348,7 +348,7 @@
     name: "^model\\.layers\\.(5[6-9]|60)\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:7"
       prefill_device: "cuda:7"
......
@@ -66,7 +66,7 @@
     name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
@@ -74,7 +74,7 @@
     name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate # mlp module with custom forward function
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3 # mlp module with custom forward function
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
......
@@ -66,7 +66,7 @@
     name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
@@ -74,7 +74,7 @@
     name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate # mlp module with custom forward function
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3 # mlp module with custom forward function
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
......
@@ -66,7 +66,7 @@
     name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
@@ -74,7 +74,7 @@
     name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate # mlp module with custom forward function
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3 # mlp module with custom forward function
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
......
...@@ -38,7 +38,7 @@ ...@@ -38,7 +38,7 @@
- match: - match:
class: ktransformers.models.modeling_deepseek_v3.MoEGate class: ktransformers.models.modeling_deepseek_v3.MoEGate
replace: replace:
class: ktransformers.operators.gate.KMoEGate class: ktransformers.operators.gate.KMoEGateDeepSeekV3
kwargs: kwargs:
generate_device: "cuda:0" generate_device: "cuda:0"
prefill_device: "cuda:0" prefill_device: "cuda:0"
......
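The gate-replacement rules above use layer-index regexes to pin each block of layers to a specific GPU. As an illustrative sketch (not part of this commit), the snippet below checks that the two patterns from the two-GPU rule files cover layers 0-60 exactly once each; the patterns are copied from the hunks above, while the script itself and the 61-layer loop bound are only assumptions for demonstration.

import re

# Patterns copied from the two-GPU rule files: layers 0-29 on cuda:0, 30-60 on cuda:1.
rules = {
    "cuda:0": r"^model\.layers\.(0|[1-9]|[12][0-9])\.mlp\.gate$",
    "cuda:1": r"^model\.layers\.([3456][0-9])\.mlp\.gate$",
}

for layer in range(61):  # DeepSeek-V3 decoder layers are indexed 0-60
    name = f"model.layers.{layer}.mlp.gate"
    matches = [dev for dev, pat in rules.items() if re.match(pat, name)]
    assert len(matches) == 1, f"layer {layer} matched {matches}"
print("each gate module maps to exactly one device")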
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^lm_head$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cpu"
      prefill_device: "cuda"
      generate_op: "KLinearCPUInfer"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearQ8"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE  # MLP module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      absorb_for_prefill: False  # change this to True to enable long context (prefill may be slower)
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 disables layer-wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
\ No newline at end of file
import subprocess
import time
import requests
import sys
import os
def wait_for_server(base_url: str, timeout: int = None) -> None:
start_time = time.time()
while True:
try:
response = requests.get(
f"{base_url}/v1/models",
headers={"Authorization": "Bearer None"},
)
if response.status_code == 200:
print("Server is ready.")
break
except requests.exceptions.RequestException:
time.sleep(1)
if timeout and time.time() - start_time > timeout:
raise TimeoutError("Server did not become ready within timeout period")
server_cmd = [
"numactl", "-N", "1", "-m", "1",
"/home/qujing3/anaconda3/envs/ktransformers-dev/bin/ktransformers",
"--model_path", "/home/qujing3/models/DeepSeek-R1-Q4_K_M/config",
"--gguf_path", "/home/qujing3/models/DeepSeek-V3-GGUF/DeepSeek-V3-Q4_K_M",
"--port", "10002",
"--cpu_infer", "48",
"--optimize_config_path", "ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml",
"--max_new_tokens", "3000",
"--cache_lens", "6000"
]
print("Starting ktransformers server...")
print(" ".join(server_cmd))
with open("/tmp/server_log.txt", "w") as f:
server_process = subprocess.Popen(server_cmd, stdout=f, stderr=f, text=True)
try:
wait_for_server("http://localhost:10002", timeout=600)
eval_cmd = ["python", "ktransformers/tests/humaneval/eval_api.py"]
print("Running eval_api.py...")
print(f"Command: {' '.join(eval_cmd)}")
env = os.environ.copy()
env["PYTHONUNBUFFERED"] = "1"
eval_process = subprocess.Popen(
eval_cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
bufsize=1,
env=env,
universal_newlines=True
)
import threading
import queue
def enqueue_output(out, queue):
for line in iter(out.readline, ''):
queue.put(line)
out.close()
stdout_queue = queue.Queue()
stderr_queue = queue.Queue()
stdout_thread = threading.Thread(target=enqueue_output, args=(eval_process.stdout, stdout_queue))
stderr_thread = threading.Thread(target=enqueue_output, args=(eval_process.stderr, stderr_queue))
stdout_thread.daemon = True
stderr_thread.daemon = True
stdout_thread.start()
stderr_thread.start()
while eval_process.poll() is None:
try:
line = stdout_queue.get_nowait()
print(line, end='', flush=True)
except queue.Empty:
pass
try:
line = stderr_queue.get_nowait()
print(line, end='', file=sys.stderr, flush=True)
except queue.Empty:
pass
time.sleep(1)
while not stdout_queue.empty():
print(stdout_queue.get(), end='', flush=True)
while not stderr_queue.empty():
print(stderr_queue.get(), end='', file=sys.stderr, flush=True)
eval_process.wait()
print(f"eval_api.py completed with exit code: {eval_process.returncode}")
evaluate_cmd = [
"evaluate_functional_correctness",
"ktransformers/tests/humaneval/results/api/eval_b.jsonl"
]
print("Running evaluate_functional_correctness...")
print(f"Command: {' '.join(evaluate_cmd)}")
evaluate_process = subprocess.Popen(
evaluate_cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
bufsize=1,
universal_newlines=True
)
for line in evaluate_process.stdout:
print(line, end='', flush=True)
for line in evaluate_process.stderr:
print(line, end='', file=sys.stderr, flush=True)
evaluate_process.wait()
print(f"evaluate_functional_correctness completed with exit code: {evaluate_process.returncode}")
if evaluate_process.returncode != 0:
print(f"evaluate_functional_correctness exited with code {evaluate_process.returncode}")
sys.exit(evaluate_process.returncode)
finally:
print("Stopping ktransformers server...")
server_process.terminate()
try:
server_process.wait(timeout=30)
except subprocess.TimeoutExpired:
print("Server did not terminate gracefully, forcing...")
server_process.kill()
\ No newline at end of file
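Beyond the /v1/models readiness probe used above, a minimal, hedged smoke test against the launched server could look like the sketch below. It assumes the server exposes an OpenAI-compatible /v1/chat/completions route on port 10002 and accepts a placeholder model name; the endpoint path, model name, and payload here are assumptions for illustration, not taken from this commit.

import requests

# Hypothetical single-request smoke test against the server started above.
resp = requests.post(
    "http://localhost:10002/v1/chat/completions",  # assumed OpenAI-compatible route
    headers={"Authorization": "Bearer None"},
    json={
        "model": "DeepSeek-V3",  # placeholder model name
        "messages": [{"role": "user", "content": "Write a one-line Python hello world."}],
        "max_tokens": 64,
    },
    timeout=600,
)
print(resp.status_code)
print(resp.json()["choices"][0]["message"]["content"])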
import torch

# Define a floating-point model containing a linear layer
class LinearModel(torch.nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.linear = torch.nn.Linear(in_features, out_features)

    def forward(self, x):
        return self.linear(x)

# Create the floating-point model instance
in_features = 64
out_features = 128
model_fp32 = LinearModel(in_features, out_features)

# Create the quantized model instance
model_int8 = torch.ao.quantization.quantize_dynamic(
    model_fp32,          # original floating-point model
    {torch.nn.Linear},   # set of layer types to quantize
    dtype=torch.qint8    # target data type for quantization
)

# Test the model
batch_size = 32
input_fp32 = torch.randn(1, batch_size, in_features)  # generate random input data
output_int8 = model_int8(input_fp32)                  # run the data through the quantized model

# Print shapes to verify
print(f"Input shape: {input_fp32.shape}")
print(f"Output shape: {output_int8.shape}")

# Compare the outputs of the original and quantized models
with torch.no_grad():
    output_fp32 = model_fp32(input_fp32)
    print(f"First FP32 output values: {output_fp32[0, :5]}")
    print(f"First INT8 output values: {output_int8[0, :5]}")

# Compute the mean absolute error
error = torch.abs(output_fp32 - output_int8).mean().item()
print(f"Mean absolute error: {error}")

# Print module type information
print(f"Layer type before quantization: {type(model_fp32.linear)}")
print(f"Layer type after quantization: {type(model_int8.linear)}")
\ No newline at end of file
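As a small, hedged follow-up to the demo above (not part of this commit), one way to confirm that the layer now stores int8 weights is to pull the quantized weight tensor out of the dynamically quantized Linear module. The weight() accessor and torch.int_repr are standard PyTorch quantization APIs; model_int8 and model_fp32 are the instances created above.

# Inspect the dynamically quantized layer created in the demo above.
w_q = model_int8.linear.weight()    # quantized weight tensor of the dynamic Linear
print(w_q.dtype)                    # expected: torch.qint8
print(torch.int_repr(w_q)[:2, :8])  # underlying int8 integer values
print(model_fp32.linear.weight.element_size(), "bytes per FP32 weight element")  # 4, vs 1 byte per int8 value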
from __future__ import annotations
from enum import IntEnum, auto
from typing import Optional, Union, List
import torch
class GPUVendor(IntEnum):
NVIDIA = auto()
AMD = auto()
MooreThreads = auto()
MetaX = auto()
MUSA = auto()
Unknown = auto()
class DeviceManager:
"""
Device manager that provides a unified interface for handling different GPU vendors
"""
def __init__(self):
self.gpu_vendor = self._detect_gpu_vendor()
self.available_devices = self._get_available_devices()
def _detect_gpu_vendor(self) -> GPUVendor:
"""Detect GPU vendor type"""
if not torch.cuda.is_available():
# Check MUSA availability (assuming a musa module exists)
try:
import musa
if musa.is_available():
return GPUVendor.MUSA
except (ImportError, AttributeError):
pass
return GPUVendor.Unknown
device_name = torch.cuda.get_device_name(0).lower()
if any(name in device_name for name in ["nvidia", "geforce", "quadro", "tesla", "titan", "rtx", "gtx"]):
return GPUVendor.NVIDIA
elif any(name in device_name for name in ["amd", "radeon", "rx", "vega", "instinct", "firepro", "mi"]):
return GPUVendor.AMD
elif any(name in device_name for name in ["mthreads", "moore", "mtt"]):
return GPUVendor.MooreThreads
elif any(name in device_name for name in ["metax", "meta"]):
return GPUVendor.MetaX
elif "musa" in device_name:
return GPUVendor.MUSA
# Backend check
try:
if hasattr(torch.version, 'hip') and torch.version.hip is not None:
return GPUVendor.AMD
elif hasattr(torch.version, 'cuda') and torch.version.cuda is not None:
return GPUVendor.NVIDIA
except:
pass
return GPUVendor.Unknown
def _get_available_devices(self) -> List[int]:
"""Get list of available device indices"""
devices = []
if self.gpu_vendor == GPUVendor.NVIDIA or self.gpu_vendor == GPUVendor.AMD:
devices = list(range(torch.cuda.device_count()))
elif self.gpu_vendor == GPUVendor.MUSA:
try:
import musa
devices = list(range(musa.device_count()))
except (ImportError, AttributeError):
pass
return devices
def get_device_str(self, device_id: Union[int, str]) -> str:
"""
Get device string for the given device ID
Args:
device_id: Device index (0, 1, 2, etc.), -1 for CPU, or "cpu" string
Returns:
Device string representation (e.g., "cuda:0", "musa:1", "cpu")
"""
if device_id == -1 or device_id == "cpu":
return "cpu"
if isinstance(device_id, int):
if self.gpu_vendor == GPUVendor.NVIDIA or self.gpu_vendor == GPUVendor.AMD:
if device_id < torch.cuda.device_count():
return f"cuda:{device_id}"
elif self.gpu_vendor == GPUVendor.MUSA:
try:
import musa
if device_id < musa.device_count():
return f"musa:{device_id}"
except (ImportError, AttributeError):
pass
return "cpu"
def to_torch_device(self, device_id: Union[int, str] = 0) -> torch.device:
"""
Convert device ID to torch.device object
Args:
device_id: Device index (0, 1, 2, etc.), -1 for CPU, or "cpu" string
Returns:
torch.device object
"""
device_str = self.get_device_str(device_id)
# Handle MUSA device
if device_str.startswith("musa:"):
try:
import musa
index = int(device_str.split(":")[-1])
return musa.device(index)
except (ImportError, ValueError, AttributeError):
return torch.device("cpu")
# Standard PyTorch device
return torch.device(device_str)
def move_tensor_to_device(self, tensor: torch.Tensor, device_id: Union[int, str] = 0) -> torch.Tensor:
"""
Move tensor to specified device
Args:
tensor: PyTorch tensor to move
device_id: Device index (0, 1, 2, etc.), -1 for CPU, or "cpu" string
Returns:
Tensor moved to the specified device
"""
device = self.to_torch_device(device_id)
return tensor.to(device)
def is_available(self, index: int = 0) -> bool:
"""
Check if device at specified index is available
Args:
index: Device index to check
Returns:
True if the device is available, False otherwise
"""
if index < 0:
return True # CPU is always available
return index in self.available_devices
def get_all_devices(self) -> List[int]:
"""
Get all available device indices
Returns:
List of available device indices (0, 1, 2, etc.)
"""
return self.available_devices
# Create global device manager instance
device_manager = DeviceManager()
# Convenience functions
def get_device(device_id: Union[int, str] = 0) -> torch.device:
"""
Get torch.device object for the specified device ID
Args:
device_id: Device index (0, 1, 2, etc.), -1 for CPU, or "cpu" string
Returns:
torch.device object
"""
return device_manager.to_torch_device(device_id)
def to_device(tensor: torch.Tensor, device_id: Union[int, str] = 0) -> torch.Tensor:
"""
Move tensor to specified device
Args:
tensor: PyTorch tensor to move
device_id: Device index (0, 1, 2, etc.), -1 for CPU, or "cpu" string
Returns:
Tensor moved to the specified device
"""
return device_manager.move_tensor_to_device(tensor, device_id)
# Get devices
cpu_device = get_device(-1) # CPU using index -1
cpu_device2 = get_device("cpu") # CPU using string "cpu"
gpu0 = get_device(0) # First GPU
# Move tensors
x = torch.randn(3, 3)
x_gpu = to_device(x, 0) # Move to first GPU
x_cpu1 = to_device(x, -1) # Move to CPU using index -1
x_cpu2 = to_device(x, "cpu") # Move to CPU using string "cpu"
\ No newline at end of file
@@ -29,7 +29,7 @@ import torch.version
 from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
 from setuptools import setup, Extension
 from cpufeature.extension import CPUFeature
-from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME
 try:
     from torch_musa.utils.simple_porting import SimplePorting
     from torch_musa.utils.musa_extension import BuildExtension, MUSAExtension, MUSA_HOME
@@ -64,6 +64,70 @@ class VersionInfo:
         musa_version = f"{bare_metal_version.major}{bare_metal_version.minor}"
         return musa_version

+    def get_rocm_bare_metal_version(self, rocm_dir):
+        """
+        Get the ROCm version from the ROCm installation directory.
+
+        Args:
+            rocm_dir: Path to the ROCm installation directory
+
+        Returns:
+            A string representation of the ROCm version (e.g., "63" for ROCm 6.3)
+        """
+        try:
+            # Try using rocm_agent_enumerator to get version info
+            raw_output = subprocess.check_output(
+                [rocm_dir + "/bin/rocminfo", "--version"],
+                universal_newlines=True,
+                stderr=subprocess.STDOUT)
+            # Extract version number from output
+            match = re.search(r'(\d+\.\d+)', raw_output)
+            if match:
+                version_str = match.group(1)
+                version = parse(version_str)
+                rocm_version = f"{version.major}{version.minor}"
+                return rocm_version
+        except (subprocess.CalledProcessError, FileNotFoundError):
+            # If rocminfo --version fails, try alternative methods
+            pass
+        try:
+            # Try reading version from release file
+            with open(os.path.join(rocm_dir, "share/doc/hip/version.txt"), "r") as f:
+                version_str = f.read().strip()
+            version = parse(version_str)
+            rocm_version = f"{version.major}{version.minor}"
+            return rocm_version
+        except (FileNotFoundError, IOError):
+            pass
+        # If all else fails, try to extract from directory name
+        dir_name = os.path.basename(os.path.normpath(rocm_dir))
+        match = re.search(r'rocm-(\d+\.\d+)', dir_name)
+        if match:
+            version_str = match.group(1)
+            version = parse(version_str)
+            rocm_version = f"{version.major}{version.minor}"
+            return rocm_version
+        # Fallback to extracting from hipcc version
+        try:
+            raw_output = subprocess.check_output(
+                [rocm_dir + "/bin/hipcc", "--version"],
+                universal_newlines=True,
+                stderr=subprocess.STDOUT)
+            match = re.search(r'HIP version: (\d+\.\d+)', raw_output)
+            if match:
+                version_str = match.group(1)
+                version = parse(version_str)
+                rocm_version = f"{version.major}{version.minor}"
+                return rocm_version
+        except (subprocess.CalledProcessError, FileNotFoundError):
+            pass
+        # If we still can't determine the version, raise an error
+        raise ValueError(f"Could not determine ROCm version from directory: {rocm_dir}")
+
     def get_cuda_bare_metal_version(self, cuda_dir):
         raw_output = subprocess.check_output(
             [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
@@ -148,11 +212,13 @@ class VersionInfo:
         cpu_instruct = self.get_cpu_instruct()
         backend_version = ""
         if CUDA_HOME is not None:
-            backend_version = f"cu{self.get_cuda_bare_metal_version(CUDA_HOME)}"
+            backend_version = f""
         elif MUSA_HOME is not None:
             backend_version = f"mu{self.get_musa_bare_metal_version(MUSA_HOME)}"
+        elif ROCM_HOME is not None:
+            backend_version = f"rocm{self.get_rocm_bare_metal_version(ROCM_HOME)}"
         else:
-            raise ValueError("Unsupported backend: CUDA_HOME and MUSA_HOME are not set.")
+            raise ValueError("Unsupported backend: CUDA_HOME MUSA_HOME ROCM_HOME all not set.")
         package_version = f"{flash_version}+{backend_version}torch{torch_version}{cpu_instruct}"
         if full_version:
             return package_version
@@ -247,9 +313,13 @@ class CMakeBuild(BuildExtension):
             cmake_args += ["-DKTRANSFORMERS_USE_CUDA=ON"]
         elif MUSA_HOME is not None:
             cmake_args += ["-DKTRANSFORMERS_USE_MUSA=ON"]
+        elif ROCM_HOME is not None:
+            cmake_args += ["-DKTRANSFORMERS_USE_ROCM=ON"]
         else:
             raise ValueError("Unsupported backend: CUDA_HOME and MUSA_HOME are not set.")
+        # log cmake_args
+        print("CMake args:", cmake_args)
         build_args = []
         if "CMAKE_ARGS" in os.environ:
             cmake_args += [
@@ -328,7 +398,7 @@ class CMakeBuild(BuildExtension):
             ["cmake", "--build", ".", "--verbose", *build_args], cwd=build_temp, check=True
         )

-if CUDA_HOME is not None:
+if CUDA_HOME is not None or ROCM_HOME is not None:
     ops_module = CUDAExtension('KTransformersOps', [
         'ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu',
         'ktransformers/ktransformers_ext/cuda/binding.cpp',
@@ -338,7 +408,7 @@ if CUDA_HOME is not None:
             'cxx': ['-O3', '-DKTRANSFORMERS_USE_CUDA'],
             'nvcc': [
                 '-O3',
-                '--use_fast_math',
+                # '--use_fast_math',
                 '-Xcompiler', '-fPIC',
                 '-DKTRANSFORMERS_USE_CUDA',
             ]
@@ -371,6 +441,7 @@ else:
     raise ValueError("Unsupported backend: CUDA_HOME and MUSA_HOME are not set.")

 setup(
+    name=VersionInfo.PACKAGE_NAME,
     version=VersionInfo().get_package_version(),
     cmdclass={"bdist_wheel": BuildWheelsCommand, "build_ext": CMakeBuild},
     ext_modules=[
......
@@ -2385,7 +2385,12 @@ struct SimpleBits {
     __m256i values[4];
 };

+// fix for #829: add detection of AVX512VPOPCNTDQ
+#if defined(HAVE_FANCY_SIMD) && defined(__AVX512VPOPCNTDQ__)
+#define HAVE_AVX512_POPCNT 1
+#else
+#define HAVE_AVX512_POPCNT 0
+#endif
+
 struct EvenSignHelper {
 #if defined HAVE_FANCY_SIMD
@@ -2396,7 +2401,23 @@ struct EvenSignHelper {
     };
     IQK_ALWAYS_INLINE void sign_2_values(__m256i aux, __m256i * values) const {
         aux = _mm256_and_si256(_mm256_srlv_epi32(aux, shifts), mask);
-        auto pcnt = _mm256_popcnt_epi32(aux);
+        // fix for #829: stay compatible with CPUs such as Intel Cascade Lake; use a fallback implementation when the AVX512VPOPCNTDQ extension is not supported
+#if HAVE_AVX512_POPCNT
+        auto pcnt = _mm256_popcnt_epi32(aux);
+#else
+        // Fallback implementation using a standard bit-count
+        __m256i pcnt;
+        int* pcnt_ptr = reinterpret_cast<int*>(&pcnt);
+        int* aux_ptr = reinterpret_cast<int*>(&aux);  // take the address of aux directly to avoid an unnecessary copy
+#pragma unroll 8  // hint the compiler to unroll the loop to keep SIMD throughput up
+        for (int i = 0; i < 8; i++) {
+            pcnt_ptr[i] = __builtin_popcount(aux_ptr[i]);  // use the compiler's built-in popcount
+        }
+#endif
         sbits_t sbits;
         sbits.vec = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7)));
         values[0] = _mm256_mask_sub_epi8(values[0], sbits.mask[0], _mm256_setzero_si256(), values[0]);
......