OpenDAS / ktransformers · Commits · 52fa671c

Unverified commit 52fa671c, authored Mar 26, 2025 by Yuhao Tsui, committed by GitHub on Mar 26, 2025.

    Merge branch 'kvcache-ai:main' into main

Parents: e5694f91, f142f4df

Changes: 52 files in the commit; this page shows 12 changed files with 580 additions and 27 deletions (+580 / -27).
Files shown on this page:

    ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml                        +4   -4
    ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-8.yaml                        +8   -8
    ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-fp8-linear-ggml-experts.yaml  +2   -2
    ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-marlin.yaml                   +2   -2
    ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml                          +2   -2
    ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml                                    +1   -1
    ktransformers/optimize/optimize_rules/rocm/DeepSeek-V3-Chat.yaml (new)                         +76  -0
    ktransformers/tests/score.py (new)                                                             +137 -0
    ktransformers/tests/test_pytorch_q8.py (new)                                                   +46  -0
    ktransformers/util/vendors.py (new)                                                            +202 -0
    setup.py                                                                                       +77  -6
    third_party/llamafile/iqk_mul_mat.inc                                                          +23  -2
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml  (view file @ 52fa671c)

@@ -147,7 +147,7 @@
     name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
@@ -157,7 +157,7 @@
     name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
@@ -167,7 +167,7 @@
     name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:2"
       prefill_device: "cuda:2"
@@ -177,7 +177,7 @@
     name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:3"
       prefill_device: "cuda:3"
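Every hunk in this file makes the same substitution: the MoE gate operator becomes ktransformers.operators.gate.KMoEGateDeepSeekV3, while the layer-to-GPU split is carried entirely by the name regex. Those ranges are ordinary Python-style regular expressions over module names; a minimal sketch (standalone re usage, not the project's injection code) of how the first rule's pattern selects layers 0-14 for cuda:0:

import re

# Hypothetical illustration: the first rule above routes gate modules of layers 0-14 to cuda:0.
pattern = re.compile(r"^model\.layers\.([0-9]|1[0-4])\.mlp\.gate$")

names = [f"model.layers.{i}.mlp.gate" for i in (0, 9, 14, 15, 30)]
for name in names:
    device = "cuda:0" if pattern.match(name) else "handled by a later rule"
    print(f"{name:30s} -> {device}")
# Layers 0, 9 and 14 match the first rule; 15 and 30 fall through to the later ranges.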
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-8.yaml  (view file @ 52fa671c)

@@ -278,7 +278,7 @@
     name: "^model\\.layers\\.([0-7])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
@@ -288,7 +288,7 @@
     name: "^model\\.layers\\.(8|9|1[0-5])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
@@ -298,7 +298,7 @@
     name: "^model\\.layers\\.(1[6-9]|2[0-3])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:2"
       prefill_device: "cuda:2"
@@ -308,7 +308,7 @@
     name: "^model\\.layers\\.(2[4-9]|3[0-1])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:3"
       prefill_device: "cuda:3"
@@ -318,7 +318,7 @@
     name: "^model\\.layers\\.(3[2-9])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:4"
       prefill_device: "cuda:4"
@@ -328,7 +328,7 @@
     name: "^model\\.layers\\.(4[0-7])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:5"
       prefill_device: "cuda:5"
@@ -338,7 +338,7 @@
     name: "^model\\.layers\\.(4[8-9]|5[0-5])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:6"
       prefill_device: "cuda:6"
@@ -348,7 +348,7 @@
     name: "^model\\.layers\\.(5[6-9]|60)\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:7"
       prefill_device: "cuda:7"
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-fp8-linear-ggml-experts.yaml  (view file @ 52fa671c)

@@ -66,7 +66,7 @@
     name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
@@ -74,7 +74,7 @@
     name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate # mlp module with custom forward function
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3 # mlp module with custom forward function
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-marlin.yaml  (view file @ 52fa671c)

@@ -66,7 +66,7 @@
     name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
@@ -74,7 +74,7 @@
     name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate # mlp module with custom forward function
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3 # mlp module with custom forward function
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml  (view file @ 52fa671c)

@@ -66,7 +66,7 @@
     name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
@@ -74,7 +74,7 @@
     name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$"
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate # mlp module with custom forward function
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3 # mlp module with custom forward function
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml  (view file @ 52fa671c)

@@ -38,7 +38,7 @@
 - match:
     class: ktransformers.models.modeling_deepseek_v3.MoEGate
   replace:
-    class: ktransformers.operators.gate.KMoEGate
+    class: ktransformers.operators.gate.KMoEGateDeepSeekV3
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
ktransformers/optimize/optimize_rules/rocm/DeepSeek-V3-Chat.yaml  (new file, 0 → 100644, view file @ 52fa671c)

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^lm_head$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cpu"
      prefill_device: "cuda"
      generate_op: "KLinearCPUInfer"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearQ8"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      absorb_for_prefill: False  # change this to True to enable long context(prefill may slower).
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
(no newline at end of file)
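This new ROCm rule file has the same shape as the CUDA rule files above: a list of match/replace entries keyed by a name regex and/or a class. A minimal sketch (assuming PyYAML is available; it ignores the class constraint and is not the actual ktransformers injection logic) of reading such a file and checking which name-based rule a module would hit:

import re
import yaml  # PyYAML, assumed installed

# Hypothetical path; any of the optimize_rules files above has the same structure.
with open("ktransformers/optimize/optimize_rules/rocm/DeepSeek-V3-Chat.yaml") as f:
    rules = yaml.safe_load(f)

# Example module names one might find in a DeepSeek-V3 model.
module_names = ["lm_head", "model.layers.3.mlp", "model.layers.3.self_attn.kv_b_proj"]

for name in module_names:
    for rule in rules:
        name_pattern = rule.get("match", {}).get("name")
        if name_pattern and re.match(name_pattern, name):
            print(f"{name} -> {rule['replace']['class']}")
            break
    else:
        print(f"{name} -> no name-based rule (class-only rules not checked in this sketch)")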
ktransformers/tests/score.py  (new file, 0 → 100644, view file @ 52fa671c)

import subprocess
import time
import requests
import sys
import os


def wait_for_server(base_url: str, timeout: int = None) -> None:
    start_time = time.time()
    while True:
        try:
            response = requests.get(
                f"{base_url}/v1/models",
                headers={"Authorization": "Bearer None"},
            )
            if response.status_code == 200:
                print("Server is ready.")
                break
        except requests.exceptions.RequestException:
            time.sleep(1)
        if timeout and time.time() - start_time > timeout:
            raise TimeoutError("Server did not become ready within timeout period")


server_cmd = [
    "numactl", "-N", "1", "-m", "1",
    "/home/qujing3/anaconda3/envs/ktransformers-dev/bin/ktransformers",
    "--model_path", "/home/qujing3/models/DeepSeek-R1-Q4_K_M/config",
    "--gguf_path", "/home/qujing3/models/DeepSeek-V3-GGUF/DeepSeek-V3-Q4_K_M",
    "--port", "10002",
    "--cpu_infer", "48",
    "--optimize_config_path", "ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml",
    "--max_new_tokens", "3000",
    "--cache_lens", "6000",
]

print("Starting ktransformers server...")
print(" ".join(server_cmd))

with open("/tmp/server_log.txt", "w") as f:
    server_process = subprocess.Popen(server_cmd, stdout=f, stderr=f, text=True)

try:
    wait_for_server("http://localhost:10002", timeout=600)

    eval_cmd = ["python", "ktransformers/tests/humaneval/eval_api.py"]
    print("Running eval_api.py...")
    print(f"Command: {' '.join(eval_cmd)}")

    env = os.environ.copy()
    env["PYTHONUNBUFFERED"] = "1"

    eval_process = subprocess.Popen(
        eval_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        bufsize=1,
        env=env,
        universal_newlines=True,
    )

    import threading
    import queue

    def enqueue_output(out, queue):
        for line in iter(out.readline, ''):
            queue.put(line)
        out.close()

    stdout_queue = queue.Queue()
    stderr_queue = queue.Queue()

    stdout_thread = threading.Thread(target=enqueue_output, args=(eval_process.stdout, stdout_queue))
    stderr_thread = threading.Thread(target=enqueue_output, args=(eval_process.stderr, stderr_queue))
    stdout_thread.daemon = True
    stderr_thread.daemon = True
    stdout_thread.start()
    stderr_thread.start()

    while eval_process.poll() is None:
        try:
            line = stdout_queue.get_nowait()
            print(line, end='', flush=True)
        except queue.Empty:
            pass
        try:
            line = stderr_queue.get_nowait()
            print(line, end='', file=sys.stderr, flush=True)
        except queue.Empty:
            pass
        time.sleep(1)

    while not stdout_queue.empty():
        print(stdout_queue.get(), end='', flush=True)
    while not stderr_queue.empty():
        print(stderr_queue.get(), end='', file=sys.stderr, flush=True)

    eval_process.wait()
    print(f"eval_api.py completed with exit code: {eval_process.returncode}")

    evaluate_cmd = ["evaluate_functional_correctness", "ktransformers/tests/humaneval/results/api/eval_b.jsonl"]
    print("Running evaluate_functional_correctness...")
    print(f"Command: {' '.join(evaluate_cmd)}")

    evaluate_process = subprocess.Popen(
        evaluate_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        bufsize=1,
        universal_newlines=True,
    )

    for line in evaluate_process.stdout:
        print(line, end='', flush=True)
    for line in evaluate_process.stderr:
        print(line, end='', file=sys.stderr, flush=True)

    evaluate_process.wait()
    print(f"evaluate_functional_correctness completed with exit code: {evaluate_process.returncode}")

    if evaluate_process.returncode != 0:
        print(f"evaluate_functional_correctness exited with code {evaluate_process.returncode}")
        sys.exit(evaluate_process.returncode)
finally:
    print("Stopping ktransformers server...")
    server_process.terminate()
    try:
        server_process.wait(timeout=30)
    except subprocess.TimeoutExpired:
        print("Server did not terminate gracefully, forcing...")
        server_process.kill()
(no newline at end of file)
ktransformers/tests/test_pytorch_q8.py  (new file, 0 → 100644, view file @ 52fa671c; comments translated from Chinese)

import torch

# Define a floating-point model containing a linear layer
class LinearModel(torch.nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.linear = torch.nn.Linear(in_features, out_features)

    def forward(self, x):
        return self.linear(x)

# Create the floating-point model instance
in_features = 64
out_features = 128
model_fp32 = LinearModel(in_features, out_features)

# Create the quantized model instance
model_int8 = torch.ao.quantization.quantize_dynamic(
    model_fp32,              # the original floating-point model
    {torch.nn.Linear},       # the set of layer types to quantize
    dtype=torch.qint8        # the target data type of the quantization
)

# Test the model
batch_size = 32
input_fp32 = torch.randn(1, batch_size, in_features)  # generate random input data
output_int8 = model_int8(input_fp32)                   # run the data through the quantized model

# Print the output shapes for verification
print(f"Input shape: {input_fp32.shape}")
print(f"Output shape: {output_int8.shape}")

# Compare the outputs of the original and quantized models
with torch.no_grad():
    output_fp32 = model_fp32(input_fp32)
    print(f"First few values of the FP32 output: {output_fp32[0, :5]}")
    print(f"First few values of the INT8 output: {output_int8[0, :5]}")

    # Compute the mean absolute error
    error = torch.abs(output_fp32 - output_int8).mean().item()
    print(f"Mean absolute error: {error}")

# Print model type information
print(f"Module type before quantization: {type(model_fp32.linear)}")
print(f"Module type after quantization: {type(model_int8.linear)}")
(no newline at end of file)
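The script above checks numerical error and module types. A related check one might add (a sketch, not part of this commit) is comparing serialized state_dict sizes, since dynamic int8 quantization mainly shrinks the Linear weights:

import io
import torch

def state_dict_size_bytes(model: torch.nn.Module) -> int:
    """Serialize a model's state_dict to an in-memory buffer and return its size."""
    buf = io.BytesIO()
    torch.save(model.state_dict(), buf)
    return buf.getbuffer().nbytes

fp32_model = torch.nn.Sequential(torch.nn.Linear(64, 128))
int8_model = torch.ao.quantization.quantize_dynamic(fp32_model, {torch.nn.Linear}, dtype=torch.qint8)

print(f"FP32 state_dict: {state_dict_size_bytes(fp32_model)} bytes")
print(f"INT8 state_dict: {state_dict_size_bytes(int8_model)} bytes")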
ktransformers/util/vendors.py  (new file, 0 → 100644, view file @ 52fa671c)

from __future__ import annotations

from enum import IntEnum, auto
from typing import Optional, Union, List
import torch


class GPUVendor(IntEnum):
    NVIDIA = auto()
    AMD = auto()
    MooreThreads = auto()
    MetaX = auto()
    MUSA = auto()
    Unknown = auto()


class DeviceManager:
    """
    Device manager that provides a unified interface for handling different GPU vendors
    """
    def __init__(self):
        self.gpu_vendor = self._detect_gpu_vendor()
        self.available_devices = self._get_available_devices()

    def _detect_gpu_vendor(self) -> GPUVendor:
        """Detect GPU vendor type"""
        if not torch.cuda.is_available():
            # Check MUSA availability (assuming a musa module exists)
            try:
                import musa
                if musa.is_available():
                    return GPUVendor.MUSA
            except (ImportError, AttributeError):
                pass
            return GPUVendor.Unknown

        device_name = torch.cuda.get_device_name(0).lower()

        if any(name in device_name for name in ["nvidia", "geforce", "quadro", "tesla", "titan", "rtx", "gtx"]):
            return GPUVendor.NVIDIA
        elif any(name in device_name for name in ["amd", "radeon", "rx", "vega", "instinct", "firepro", "mi"]):
            return GPUVendor.AMD
        elif any(name in device_name for name in ["mthreads", "moore", "mtt"]):
            return GPUVendor.MooreThreads
        elif any(name in device_name for name in ["metax", "meta"]):
            return GPUVendor.MetaX
        elif "musa" in device_name:
            return GPUVendor.MUSA

        # Backend check
        try:
            if hasattr(torch.version, 'hip') and torch.version.hip is not None:
                return GPUVendor.AMD
            elif hasattr(torch.version, 'cuda') and torch.version.cuda is not None:
                return GPUVendor.NVIDIA
        except:
            pass

        return GPUVendor.Unknown

    def _get_available_devices(self) -> List[int]:
        """Get list of available device indices"""
        devices = []

        if self.gpu_vendor == GPUVendor.NVIDIA or self.gpu_vendor == GPUVendor.AMD:
            devices = list(range(torch.cuda.device_count()))
        elif self.gpu_vendor == GPUVendor.MUSA:
            try:
                import musa
                devices = list(range(musa.device_count()))
            except (ImportError, AttributeError):
                pass

        return devices

    def get_device_str(self, device_id: Union[int, str]) -> str:
        """
        Get device string for the given device ID

        Args:
            device_id: Device index (0, 1, 2, etc.), -1 for CPU, or "cpu" string

        Returns:
            Device string representation (e.g., "cuda:0", "musa:1", "cpu")
        """
        if device_id == -1 or device_id == "cpu":
            return "cpu"

        if isinstance(device_id, int):
            if self.gpu_vendor == GPUVendor.NVIDIA or self.gpu_vendor == GPUVendor.AMD:
                if device_id < torch.cuda.device_count():
                    return f"cuda:{device_id}"
            elif self.gpu_vendor == GPUVendor.MUSA:
                try:
                    import musa
                    if device_id < musa.device_count():
                        return f"musa:{device_id}"
                except (ImportError, AttributeError):
                    pass

        return "cpu"

    def to_torch_device(self, device_id: Union[int, str] = 0) -> torch.device:
        """
        Convert device ID to torch.device object

        Args:
            device_id: Device index (0, 1, 2, etc.), -1 for CPU, or "cpu" string

        Returns:
            torch.device object
        """
        device_str = self.get_device_str(device_id)

        # Handle MUSA device
        if device_str.startswith("musa:"):
            try:
                import musa
                index = int(device_str.split(":")[-1])
                return musa.device(index)
            except (ImportError, ValueError, AttributeError):
                return torch.device("cpu")

        # Standard PyTorch device
        return torch.device(device_str)

    def move_tensor_to_device(self, tensor: torch.Tensor, device_id: Union[int, str] = 0) -> torch.Tensor:
        """
        Move tensor to specified device

        Args:
            tensor: PyTorch tensor to move
            device_id: Device index (0, 1, 2, etc.), -1 for CPU, or "cpu" string

        Returns:
            Tensor moved to the specified device
        """
        device = self.to_torch_device(device_id)
        return tensor.to(device)

    def is_available(self, index: int = 0) -> bool:
        """
        Check if device at specified index is available

        Args:
            index: Device index to check

        Returns:
            True if the device is available, False otherwise
        """
        if index < 0:
            return True  # CPU is always available
        return index in self.available_devices

    def get_all_devices(self) -> List[int]:
        """
        Get all available device indices

        Returns:
            List of available device indices (0, 1, 2, etc.)
        """
        return self.available_devices


# Create global device manager instance
device_manager = DeviceManager()


# Convenience functions
def get_device(device_id: Union[int, str] = 0) -> torch.device:
    """
    Get torch.device object for the specified device ID

    Args:
        device_id: Device index (0, 1, 2, etc.), -1 for CPU, or "cpu" string

    Returns:
        torch.device object
    """
    return device_manager.to_torch_device(device_id)


def to_device(tensor: torch.Tensor, device_id: Union[int, str] = 0) -> torch.Tensor:
    """
    Move tensor to specified device

    Args:
        tensor: PyTorch tensor to move
        device_id: Device index (0, 1, 2, etc.), -1 for CPU, or "cpu" string

    Returns:
        Tensor moved to the specified device
    """
    return device_manager.move_tensor_to_device(tensor, device_id)


# Get devices
cpu_device = get_device(-1)       # CPU using index -1
cpu_device2 = get_device("cpu")   # CPU using string "cpu"
gpu0 = get_device(0)              # First GPU

# Move tensors
x = torch.randn(3, 3)
x_gpu = to_device(x, 0)       # Move to first GPU
x_cpu1 = to_device(x, -1)     # Move to CPU using index -1
x_cpu2 = to_device(x, "cpu")  # Move to CPU using string "cpu"
(no newline at end of file)
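A short usage sketch for the module above (assuming it is importable as ktransformers.util.vendors; all names below come from the file itself), showing how calling code might resolve per-rank device strings regardless of vendor:

from ktransformers.util.vendors import device_manager, GPUVendor

print(f"Detected vendor: {device_manager.gpu_vendor.name}")
print(f"Available devices: {device_manager.get_all_devices()}")

# Resolve a device string per worker rank; get_device_str falls back to "cpu"
# when the index is out of range or no supported GPU is present.
for rank in range(4):
    print(f"rank {rank} -> {device_manager.get_device_str(rank)}")

if device_manager.gpu_vendor == GPUVendor.AMD:
    print("ROCm build: torch.cuda APIs map onto HIP devices")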
setup.py  (view file @ 52fa671c)

@@ -29,7 +29,7 @@ import torch.version
 from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
 from setuptools import setup, Extension
 from cpufeature.extension import CPUFeature
-from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME
 try:
     from torch_musa.utils.simple_porting import SimplePorting
     from torch_musa.utils.musa_extension import BuildExtension, MUSAExtension, MUSA_HOME
@@ -64,6 +64,70 @@ class VersionInfo:
         musa_version = f"{bare_metal_version.major}{bare_metal_version.minor}"
         return musa_version

+    def get_rocm_bare_metal_version(self, rocm_dir):
+        """
+        Get the ROCm version from the ROCm installation directory.
+
+        Args:
+            rocm_dir: Path to the ROCm installation directory
+
+        Returns:
+            A string representation of the ROCm version (e.g., "63" for ROCm 6.3)
+        """
+        try:
+            # Try using rocm_agent_enumerator to get version info
+            raw_output = subprocess.check_output(
+                [rocm_dir + "/bin/rocminfo", "--version"],
+                universal_newlines=True, stderr=subprocess.STDOUT)
+            # Extract version number from output
+            match = re.search(r'(\d+\.\d+)', raw_output)
+            if match:
+                version_str = match.group(1)
+                version = parse(version_str)
+                rocm_version = f"{version.major}{version.minor}"
+                return rocm_version
+        except (subprocess.CalledProcessError, FileNotFoundError):
+            # If rocminfo --version fails, try alternative methods
+            pass
+
+        try:
+            # Try reading version from release file
+            with open(os.path.join(rocm_dir, "share/doc/hip/version.txt"), "r") as f:
+                version_str = f.read().strip()
+                version = parse(version_str)
+                rocm_version = f"{version.major}{version.minor}"
+                return rocm_version
+        except (FileNotFoundError, IOError):
+            pass
+
+        # If all else fails, try to extract from directory name
+        dir_name = os.path.basename(os.path.normpath(rocm_dir))
+        match = re.search(r'rocm-(\d+\.\d+)', dir_name)
+        if match:
+            version_str = match.group(1)
+            version = parse(version_str)
+            rocm_version = f"{version.major}{version.minor}"
+            return rocm_version
+
+        # Fallback to extracting from hipcc version
+        try:
+            raw_output = subprocess.check_output(
+                [rocm_dir + "/bin/hipcc", "--version"],
+                universal_newlines=True, stderr=subprocess.STDOUT)
+            match = re.search(r'HIP version: (\d+\.\d+)', raw_output)
+            if match:
+                version_str = match.group(1)
+                version = parse(version_str)
+                rocm_version = f"{version.major}{version.minor}"
+                return rocm_version
+        except (subprocess.CalledProcessError, FileNotFoundError):
+            pass
+
+        # If we still can't determine the version, raise an error
+        raise ValueError(f"Could not determine ROCm version from directory: {rocm_dir}")
+
     def get_cuda_bare_metal_version(self, cuda_dir):
         raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
@@ -148,11 +212,13 @@ class VersionInfo:
         cpu_instruct = self.get_cpu_instruct()
         backend_version = ""
         if CUDA_HOME is not None:
-            backend_version = f"cu{self.get_cuda_bare_metal_version(CUDA_HOME)}"
+            backend_version = f""
         elif MUSA_HOME is not None:
             backend_version = f"mu{self.get_musa_bare_metal_version(MUSA_HOME)}"
+        elif ROCM_HOME is not None:
+            backend_version = f"rocm{self.get_rocm_bare_metal_version(ROCM_HOME)}"
         else:
-            raise ValueError("Unsupported backend: CUDA_HOME and MUSA_HOME are not set.")
+            raise ValueError("Unsupported backend: CUDA_HOME MUSA_HOME ROCM_HOME all not set.")
         package_version = f"{flash_version}+{backend_version}torch{torch_version}{cpu_instruct}"
         if full_version:
             return package_version
@@ -247,9 +313,13 @@ class CMakeBuild(BuildExtension):
             cmake_args += ["-DKTRANSFORMERS_USE_CUDA=ON"]
         elif MUSA_HOME is not None:
             cmake_args += ["-DKTRANSFORMERS_USE_MUSA=ON"]
+        elif ROCM_HOME is not None:
+            cmake_args += ["-DKTRANSFORMERS_USE_ROCM=ON"]
         else:
             raise ValueError("Unsupported backend: CUDA_HOME and MUSA_HOME are not set.")
+        # log cmake_args
+        print("CMake args:", cmake_args)

         build_args = []
         if "CMAKE_ARGS" in os.environ:
             cmake_args += [
@@ -328,7 +398,7 @@ class CMakeBuild(BuildExtension):
             ["cmake", "--build", ".", "--verbose", *build_args], cwd=build_temp, check=True
         )

-if CUDA_HOME is not None:
+if CUDA_HOME is not None or ROCM_HOME is not None:
     ops_module = CUDAExtension('KTransformersOps', [
         'ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu',
         'ktransformers/ktransformers_ext/cuda/binding.cpp',
@@ -338,7 +408,7 @@ if CUDA_HOME is not None:
             'cxx': ['-O3', '-DKTRANSFORMERS_USE_CUDA'],
             'nvcc': [
                 '-O3',
-                '--use_fast_math',
+                # '--use_fast_math',
                 '-Xcompiler', '-fPIC',
                 '-DKTRANSFORMERS_USE_CUDA',
             ]
@@ -371,6 +441,7 @@ else:
     raise ValueError("Unsupported backend: CUDA_HOME and MUSA_HOME are not set.")

 setup(
     name=VersionInfo.PACKAGE_NAME,
     version=VersionInfo().get_package_version(),
     cmdclass={"bdist_wheel": BuildWheelsCommand, "build_ext": CMakeBuild},
     ext_modules=[
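The new ROCm branch tags the wheel with the same major+minor scheme used for the CUDA and MUSA backends (for example "rocm63" for ROCm 6.3). A minimal sketch of that string handling, assuming the parse used above is packaging.version.parse:

from packaging.version import parse

for raw in ["6.3", "6.3.1", "5.7"]:
    v = parse(raw)
    tag = f"{v.major}{v.minor}"        # e.g. "63" for ROCm 6.3
    print(raw, "->", "rocm" + tag)     # package suffix such as "rocm63"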
third_party/llamafile/iqk_mul_mat.inc  (view file @ 52fa671c; comments translated from Chinese)

@@ -2385,7 +2385,12 @@ struct SimpleBits {
     __m256i values[4];
 };

+// fix for #829: add detection of AVX512VPOPCNTDQ
+#if defined(HAVE_FANCY_SIMD) && defined(__AVX512VPOPCNTDQ__)
+#define HAVE_AVX512_POPCNT 1
+#else
+#define HAVE_AVX512_POPCNT 0
+#endif

 struct EvenSignHelper {
 #if defined HAVE_FANCY_SIMD
@@ -2396,7 +2401,23 @@ struct EvenSignHelper {
     };

     IQK_ALWAYS_INLINE void sign_2_values(__m256i aux, __m256i * values) const {
         aux = _mm256_and_si256(_mm256_srlv_epi32(aux, shifts), mask);
-        auto pcnt = _mm256_popcnt_epi32(aux);
+        // fix for #829: compatibility with Intel Cascade Lake CPUs; if the AVX512VPOPCNTDQ
+        // extension is not supported, use an alternative implementation
+#if HAVE_AVX512_POPCNT
+        auto pcnt = _mm256_popcnt_epi32(aux);
+#else
+        // Alternative implementation using a standard bit-counting approach
+        __m256i pcnt;
+        int* pcnt_ptr = reinterpret_cast<int*>(&pcnt);
+        int* aux_ptr = reinterpret_cast<int*>(&aux);  // take aux's address directly to avoid an unnecessary copy
+#pragma unroll 8  // hint the compiler to unroll the loop to improve SIMD throughput
+        for (int i = 0; i < 8; i++) {
+            pcnt_ptr[i] = __builtin_popcount(aux_ptr[i]);  // use the compiler builtin popcount
+        }
+#endif
         sbits_t sbits;
         sbits.vec = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7)));
         values[0] = _mm256_mask_sub_epi8(values[0], sbits.mask[0], _mm256_setzero_si256(), values[0]);
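For reference, a small Python sketch (illustration only, not part of the patch) of what the scalar fallback computes: one population count per 32-bit lane, which is the result _mm256_popcnt_epi32 produces in a single instruction on CPUs with AVX512VPOPCNTDQ:

# Eight 32-bit lanes, as in an __m256i treated as packed epi32 values.
lanes = [0x00000000, 0x00000001, 0xFFFFFFFF, 0x80000001,
         0x0000FFFF, 0xAAAAAAAA, 0x12345678, 0x7FFFFFFF]

# Per-lane population count, matching the __builtin_popcount loop in the fallback.
popcounts = [bin(x & 0xFFFFFFFF).count("1") for x in lanes]
print(popcounts)  # [0, 1, 32, 2, 16, 16, 13, 31]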