OpenDAS / ktransformers · Commits

Commit 27990dc6, authored Apr 28, 2025 by qiyuxinlin
fix load bug
Parent: 74bb7fdc

Showing 3 changed files with 4 additions and 2 deletions:

    ktransformers/operators/experts.py                             +2 -1
    ktransformers/server/balance_serve/inference/model_runner.py   +1 -1
    requirements-local_chat.txt                                     +1 -0
ktransformers/operators/experts.py

@@ -25,7 +25,6 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext
 sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Debug"))
 import cpuinfer_ext
 from cpuinfer_ext.moe import MOEConfig, MOE
-from cpuinfer_ext.moe import AMX_MOEConfig, AMXBF16_MOE, AMXInt8_MOE
 import ctypes
 from ktransformers.util.custom_gguf import GGMLQuantizationType, GGUFLoader
 from ktransformers.util.utils import InferenceState

@@ -186,6 +185,7 @@ class KExpertsCPU(KExpertsBase):
                 )
                 self.moe = MOE(moe_config)
             elif self.backend == "AMXBF16":
+                from cpuinfer_ext.moe import AMX_MOEConfig, AMXBF16_MOE
                 assert self.gate_type == GGMLQuantizationType.BF16
                 assert self.up_type == GGMLQuantizationType.BF16
                 assert self.down_type == GGMLQuantizationType.BF16

@@ -203,6 +203,7 @@ class KExpertsCPU(KExpertsBase):
                 self.cpu_infer.submit(self.moe.load_weights())
                 self.cpu_infer.sync()
             elif self.backend == "AMXInt8":
+                from cpuinfer_ext.moe import AMX_MOEConfig, AMXInt8_MOE
                 assert self.gate_type == GGMLQuantizationType.BF16
                 assert self.up_type == GGMLQuantizationType.BF16
                 assert self.down_type == GGMLQuantizationType.BF16
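The hunks above move the AMX-specific imports out of the module header and into the backend branches that use them, so importing experts.py (and therefore loading a model) no longer appears to require a cpuinfer_ext build that exports the AMX symbols. A minimal sketch of that deferred-import pattern, with a hypothetical ExpertsLoader class standing in for KExpertsCPU and the AMX symbols in cpuinfer_ext.moe assumed to exist only in AMX-enabled builds:

# Sketch only: ExpertsLoader is not part of ktransformers; it illustrates
# deferring an optional-extension import into the branch that needs it.
class ExpertsLoader:
    def __init__(self, backend: str):
        self.backend = backend

    def load(self) -> str:
        if self.backend == "llamafile":
            # Default path: needs only symbols every cpuinfer_ext build provides.
            return "generic MoE backend"
        elif self.backend == "AMXBF16":
            # Imported only when this backend is actually requested, so a build
            # without AMX support fails here with a clear ImportError instead of
            # breaking module import (and model load) for every backend.
            from cpuinfer_ext.moe import AMX_MOEConfig, AMXBF16_MOE  # assumed present in AMX builds
            return f"AMX BF16 MoE backend via {AMXBF16_MOE.__name__}"
        raise ValueError(f"unsupported backend: {self.backend}")

print(ExpertsLoader("llamafile").load())  # runs even when AMX symbols are absent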
ktransformers/server/balance_serve/inference/model_runner.py

@@ -85,7 +85,7 @@ class ModelRunner:
         elif isinstance(self.model, KQwen2MoeForCausalLM) or isinstance(self.model, KQwen3MoeForCausalLM):
             self.model.flash_infer_attn_plan(batch, self.bsz_tensor_buf, self.num_tokens_tensor_buf,
                 num_q_heads=self.model.config.num_attention_heads, num_kv_heads=self.model.config.num_key_value_heads,
-                head_dim=self.model.config.head_dim if hasattr(self.model.config, 'head_num') else self.model.config.hidden_size // self.model.config.num_attention_heads,
+                head_dim=self.model.config.head_dim if hasattr(self.model.config, 'head_dim') else self.model.config.hidden_size // self.model.config.num_attention_heads,
                 page_size=self.model.cache.page_size, causal=True,
                 q_data_type=torch.bfloat16, kv_data_type=torch.bfloat16, cuda_graph_idx=cuda_graph_idx)
         else:
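The single changed line above corrects the attribute probed by hasattr from 'head_num' to 'head_dim': with the old check, a config that defines head_dim but not head_num would always fall back to hidden_size // num_attention_heads when planning attention. A small self-contained illustration, with config values invented for the example rather than taken from any real model:

# Sketch only: SimpleNamespace stands in for the model config object.
from types import SimpleNamespace

config = SimpleNamespace(hidden_size=2048, num_attention_heads=32, head_dim=128)

# Old check: probes 'head_num', which this config does not define, so the
# explicitly configured head_dim is ignored and the fallback is always used.
old = config.head_dim if hasattr(config, "head_num") else config.hidden_size // config.num_attention_heads

# New check: probes 'head_dim' itself, so the configured value wins when present.
new = config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads

print(old, new)  # 64 128 -- the two differ whenever head_dim != hidden_size // num_attention_heads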
requirements-local_chat.txt

@@ -7,3 +7,4 @@ cpufeature; sys_platform == 'win32' or sys_platform == 'Windows'
 protobuf
 tiktoken
 blobfile
+triton==3.3
\ No newline at end of file