OpenDAS / ktransformers / Commits / 92106166

Commit 92106166, authored Apr 17, 2025 by root (parent: d2cf8142)

fix some bugs

Showing 3 changed files with 7 additions and 7 deletions (+7 -7):
- ktransformers/local_chat_test.py (+2 -2)
- ktransformers/operators/experts.py (+3 -3)
- ktransformers/operators/linear.py (+2 -2)
ktransformers/local_chat_test.py

```diff
@@ -158,12 +158,12 @@ def local_chat(
     if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM") and flashinfer_enabled and get_compute_capability() >= 8:
         generated = prefill_and_generate(
-            model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode=mode, force_think=force_think, chunk_prefill_size=chunk_prefill_size,
+            model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode=mode, force_think=force_think, chunk_size=chunk_prefill_size,
             use_flashinfer_mla=True, num_heads=config.num_attention_heads, head_dim_ckv=config.kv_lora_rank, head_dim_kpe=config.qk_rope_head_dim, q_head_dim=config.qk_rope_head_dim + config.qk_nope_head_dim
         )
     else:
         generated = prefill_and_generate(
-            model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode=mode, force_think=force_think, chunk_prefill_size=chunk_prefill_size,
+            model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode=mode, force_think=force_think, chunk_size=chunk_prefill_size,
         )
     break
```
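The only change here is the keyword passed to `prefill_and_generate`: `chunk_prefill_size=` becomes `chunk_size=`, while the local variable keeps its old name. If the callee takes `chunk_size` as a named parameter, the old keyword raises a `TypeError`; if it instead absorbs unknown keywords via `**kwargs`, the chunk size would be silently ignored. Either way the value never arrived. A minimal sketch of the first failure mode, with an assumed signature rather than the real one from ktransformers:

```python
# Assumed signature for illustration only; not the real prefill_and_generate.
def prefill_and_generate(model, tokenizer, input_ids, max_new_tokens,
                         use_cuda_graph, mode="normal", force_think=False,
                         chunk_size=8192):
    return f"prefilling in chunks of {chunk_size}"

try:
    # Old call site: the callee has no parameter named chunk_prefill_size.
    prefill_and_generate(None, None, None, 128, False, chunk_prefill_size=1024)
except TypeError as e:
    print(e)  # got an unexpected keyword argument 'chunk_prefill_size'

# Fixed call site: the value actually reaches the chunking logic.
print(prefill_and_generate(None, None, None, 128, False, chunk_size=1024))
```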
ktransformers/operators/experts.py

```diff
@@ -680,9 +680,9 @@ class KQwen2MoeSparseMoeBlock(BaseInjectedModule, Qwen2MoeSparseMoeBlock):
             y.resize_(*orig_shape)
             return y, router_logits
 
-        hidden_states_expert = hidden_states.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else hidden_states_expert.cpu()
-        selected_experts_expert = selected_experts.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else selected_experts_expert.cpu()
-        routing_weights_expert = routing_weights.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else routing_weights_expert.cpu()
+        hidden_states_expert = hidden_states.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else hidden_states.cpu()
+        selected_experts_expert = selected_experts.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else selected_experts.cpu()
+        routing_weights_expert = routing_weights.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else routing_weights.cpu()
 
         shared_expert_output = self.shared_expert(hidden_states)
         shared_expert_output = (
```
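In the `else` branch of each conditional expression, the original code read the very name being assigned (`hidden_states_expert`, `selected_experts_expert`, `routing_weights_expert`) before it existed, instead of the input tensor. A standalone reproduction of that pattern, not the actual `KQwen2MoeSparseMoeBlock` code:

```python
import torch

hidden_states = torch.randn(4, 8)
use_gpu_experts = False  # stands in for isinstance(self.experts, KExpertsBase)

try:
    # Buggy pattern: the else branch reads the name being assigned, which does
    # not exist yet (inside a method like forward() this is UnboundLocalError,
    # a subclass of NameError).
    hidden_states_expert = (
        hidden_states.to("cuda") if use_gpu_experts else hidden_states_expert.cpu()
    )
except NameError as e:
    print(e)  # name 'hidden_states_expert' is not defined

# Fixed pattern: both branches derive from the input tensor.
hidden_states_expert = (
    hidden_states.to("cuda") if use_gpu_experts else hidden_states.cpu()
)
print(hidden_states_expert.device)  # cpu
```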
ktransformers/operators/linear.py

```diff
@@ -138,7 +138,7 @@ class KLinearTorch(KLinearBase):
         self.weight = None
         self.has_bias = False
 
-    def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
+    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor = None, **kwargs) -> torch.Tensor:
         dtype = x.dtype
         out_device = x.device
         # TODO: support CUDA Graph when using cpu, but CPUInfer is recommended.
@@ -201,7 +201,7 @@ class KLinearQ8(KLinearBase):
         self.bias = None
         self.loaded = False
 
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor = None) -> torch.Tensor:
         orig_dtype = x.dtype
         out_device = x.device
```
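Both `forward` signatures gain an optional `bsz_tensor` keyword, which lets call sites that uniformly pass a batch-size tensor work against these layers as well; whether the tensor is then used is backend-specific. Before this commit, `KLinearQ8.forward` (which has no `**kwargs`) would raise a `TypeError` on such a call, while `KLinearTorch.forward` would silently swallow the keyword into `**kwargs`. A hedged sketch with a stand-in class, not the real `KLinearTorch`/`KLinearQ8`:

```python
from typing import Optional
import torch

class KLinearSketch:
    """Stand-in for KLinearTorch/KLinearQ8; the matmul body is a placeholder."""

    def forward(self, x: torch.Tensor,
                bsz_tensor: Optional[torch.Tensor] = None) -> torch.Tensor:
        # bsz_tensor is accepted for call-site compatibility; a backend that
        # does not need an explicit batch-size tensor can simply ignore it.
        return x

layer = KLinearSketch()
x = torch.randn(2, 16)
# A caller that uniformly passes bsz_tensor no longer risks:
#   TypeError: forward() got an unexpected keyword argument 'bsz_tensor'
out = layer.forward(x, bsz_tensor=torch.tensor([x.shape[0]]))
print(out.shape)  # torch.Size([2, 16])
```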