OpenDAS / ktransformers · Commits

Commit 3efb6621 (unverified)
Authored Apr 17, 2025 by wang jiahao; committed by GitHub on Apr 17, 2025

Merge pull request #1157 from jiangshibiao/dev-fix-bug

Add bsz_tensors param to torch linear

Parents: d2cf8142, 92106166
Showing 3 changed files, with 7 additions and 7 deletions:

  ktransformers/local_chat_test.py    +2 -2
  ktransformers/operators/experts.py  +3 -3
  ktransformers/operators/linear.py   +2 -2
ktransformers/local_chat_test.py

@@ -158,12 +158,12 @@ def local_chat(
     if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM") and flashinfer_enabled and get_compute_capability() >= 8:
         generated = prefill_and_generate(
-            model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode=mode, force_think=force_think, chunk_prefill_size=chunk_prefill_size,
+            model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode=mode, force_think=force_think, chunk_size=chunk_prefill_size,
             use_flashinfer_mla=True, num_heads=config.num_attention_heads, head_dim_ckv=config.kv_lora_rank, head_dim_kpe=config.qk_rope_head_dim, q_head_dim=config.qk_rope_head_dim + config.qk_nope_head_dim
         )
     else:
         generated = prefill_and_generate(
-            model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode=mode, force_think=force_think, chunk_prefill_size=chunk_prefill_size,
+            model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode=mode, force_think=force_think, chunk_size=chunk_prefill_size,
         )
     break
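Note: the only change in both call sites is the keyword. prefill_and_generate declares the parameter as chunk_size, so passing the caller's local variable under its own name, chunk_prefill_size=, matches no declared parameter. A minimal, self-contained sketch of the failure mode, using a reduced, hypothetical signature in place of the real prefill_and_generate:

# Reduced, hypothetical signature standing in for ktransformers'
# prefill_and_generate; the real function takes many more parameters.
def prefill_and_generate(model, tokenizer, input_ids, max_new_tokens,
                         use_cuda_graph, mode="normal", force_think=False,
                         chunk_size=8192):  # 8192 is an arbitrary default
    return f"ok, chunk_size={chunk_size}"

chunk_prefill_size = 8192  # caller-side variable, as in local_chat_test.py

# Before the fix: the keyword follows the variable's name, which the callee
# never declared, so Python rejects the call.
try:
    prefill_and_generate(None, None, None, 64, False,
                         chunk_prefill_size=chunk_prefill_size)
except TypeError as e:
    print(e)  # got an unexpected keyword argument 'chunk_prefill_size'

# After the fix: same value, passed under the declared parameter name.
print(prefill_and_generate(None, None, None, 64, False,
                           chunk_size=chunk_prefill_size))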
ktransformers/operators/experts.py

@@ -680,9 +680,9 @@ class KQwen2MoeSparseMoeBlock(BaseInjectedModule, Qwen2MoeSparseMoeBlock):
             y.resize_(*orig_shape)
             return y, router_logits

-        hidden_states_expert = hidden_states.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else hidden_states_expert.cpu()
-        selected_experts_expert = selected_experts.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else selected_experts_expert.cpu()
-        routing_weights_expert = routing_weights.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else routing_weights_expert.cpu()
+        hidden_states_expert = hidden_states.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else hidden_states.cpu()
+        selected_experts_expert = selected_experts.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else selected_experts.cpu()
+        routing_weights_expert = routing_weights.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else routing_weights.cpu()

         shared_expert_output = self.shared_expert(hidden_states)
         shared_expert_output = (
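Note: in the removed lines, the else branch of each conditional expression reads the very name the statement is binding for the first time (hidden_states_expert, selected_experts_expert, routing_weights_expert), so the CPU fallback path raised NameError. The fix derives both branches from the already-bound inputs. A self-contained sketch of the bug class, with a plain boolean standing in for isinstance(self.experts, KExpertsBase):

import torch

hidden_states = torch.randn(2, 4)
experts_on_device = False  # stands in for isinstance(self.experts, KExpertsBase)

# Buggy form (removed lines): the else branch reads hidden_states_expert,
# but this statement is that name's first binding, so it raises NameError.
try:
    hidden_states_expert = (
        hidden_states.to("cuda") if experts_on_device
        else hidden_states_expert.cpu()
    )
except NameError as e:
    print(e)  # name 'hidden_states_expert' is not defined

# Fixed form (added lines): both branches derive from the already-bound
# hidden_states, so the CPU fallback works.
hidden_states_expert = (
    hidden_states.to("cuda") if experts_on_device else hidden_states.cpu()
)
print(hidden_states_expert.device)  # cpu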
ktransformers/operators/linear.py

@@ -138,7 +138,7 @@ class KLinearTorch(KLinearBase):
         self.weight = None
         self.has_bias = False

-    def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
+    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor = None, **kwargs) -> torch.Tensor:
         dtype = x.dtype
         out_device = x.device
         # TODO: support CUDA Graph when using cpu, but CPUInfer is recommended.

@@ -201,7 +201,7 @@ class KLinearQ8(KLinearBase):
         self.bias = None
         self.loaded = False

-    def forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor = None) -> torch.Tensor:
         orig_dtype = x.dtype
         out_device = x.device
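Note: both forward signatures gain an optional bsz_tensor parameter defaulting to None, per the commit title. That keeps these backends call-compatible with dispatch code that forwards a batch-size tensor, while leaving existing callers untouched. A minimal sketch of the pattern, using a toy linear layer rather than the real KLinearTorch/KLinearQ8:

import torch

class TinyLinear:
    """Toy stand-in for a KLinearBase backend; not the real implementation."""

    def __init__(self, in_features: int, out_features: int):
        self.weight = torch.randn(out_features, in_features)

    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor = None,
                **kwargs) -> torch.Tensor:
        # bsz_tensor is accepted purely for interface parity with callers
        # that now pass a batch-size tensor; this sketch does not use it.
        return x @ self.weight.T

layer = TinyLinear(4, 3)
x = torch.randn(2, 4)
assert layer.forward(x).shape == (2, 3)                     # old-style call
assert layer.forward(x, torch.tensor([2])).shape == (2, 3)  # new-style call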