OpenDAS / ktransformers · Commits

Commit 3efb6621 (unverified)
Authored Apr 17, 2025 by wang jiahao; committed by GitHub on Apr 17, 2025

Merge pull request #1157 from jiangshibiao/dev-fix-bug

Add bsz_tensors param to torch linear

Parents: d2cf8142, 92106166
Showing 3 changed files, with 7 additions and 7 deletions:

  ktransformers/local_chat_test.py    +2 -2
  ktransformers/operators/experts.py  +3 -3
  ktransformers/operators/linear.py   +2 -2
ktransformers/local_chat_test.py

@@ -158,12 +158,12 @@ def local_chat(
     if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM") and flashinfer_enabled and get_compute_capability() >= 8:
         generated = prefill_and_generate(
-            model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode=mode, force_think=force_think, chunk_prefill_size=chunk_prefill_size,
+            model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode=mode, force_think=force_think, chunk_size=chunk_prefill_size,
             use_flashinfer_mla=True, num_heads=config.num_attention_heads, head_dim_ckv=config.kv_lora_rank, head_dim_kpe=config.qk_rope_head_dim, q_head_dim=config.qk_rope_head_dim + config.qk_nope_head_dim
         )
     else:
         generated = prefill_and_generate(
-            model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode=mode, force_think=force_think, chunk_prefill_size=chunk_prefill_size,
+            model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode=mode, force_think=force_think, chunk_size=chunk_prefill_size,
         )
     break
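Note: the only change in both call sites is the keyword. prefill_and_generate declares the parameter as chunk_size, so passing the caller's local variable under its own name, chunk_prefill_size=, matches no declared parameter. A minimal, self-contained sketch of the failure mode, using a reduced, hypothetical signature in place of the real prefill_and_generate:

# Reduced, hypothetical signature standing in for ktransformers'
# prefill_and_generate; the real function takes many more parameters.
def prefill_and_generate(model, tokenizer, input_ids, max_new_tokens,
                         use_cuda_graph, mode="normal", force_think=False,
                         chunk_size=8192):  # 8192 is an arbitrary default
    return f"ok, chunk_size={chunk_size}"

chunk_prefill_size = 8192  # caller-side variable, as in local_chat_test.py

# Before the fix: the keyword follows the variable's name, which the callee
# never declared, so Python rejects the call.
try:
    prefill_and_generate(None, None, None, 64, False,
                         chunk_prefill_size=chunk_prefill_size)
except TypeError as e:
    print(e)  # got an unexpected keyword argument 'chunk_prefill_size'

# After the fix: same value, passed under the declared parameter name.
print(prefill_and_generate(None, None, None, 64, False,
                           chunk_size=chunk_prefill_size))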
ktransformers/operators/experts.py

@@ -680,9 +680,9 @@ class KQwen2MoeSparseMoeBlock(BaseInjectedModule, Qwen2MoeSparseMoeBlock):
             y.resize_(*orig_shape)
             return y, router_logits

-        hidden_states_expert = hidden_states.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else hidden_states_expert.cpu()
-        selected_experts_expert = selected_experts.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else selected_experts_expert.cpu()
-        routing_weights_expert = routing_weights.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else routing_weights_expert.cpu()
+        hidden_states_expert = hidden_states.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else hidden_states.cpu()
+        selected_experts_expert = selected_experts.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else selected_experts.cpu()
+        routing_weights_expert = routing_weights.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else routing_weights.cpu()

         shared_expert_output = self.shared_expert(hidden_states)
         shared_expert_output = (
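Note: in the removed lines, the else branch of each conditional expression reads the very name the statement is binding for the first time (hidden_states_expert, selected_experts_expert, routing_weights_expert), so the CPU fallback path raised NameError. The fix derives both branches from the already-bound inputs. A self-contained sketch of the bug class, with a plain boolean standing in for isinstance(self.experts, KExpertsBase):

import torch

hidden_states = torch.randn(2, 4)
experts_on_device = False  # stands in for isinstance(self.experts, KExpertsBase)

# Buggy form (removed lines): the else branch reads hidden_states_expert,
# but this statement is that name's first binding, so it raises NameError.
try:
    hidden_states_expert = (
        hidden_states.to("cuda") if experts_on_device
        else hidden_states_expert.cpu()
    )
except NameError as e:
    print(e)  # name 'hidden_states_expert' is not defined

# Fixed form (added lines): both branches derive from the already-bound
# hidden_states, so the CPU fallback works.
hidden_states_expert = (
    hidden_states.to("cuda") if experts_on_device else hidden_states.cpu()
)
print(hidden_states_expert.device)  # cpu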
ktransformers/operators/linear.py

@@ -138,7 +138,7 @@ class KLinearTorch(KLinearBase):
         self.weight = None
         self.has_bias = False

-    def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
+    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor = None, **kwargs) -> torch.Tensor:
         dtype = x.dtype
         out_device = x.device
         # TODO: support CUDA Graph when using cpu, but CPUInfer is recommended.

@@ -201,7 +201,7 @@ class KLinearQ8(KLinearBase):
         self.bias = None
         self.loaded = False

-    def forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor = None) -> torch.Tensor:
         orig_dtype = x.dtype
         out_device = x.device
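Note: both forward signatures gain an optional bsz_tensor parameter defaulting to None, per the commit title. That keeps these backends call-compatible with dispatch code that forwards a batch-size tensor, while leaving existing callers untouched. A minimal sketch of the pattern, using a toy linear layer rather than the real KLinearTorch/KLinearQ8:

import torch

class TinyLinear:
    """Toy stand-in for a KLinearBase backend; not the real implementation."""

    def __init__(self, in_features: int, out_features: int):
        self.weight = torch.randn(out_features, in_features)

    def forward(self, x: torch.Tensor, bsz_tensor: torch.Tensor = None,
                **kwargs) -> torch.Tensor:
        # bsz_tensor is accepted purely for interface parity with callers
        # that now pass a batch-size tensor; this sketch does not use it.
        return x @ self.weight.T

layer = TinyLinear(4, 3)
x = torch.randn(2, 4)
assert layer.forward(x).shape == (2, 3)                     # old-style call
assert layer.forward(x, torch.tensor([2])).shape == (2, 3)  # new-style call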