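恢复误删代码 (Restore accidentally deleted code: re-add the `enable_lightly_cp` all-gather and optional `index_select` of `k` in `Indexer`, deepseek_v2.py)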

fc345b74 · wanghl6 · 153002ad · fc345b74
Commit fc345b74 authored Apr 23, 2026 by wanghl6
Show whitespace changes
Inline Side-by-side

Showing with 21 additions and 0 deletions

vllm/model_executor/models/deepseek_v2.py vllm/model_executor/models/deepseek_v2.py +21 -0

No files found.
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -883,6 +883,17 @@ class Indexer(nn.Module):
                bias_k,
                eps
            )
+            
+            enable_lightly_cp = get_forward_context().enable_lightly_cp
+            if enable_lightly_cp:
+                k = tensor_model_parallel_all_gather(
+                    k.contiguous(), 0
+                )
+                gather_indexes_tensor = get_forward_context().gather_indexes_tensor
+                enable_lightly_cplb = get_forward_context().enable_lightly_cplb
+                if enable_lightly_cplb and gather_indexes_tensor is not None:
+                    k = torch.index_select(k, 0, gather_indexes_tensor)    
+        
            if current_platform.is_rocm() and torch.cuda.get_device_properties("cuda").gcnArchName.split(':')[0] != "gfx938":
                q_fp8 = q 
                q_scale = None
@@ -908,6 +919,16 @@ class Indexer(nn.Module):
            # `k_pe` is [num_tokens, 1, rope_dim] (MQA).
            k = torch.cat([k_pe.squeeze(-2), k_nope], dim=-1)

+            enable_lightly_cp = get_forward_context().enable_lightly_cp
+            if enable_lightly_cp:
+                k = tensor_model_parallel_all_gather(
+                    k.contiguous(), 0
+                )
+                gather_indexes_tensor = get_forward_context().gather_indexes_tensor
+                enable_lightly_cplb = get_forward_context().enable_lightly_cplb
+                if enable_lightly_cplb and gather_indexes_tensor is not None:
+                    k = torch.index_select(k, 0, gather_indexes_tensor)    
+
            # we only quant q here since k quant is fused with cache insertion
            if not current_platform.is_rocm() or torch.cuda.get_device_properties("cuda").gcnArchName.split(':')[0] == "gfx938":
                q = q.view(-1, self.head_dim)