Merge branch 'v0.9.2-dev' into v0.9.2-dev-ds

dbb27c0e · zhuwenwen · 7904da3f · 3320343d · dbb27c0e · dbb27c0e
Commit dbb27c0e authored Sep 23, 2025 by zhuwenwen
6 changed files
--- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
@@ -289,9 +289,10 @@ class P2pNcclConnector(KVConnectorBase_V1):
                                             kv_cache, remote_address)

    def wait_for_save(self):
-        if self.is_producer:
-            assert self.p2p_nccl_engine is not None
-            self.p2p_nccl_engine.wait_for_sent()
+        pass
+        # if self.is_producer:
+        #     assert self.p2p_nccl_engine is not None
+        #     self.p2p_nccl_engine.wait_for_sent()

    def get_finished(
            self, finished_req_ids: set[str],

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -168,7 +168,8 @@ if TYPE_CHECKING:
    VLLM_USE_TRITON_CAT: bool = False
    VLLM_USE_MERGE_ATTN_STATES_OPT: bool = False
    USE_FUSED_RMS_QUANT: bool = False
-    VLLM_USE_DEEPSEEK_MOE_SUM_MUL_AND: bool = False
+    VLLM_USE_DEEPSEEK_MOE_SUM_MUL_ADD: bool = False
+    USE_FUSED_SILU_MUL_QUANT: bool = False

 def get_default_cache_root():
    return os.getenv(
@@ -1110,9 +1111,15 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "USE_FUSED_RMS_QUANT": 
    lambda: (os.getenv('USE_FUSED_RMS_QUANT', '0').lower() in
             ("true", "1")),
+
    # vllm will use lightop's moe_sum fusion operator for deepseek
-    "VLLM_USE_DEEPSEEK_MOE_SUM_MUL_AND": 
-    lambda: (os.getenv('VLLM_USE_DEEPSEEK_MOE_SUM_MUL_AND', 'True').lower() in
+    "VLLM_USE_DEEPSEEK_MOE_SUM_MUL_ADD": 
+    lambda: (os.getenv('VLLM_USE_DEEPSEEK_MOE_SUM_MUL_ADD', 'True').lower() in
+             ("true", "1")),
+
+    # vllm will use the fused silu_mul_quant op (off unless set to "true" or "1")
+    "USE_FUSED_SILU_MUL_QUANT": 
+    lambda: (os.getenv('USE_FUSED_SILU_MUL_QUANT', '0').lower() in
             ("true", "1")),
 }


--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1760,7 +1760,7 @@ def fused_experts_impl(
                                block_shape=block_shape,
                                use_nn_moe=use_nn_moe)

-        if envs.VLLM_USE_DEEPSEEK_MOE_SUM_MUL_AND:
+        if envs.VLLM_USE_DEEPSEEK_MOE_SUM_MUL_ADD:
            if envs.VLLM_USE_LIGHT_OP and not dpsk_fp16_quick: 
                if shared_output is not None:
                    op.moe_sum(intermediate_cache3.view(*intermediate_cache3.size()),

--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -38,6 +38,12 @@ if envs.USE_FUSED_RMS_QUANT:
        from lmslim.quantize.quant_ops import lm_faster_rmsquant
    except Exception as e:
        print(f"Error: Import fused rmsquant error: {e}") 
+if envs.USE_FUSED_SILU_MUL_QUANT:        
+    try:
+        # from lightop import fuse_silu_mul_quant
+        from lmslim.quantize.quant_ops import lm_fuse_silu_mul_quant
+    except Exception as e:
+        print(f"Error: Import fused silu_mul_qunat error: {e}")
        
 logger = init_logger(__name__)

@@ -1488,7 +1494,8 @@ class RowParallelLinear(LinearBase):
        param.load_row_parallel_weight(loaded_weight=loaded_weight)

    def forward(
-        self, input_
+        self, input_,
+        use_fused_silu_mul_quant: Optional[bool] = False
    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
        if self.input_is_parallel:
            input_parallel = input_
@@ -1503,6 +1510,15 @@ class RowParallelLinear(LinearBase):
        # Only fuse bias add into GEMM for rank 0 (this ensures that
        # bias will not get added more than once in TP>1 case)
        bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias
+        if use_fused_silu_mul_quant:
+            xq, xs = lm_fuse_silu_mul_quant(input_parallel)
+            
+            silu_quant_args = [xq, xs]
+            output_parallel = self.quant_method.apply(self,
+                                                      input_parallel,
+                                                      bias=bias_,
+                                                      silu_quant_args=silu_quant_args)
+        else:
            output_parallel = self.quant_method.apply(self,
                                                      input_parallel,
                                                      bias=bias_)

--- a/vllm/model_executor/layers/quantization/slimquant_w4a8.py
+++ b/vllm/model_executor/layers/quantization/slimquant_w4a8.py
@@ -154,11 +154,14 @@ class SlimQuantW4A8Int8LinearMethod(LinearMethodBase):
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: Optional[torch.Tensor] = None,
-        input_quant_args: Optional[list[torch.Tensor]] = None  
+        input_quant_args: Optional[list[torch.Tensor]] = None,
+        silu_quant_args: Optional[list[torch.Tensor]] = None
    ):
        if envs.USE_FUSED_RMS_QUANT and input_quant_args is not None:
            assert len(input_quant_args) == 2
            x_q, x_scale = input_quant_args
+        elif envs.USE_FUSED_SILU_MUL_QUANT and silu_quant_args is not None:
+            x_q, x_scale = silu_quant_args
        else:
            x_q, x_scale = per_token_quant_int8(x)


--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -101,8 +101,12 @@ class DeepseekV2MLP(nn.Module):
                ):
        if envs.USE_FUSED_RMS_QUANT:
            gate_up, new_resi, _  = self.gate_up_proj(x, rms_weight, residual, update_hd=update_hd)
+            if envs.USE_FUSED_SILU_MUL_QUANT:
+                x, _ = self.down_proj(gate_up, use_fused_silu_mul_quant=True)
+            else:
                x = self.act_fn(gate_up)
                x, _ = self.down_proj(x)
+                
            return x, new_resi
        else:
            gate_up, _ = self.gate_up_proj(x)