Merge remote-tracking branch 'origin/v0.15.1-dev' into v0.15.1-dev-pcp

20254503 · 王敏 · 1e9ff2e7 · 3842b316 · 20254503 · 20254503
Commit 20254503 authored Apr 13, 2026 by 王敏
10 changed files
--- a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd_dp.py
+++ b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd_dp.py
--- a/requirements/rocm.txt
+++ b/requirements/rocm.txt
@@ -19,7 +19,7 @@ grpcio-tools>=1.76.0
 numa
 # pytrie
-cmake==3.29
+cmake==3.29.2
 quart
 fastrlock==0.8.3
 # cupy==12.3.0

--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -383,6 +383,24 @@ def fused_add_rms_norm_opt_fake(
 ) -> None:
    return None
+def silu_and_mul_opt_lightop(input: torch.Tensor) -> torch.Tensor:
+    """Call the ``silu_and_mul_opt_lightop`` custom op through ``torch.ops.vllm``.
+
+    Thin Python entry point: forwards ``input`` unchanged to
+    ``torch.ops.vllm.silu_and_mul_opt_lightop`` and returns whatever the op
+    returns. An op with this name is registered further down in this file
+    via ``direct_register_custom_op``.
+    """
+    return torch.ops.vllm.silu_and_mul_opt_lightop(input)
+def silu_and_mul_opt_lightop_impl(input: torch.Tensor) -> torch.Tensor:
+    """Real implementation registered for the ``silu_and_mul_opt_lightop`` op.
+
+    Allocates a new tensor shaped like ``input`` except that the last
+    dimension is ``input.shape[-1] // 2`` (same dtype and device), hands it
+    to ``op.silu_and_mul_opt`` together with ``input``, and returns it.
+
+    NOTE(review): ``op`` is not defined in the visible part of this change;
+    confirm it is imported at module level. The kernel presumably computes
+    ``silu(input[..., :d]) * input[..., d:]`` -- verify against the kernel.
+    """
+    d = input.shape[-1] // 2
+    output_shape = input.shape[:-1] + (d,)
+    out = torch.empty(output_shape, dtype=input.dtype, device=input.device)
+    # ``out`` is uninitialized here and returned right after this call, so
+    # the kernel is expected to fill it in place.
+    op.silu_and_mul_opt(out, input)
+    return out
+def silu_and_mul_opt_lightop_fake(input: torch.Tensor) -> torch.Tensor:
+    """Fake implementation registered for the ``silu_and_mul_opt_lightop`` op.
+
+    Runs no kernel: it only returns an uninitialized tensor created with
+    ``input.new_empty`` whose shape is ``input.shape`` with the last
+    dimension halved (integer division), mirroring the output shape built
+    by ``silu_and_mul_opt_lightop_impl``.
+    """
+    d = input.shape[-1] // 2
+    output_shape = input.shape[:-1] + (d,)
+    return input.new_empty(output_shape)
 def fused_qk_norm_rope(
    qkv: torch.Tensor,
    num_heads_q: int,
@@ -3631,6 +3649,13 @@ direct_register_custom_op(
    fake_impl=fused_add_rms_norm_opt_fake,
 )
+# Register the "silu_and_mul_opt_lightop" op: `silu_and_mul_opt_lightop_impl`
+# is the real implementation and `silu_and_mul_opt_lightop_fake` the fake one.
+# `mutates_args` is empty because the implementation returns a newly
+# allocated tensor instead of writing into any of its arguments.
+direct_register_custom_op(
+    op_name="silu_and_mul_opt_lightop",
+    op_func=silu_and_mul_opt_lightop_impl,
+    mutates_args=[],
+    fake_impl=silu_and_mul_opt_lightop_fake,
+)
 """
 qwen3-vl-8b中LLM的修改 rms+mrope dim==1  2026/03/18 
 """
@@ -3749,4 +3774,4 @@ direct_register_custom_op(
    mutates_args=["query","key"],
    fake_impl=rms_mrope_fuse_fake,
 )
\ No newline at end of file
--- a/vllm/distributed/kv_transfer/kv_connector/factory.py
+++ b/vllm/distributed/kv_transfer/kv_connector/factory.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import importlib
+from vllm import envs
 from collections.abc import Callable
 from typing import TYPE_CHECKING, Optional, cast
@@ -45,6 +46,7 @@ class KVConnectorFactory:
        config: "VllmConfig",
        role: KVConnectorRole,
        kv_cache_config: Optional["KVCacheConfig"] = None,
+        dp_rank: int = -1,
    ) -> KVConnectorBase:
        kv_transfer_config = config.kv_transfer_config
        if kv_transfer_config is None:
@@ -77,6 +79,8 @@ class KVConnectorFactory:
        if compat_sig:
            # Old signature: __init__(self, vllm_config, role)
            return connector_cls(config, role)
+        elif envs.VLLM_USE_DP_CONNECTOR:
+            return connector_cls(config, role, kv_cache_config, dp_rank)
        else:
            # New signature: __init__(self, vllm_config, role, kv_cache_config)
            return connector_cls(config, role, kv_cache_config)
@@ -160,6 +164,11 @@ KVConnectorFactory.register_connector(
    "vllm.distributed.kv_transfer.kv_connector.v1.du.du_swift_connector",
    "DuSwiftConnector")
+KVConnectorFactory.register_connector(
+    "DuSwiftConnectorDp",
+    "vllm.distributed.kv_transfer.kv_connector.v1.du.du_swift_connector_dp",
+    "DuSwiftConnectorDp")
 KVConnectorFactory.register_connector(
    "LMCacheConnectorV1",
    "vllm.distributed.kv_transfer.kv_connector.v1.lmcache_connector",

--- a/vllm/distributed/kv_transfer/kv_connector/v1/du/du_swift_connector_dp.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/du/du_swift_connector_dp.py
--- a/vllm/distributed/kv_transfer/kv_connector/v1/du/du_swift_engine_dp.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/du/du_swift_engine_dp.py
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1843,6 +1843,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # vllm will use rmsquant fused op
    "USE_FUSED_RMS_QUANT":
        lambda: bool(int(os.getenv("USE_FUSED_RMS_QUANT", "0"))),
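+    # If set, vllm uses the data-parallel (DP) KV connector path: the
+    # connector is constructed with the DP rank and the scheduler / engine
+    # core apply their DP-connector specific handling. Off by default.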
+    "VLLM_USE_DP_CONNECTOR":
+        lambda: bool(int(os.getenv("VLLM_USE_DP_CONNECTOR", "0"))),
    # vllm pd separation will be used async
    "VLLM_P2P_ASYNC":
    lambda: bool(int(os.getenv("VLLM_P2P_ASYNC", "0"))),

--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -150,13 +150,15 @@ class SiluAndMul(CustomOp):
        return F.silu(x[..., :d]) * x[..., d:]
    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        """CUDA forward pass.
+
+        When ``envs.VLLM_USE_OPT_OP`` is set, the input is passed to
+        ``ops.silu_and_mul_opt_lightop`` and that result is returned
+        directly. Otherwise an output tensor with the last dimension halved
+        (same dtype and device as ``x``) is allocated here, passed to
+        ``self.op`` together with ``x``, and returned.
+        """
-        d = x.shape[-1] // 2
-        output_shape = x.shape[:-1] + (d,)
-        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        if envs.VLLM_USE_OPT_OP:
+            # Function-scope import; NOTE(review): presumably to avoid an
+            # import cycle or import-time cost -- confirm.
+            from vllm import _custom_ops as ops
-            self.op_opt(out, x)
+            return ops.silu_and_mul_opt_lightop(x)
        else:
-            self.op(out, x) 
+            d = x.shape[-1] // 2
+            output_shape = x.shape[:-1] + (d,)
+            out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+            self.op(out, x)
        return out
    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:

--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -121,7 +121,7 @@ class Scheduler(SchedulerInterface):
                config=self.vllm_config,
                role=KVConnectorRole.SCHEDULER,
                kv_cache_config=self.kv_cache_config,
-            )
+                dp_rank=self.parallel_config.data_parallel_rank)
            if self.log_stats:
                self.connector_prefix_cache_stats = PrefixCacheStats()
            kv_load_failure_policy = (
@@ -556,6 +556,12 @@ class Scheduler(SchedulerInterface):
                    + len(scheduled_running_reqs) >= max_batch_running):
                        break
                request = self.waiting.peek_request()
+                if self.connector and not self.connector.is_producer and \
+                request.request_id not in self.finished_recving_kv_req_ids and \
+                envs.VLLM_USE_DP_CONNECTOR:
+                    self.waiting.pop_request()
+                    skipped_waiting_requests.prepend_request(request)
+                    continue
                # KVTransfer: skip request if still waiting for remote kvs.
                if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
                    is_ready = self._update_waiting_for_remote_kv(request)

--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -66,6 +66,7 @@ from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
 from vllm.v1.structured_output import StructuredOutputManager
 from vllm.v1.utils import compute_iteration_details
 from vllm.version import __version__ as VLLM_VERSION
+from vllm import envs
 logger = init_logger(__name__)
@@ -1155,6 +1156,11 @@ class EngineCoreProc(EngineCore):
                    # Push to input queue for core busy loop.
                    self.input_queue.put_nowait((request_type, request))
+                    if isinstance(request, tuple) and self.scheduler.connector is not None \
+                        and envs.VLLM_USE_DP_CONNECTOR:
+                        req, _ = request
+                        if request_type == EngineCoreRequestType.ADD:
+                            self.scheduler.connector.register_req(req.request_id)
    def process_output_sockets(
        self,