[Ascend]: Fixed the issue where OOT Platform vllm-ascend could not enable SP in Eager mode (#28935)

Signed-off-by: leo-pony <nengjunma@outlook.com>

[Ascend]: Fixed the issue where OOT Platform vllm-ascend could not enable SP in Eager mode (#28935)
Signed-off-by: leo-pony <nengjunma@outlook.com>
eaf81485 · Nengjun Ma · GitHub · 38caf7fa · eaf81485 · eaf81485
Unverified Commit eaf81485 authored Dec 02, 2025 by Nengjun Ma Committed by GitHub Dec 01, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 15 additions and 2 deletions

vllm/config/compilation.py vllm/config/compilation.py +7 -0

vllm/config/vllm.py vllm/config/vllm.py +8 -2

No files found.
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -855,6 +855,13 @@ class CompilationConfig:
        self.compute_bs_to_padded_graph_size()
    def set_splitting_ops_for_v1(self):
+        # For compatibility with OOT hardware plugin platforms (e.g. vllm-ascend),
+        # which currently support sequence parallelism only in eager mode.
+        if self.mode != CompilationMode.VLLM_COMPILE:
+            if self.splitting_ops is None:
+                self.splitting_ops = []
+            return
        # NOTE: this function needs to be called only when mode is
        # CompilationMode.VLLM_COMPILE
        assert self.mode == CompilationMode.VLLM_COMPILE, (

--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -797,8 +797,7 @@ class VllmConfig:
        ), "MTP with cp_kv_cache_interleave_size > 1 is not supported now."
        # Do this after all the updates to compilation_config.mode
-        if self.compilation_config.mode == CompilationMode.VLLM_COMPILE:
+        self.compilation_config.set_splitting_ops_for_v1()
-            self.compilation_config.set_splitting_ops_for_v1()
        if self.compilation_config.pass_config.enable_sequence_parallelism:
            # With pipeline parallelism or dynamo partitioning,
@@ -806,6 +805,13 @@ class VllmConfig:
            # Use custom rms norm to unblock. In the future,
            # the pass will operate on higher-level IR to avoid the issue.
            # TODO: https://github.com/vllm-project/vllm/issues/27894
+            if self.compilation_config.mode != CompilationMode.VLLM_COMPILE:
+                logger.warning(
+                    "Sequence parallelism is enabled, but running in wrong "
+                    "vllm compile mode: %s.",
+                    self.compilation_config.mode,
+                )
            is_fullgraph = (
                self.compilation_config.use_inductor_graph_partition
                or len(self.compilation_config.splitting_ops) == 0