Unverified commit eaf81485, authored by Nengjun Ma, committed by GitHub
Browse files

[Ascend]: Fixed the issue where OOT Platform vllm-ascend could not enable SP in Eager mode (#28935)


Signed-off-by: leo-pony <nengjunma@outlook.com>
parent 38caf7fa
...@@ -855,6 +855,13 @@ class CompilationConfig: ...@@ -855,6 +855,13 @@ class CompilationConfig:
self.compute_bs_to_padded_graph_size() self.compute_bs_to_padded_graph_size()
def set_splitting_ops_for_v1(self): def set_splitting_ops_for_v1(self):
# To be compatible with the out-of-tree (OOT) hardware plugin platform (for example
# vllm-ascend), which currently only supports sequence parallelism in eager mode.
if self.mode != CompilationMode.VLLM_COMPILE:
if self.splitting_ops is None:
self.splitting_ops = []
return
# NOTE: this function needs to be called only when mode is # NOTE: this function needs to be called only when mode is
# CompilationMode.VLLM_COMPILE # CompilationMode.VLLM_COMPILE
assert self.mode == CompilationMode.VLLM_COMPILE, ( assert self.mode == CompilationMode.VLLM_COMPILE, (
......
...@@ -797,8 +797,7 @@ class VllmConfig: ...@@ -797,8 +797,7 @@ class VllmConfig:
), "MTP with cp_kv_cache_interleave_size > 1 is not supported now." ), "MTP with cp_kv_cache_interleave_size > 1 is not supported now."
# Do this after all the updates to compilation_config.mode # Do this after all the updates to compilation_config.mode
if self.compilation_config.mode == CompilationMode.VLLM_COMPILE: self.compilation_config.set_splitting_ops_for_v1()
self.compilation_config.set_splitting_ops_for_v1()
if self.compilation_config.pass_config.enable_sequence_parallelism: if self.compilation_config.pass_config.enable_sequence_parallelism:
# With pipeline parallelism or dynamo partitioning, # With pipeline parallelism or dynamo partitioning,
...@@ -806,6 +805,13 @@ class VllmConfig: ...@@ -806,6 +805,13 @@ class VllmConfig:
# Use custom rms norm to unblock. In the future, # Use custom rms norm to unblock. In the future,
# the pass will operate on higher-level IR to avoid the issue. # the pass will operate on higher-level IR to avoid the issue.
# TODO: https://github.com/vllm-project/vllm/issues/27894 # TODO: https://github.com/vllm-project/vllm/issues/27894
if self.compilation_config.mode != CompilationMode.VLLM_COMPILE:
logger.warning(
"Sequence parallelism is enabled, but running in wrong "
"vllm compile mode: %s.",
self.compilation_config.mode,
)
is_fullgraph = ( is_fullgraph = (
self.compilation_config.use_inductor_graph_partition self.compilation_config.use_inductor_graph_partition
or len(self.compilation_config.splitting_ops) == 0 or len(self.compilation_config.splitting_ops) == 0
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment