Unverified commit 690fe019, authored by cascade, committed by GitHub
Browse files

[Feature] support sequence parallelism using compilation pass (#16155)


Signed-off-by: cascade812 <cascade812@outlook.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
parent ed7a29d9
......@@ -1027,7 +1027,15 @@ class GPUModelRunner(LoRAModelRunnerMixin):
num_scheduled_tokens)
else:
# Eager mode.
num_input_tokens = num_scheduled_tokens
# Pad the token count to a multiple of tensor_parallel_size when
# collective fusion for sequence parallelism (SP) is enabled.
tp_size = self.vllm_config.parallel_config.tensor_parallel_size
if self.vllm_config.compilation_config.pass_config. \
enable_sequence_parallelism and tp_size > 1:
from vllm.utils import round_up
num_input_tokens = round_up(num_scheduled_tokens, tp_size)
else:
num_input_tokens = num_scheduled_tokens
attn_metadata.num_input_tokens = num_input_tokens
# _prepare_inputs may reorder the batch, so we must gather multi
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment