Unverified commit b706d898, authored by Cody Yu, committed by GitHub
Browse files

[Bugfix][V1][PP] Only warmup sampler at last PP rank (#14643)


Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
parent 863d315c
...@@ -14,6 +14,7 @@ from vllm.device_allocator.cumem import CuMemAllocator ...@@ -14,6 +14,7 @@ from vllm.device_allocator.cumem import CuMemAllocator
from vllm.distributed import (ensure_model_parallel_initialized, from vllm.distributed import (ensure_model_parallel_initialized,
init_distributed_environment, init_distributed_environment,
set_custom_all_reduce) set_custom_all_reduce)
from vllm.distributed.parallel_state import get_pp_group
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.model_executor import set_random_seed from vllm.model_executor import set_random_seed
...@@ -219,20 +220,22 @@ class Worker(WorkerBase): ...@@ -219,20 +220,22 @@ class Worker(WorkerBase):
# fragmentation issue. # fragmentation issue.
# NOTE: This is called after `capture_model` on purpose to prevent # NOTE: This is called after `capture_model` on purpose to prevent
# memory buffers from being cleared by `torch.cuda.empty_cache`. # memory buffers from being cleared by `torch.cuda.empty_cache`.
try: if get_pp_group().is_last_rank:
max_num_reqs = min(self.scheduler_config.max_num_seqs, try:
self.scheduler_config.max_num_batched_tokens) max_num_reqs = min(
self.model_runner._dummy_sampler_run( self.scheduler_config.max_num_seqs,
hidden_states=self.model_runner._dummy_run( self.scheduler_config.max_num_batched_tokens)
num_tokens=max_num_reqs)) self.model_runner._dummy_sampler_run(
except RuntimeError as e: hidden_states=self.model_runner._dummy_run(
if 'out of memory' in str(e): num_tokens=max_num_reqs))
raise RuntimeError( except RuntimeError as e:
"CUDA out of memory occurred when warming up sampler. " if 'out of memory' in str(e):
"Please try lowering `gpu_memory_utilization` when " raise RuntimeError(
"initializing the engine.") from None "CUDA out of memory occurred when warming up sampler. "
else: "Please try lowering `gpu_memory_utilization` when "
raise e "initializing the engine.") from None
else:
raise e
# Reset the seed to ensure that the random state is not affected by # Reset the seed to ensure that the random state is not affected by
# the model initialization and profiling. # the model initialization and profiling.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment