Unverified commit 43fada53, authored by Woosuk Kwon and committed by GitHub
Browse files

[Model Runner V2] Refactor `dummy_run` (#32533)


Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
parent 4a5299c9
...@@ -13,7 +13,6 @@ from vllm.config.compilation import CUDAGraphMode ...@@ -13,7 +13,6 @@ from vllm.config.compilation import CUDAGraphMode
from vllm.distributed.parallel_state import graph_capture, is_global_first_rank from vllm.distributed.parallel_state import graph_capture, is_global_first_rank
from vllm.forward_context import set_forward_context from vllm.forward_context import set_forward_context
from vllm.v1.attention.backend import AttentionMetadataBuilder from vllm.v1.attention.backend import AttentionMetadataBuilder
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.worker.gpu.attn_utils import build_attn_metadata from vllm.v1.worker.gpu.attn_utils import build_attn_metadata
from vllm.v1.worker.gpu.block_table import BlockTables from vllm.v1.worker.gpu.block_table import BlockTables
...@@ -60,12 +59,12 @@ class CudaGraphManager: ...@@ -60,12 +59,12 @@ class CudaGraphManager:
def get_cudagraph_size( def get_cudagraph_size(
self, self,
scheduler_output: SchedulerOutput,
num_tokens_after_padding: int, num_tokens_after_padding: int,
num_tokens_per_request: Iterable[int],
) -> int | None: ) -> int | None:
return get_cudagraph_size( return get_cudagraph_size(
num_tokens_after_padding, num_tokens_after_padding,
scheduler_output.num_scheduled_tokens.values(), num_tokens_per_request,
self.cudagraph_sizes, self.cudagraph_sizes,
self.cudagraph_mode, self.cudagraph_mode,
) )
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc import gc
import time import time
from collections.abc import Iterable
from copy import deepcopy from copy import deepcopy
from typing import Any from typing import Any
...@@ -288,47 +289,25 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -288,47 +289,25 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
skip_attn: bool = True, skip_attn: bool = True,
**kwargs, **kwargs,
) -> tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
# Create a dummy scheduler output.
num_reqs = min(num_tokens, self.max_num_reqs) num_reqs = min(num_tokens, self.max_num_reqs)
input_batch = InputBatch.make_dummy( num_tokens_per_request = [num_tokens // num_reqs] * num_reqs
num_reqs=num_reqs, num_tokens_per_request[-1] += num_tokens % num_reqs
num_tokens=num_tokens, assert sum(num_tokens_per_request) == num_tokens
input_buffers=self.input_buffers, num_scheduled_tokens = {
device=self.device, f"_dummy_req_{i}": num_tokens_per_request[i] for i in range(num_reqs)
}
dummy_scheduler_output = SchedulerOutput.make_empty()
dummy_scheduler_output.total_num_scheduled_tokens = num_tokens
dummy_scheduler_output.num_scheduled_tokens = num_scheduled_tokens
# Execute the model.
self.execute_model(
dummy_scheduler_output, dummy_run=True, skip_attn_for_dummy_run=skip_attn
) )
if self.uses_mrope: assert self.execute_model_state is not None
input_batch.mrope_positions = self.mrope_states.mrope_positions[ hidden_states, input_batch = self.execute_model_state
:, :num_tokens sample_hidden_states = hidden_states[input_batch.logits_indices]
]
if self.supports_mm_inputs:
input_batch.inputs_embeds = self.encoder_runner.inputs_embeds[:num_tokens]
if not skip_attn:
self.prepare_dummy_attn_metadata(input_batch)
dp_size = self.parallel_config.data_parallel_size
num_tokens_across_dp = make_num_tokens_across_dp(dp_size, num_tokens)
num_sampled_tokens = np.ones(input_batch.num_reqs, dtype=np.int32)
positions = input_batch.positions
if self.uses_mrope:
positions = input_batch.mrope_positions
with (
self.maybe_dummy_run_with_lora(
self.lora_config,
input_batch.num_scheduled_tokens,
num_sampled_tokens,
),
set_forward_context(
input_batch.attn_metadata,
self.vllm_config,
num_tokens=num_tokens,
num_tokens_across_dp=num_tokens_across_dp,
),
):
hidden_states = self.model(
input_ids=input_batch.input_ids,
positions=positions,
inputs_embeds=input_batch.inputs_embeds,
)
sample_hidden_states = hidden_states[input_batch.logits_indices]
return hidden_states, sample_hidden_states return hidden_states, sample_hidden_states
@torch.inference_mode() @torch.inference_mode()
...@@ -893,9 +872,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -893,9 +872,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
def get_cudagraph_and_dp_padding( def get_cudagraph_and_dp_padding(
self, self,
scheduler_output: SchedulerOutput, total_num_scheduled_tokens: int,
num_tokens_per_request: Iterable[int],
) -> tuple[CUDAGraphMode, int, torch.Tensor | None]: ) -> tuple[CUDAGraphMode, int, torch.Tensor | None]:
total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
dp_size = self.parallel_config.data_parallel_size dp_size = self.parallel_config.data_parallel_size
if dp_size == 1: if dp_size == 1:
# No DP. Only consider CUDA graphs. # No DP. Only consider CUDA graphs.
...@@ -904,7 +883,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -904,7 +883,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
return CUDAGraphMode.NONE, 0, None return CUDAGraphMode.NONE, 0, None
cudagraph_size = self.cudagraph_manager.get_cudagraph_size( cudagraph_size = self.cudagraph_manager.get_cudagraph_size(
scheduler_output, total_num_scheduled_tokens total_num_scheduled_tokens, num_tokens_per_request
) )
if cudagraph_size is not None: if cudagraph_size is not None:
# Use full CUDA graph. # Use full CUDA graph.
...@@ -919,7 +898,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -919,7 +898,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
cudagraph_size_before_dp: int | None = 0 cudagraph_size_before_dp: int | None = 0
else: else:
cudagraph_size_before_dp = self.cudagraph_manager.get_cudagraph_size( cudagraph_size_before_dp = self.cudagraph_manager.get_cudagraph_size(
scheduler_output, total_num_scheduled_tokens total_num_scheduled_tokens, num_tokens_per_request
) )
if cudagraph_size_before_dp is None: if cudagraph_size_before_dp is None:
cudagraph_size_before_dp = -1 cudagraph_size_before_dp = -1
...@@ -951,6 +930,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -951,6 +930,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
scheduler_output: SchedulerOutput, scheduler_output: SchedulerOutput,
intermediate_tensors: Any | None = None, intermediate_tensors: Any | None = None,
dummy_run: bool = False, dummy_run: bool = False,
skip_attn_for_dummy_run: bool = False,
) -> ModelRunnerOutput | None: ) -> ModelRunnerOutput | None:
assert intermediate_tensors is None assert intermediate_tensors is None
if not dummy_run: if not dummy_run:
...@@ -965,7 +945,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -965,7 +945,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
return EMPTY_MODEL_RUNNER_OUTPUT return EMPTY_MODEL_RUNNER_OUTPUT
cudagraph_mode, num_tokens_after_padding, num_tokens_across_dp = ( cudagraph_mode, num_tokens_after_padding, num_tokens_across_dp = (
self.get_cudagraph_and_dp_padding(scheduler_output) self.get_cudagraph_and_dp_padding(
scheduler_output.total_num_scheduled_tokens,
scheduler_output.num_scheduled_tokens.values(),
)
) )
if num_tokens_after_padding == 0: if num_tokens_after_padding == 0:
# All DP ranks have zero tokens to run. # All DP ranks have zero tokens to run.
...@@ -999,7 +982,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -999,7 +982,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
: input_batch.num_tokens_after_padding : input_batch.num_tokens_after_padding
] ]
else: else:
# No actual tokens to run. A dummy run for DP. # No actual tokens to run. A dummy run for DP or memory profiling.
num_reqs = min(num_tokens_after_padding, self.max_num_reqs) num_reqs = min(num_tokens_after_padding, self.max_num_reqs)
input_batch = InputBatch.make_dummy( input_batch = InputBatch.make_dummy(
num_reqs=num_reqs, num_reqs=num_reqs,
...@@ -1011,7 +994,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -1011,7 +994,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
input_batch.mrope_positions = self.mrope_states.mrope_positions[ input_batch.mrope_positions = self.mrope_states.mrope_positions[
:, :num_tokens_after_padding :, :num_tokens_after_padding
] ]
self.prepare_dummy_attn_metadata(input_batch) if not skip_attn_for_dummy_run:
self.prepare_dummy_attn_metadata(input_batch)
# FIXME(woosuk): Fix warmup for LoRA.
# Run model. # Run model.
if cudagraph_mode == CUDAGraphMode.FULL: if cudagraph_mode == CUDAGraphMode.FULL:
......
...@@ -662,12 +662,7 @@ class Worker(WorkerBase): ...@@ -662,12 +662,7 @@ class Worker(WorkerBase):
self.profiler.stop() self.profiler.stop()
def execute_dummy_batch(self) -> None: def execute_dummy_batch(self) -> None:
if self.use_v2_model_runner: self.model_runner._dummy_run(1, uniform_decode=True)
self.model_runner.execute_model(
SchedulerOutput.make_empty(), dummy_run=True
)
else:
self.model_runner._dummy_run(1, uniform_decode=True)
def add_lora(self, lora_request: LoRARequest) -> bool: def add_lora(self, lora_request: LoRARequest) -> bool:
return self.model_runner.add_lora(lora_request) return self.model_runner.add_lora(lora_request)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment