[torch.compile] integration with compilation control (#9058)

e4d652ea · youkaichao · GitHub · 78c0b416 · e4d652ea · e4d652ea
Unverified commit e4d652ea, authored Oct 10, 2024 by youkaichao; committed by GitHub on Oct 10, 2024.
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 17 additions and 8 deletions

vllm/sequence.py (+3 -4)

vllm/worker/model_runner.py (+14 -4)

No files found.
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -1137,10 +1137,9 @@ class EmbeddingSequenceGroupOutput(
        return self.embeddings == other.embeddings


-class IntermediateTensors(
-        msgspec.Struct,
-        omit_defaults=True,  # type: ignore[call-arg]
-        array_like=True):  # type: ignore[call-arg]
+# cannot use msgspec.Struct here because Dynamo does not support it
+@dataclass
+class IntermediateTensors:
    """For all pipeline stages except the last, we need to return the hidden
    states and residuals to be sent to the next stage. This data structure
    contains the hidden states and residuals for a request.

--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -18,6 +18,8 @@ import vllm.envs as envs
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.attention.backends.abstract import AttentionState
 from vllm.attention.backends.utils import CommonAttentionState
+from vllm.compilation.compile_context import set_compile_context
+from vllm.compilation.levels import CompilationLevel
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                         ModelConfig, ObservabilityConfig, ParallelConfig,
                         PromptAdapterConfig, SchedulerConfig)
@@ -1126,10 +1128,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                    "provided. Defaulting to scaling factors of 1.0. "
                    "This may lead to less accurate results!")

-        if envs.VLLM_TEST_DYNAMO_GRAPH_CAPTURE and supports_dynamo():
-            from vllm.compilation.backends import vllm_backend
+        if envs.VLLM_TORCH_COMPILE_LEVEL == CompilationLevel.DYNAMO_AS_IS \
+            and supports_dynamo():
            from vllm.plugins import get_torch_compile_backend
-            backend = get_torch_compile_backend() or vllm_backend
+            backend = get_torch_compile_backend() or "eager"
            self.model = torch.compile(
                self.model,
                fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
@@ -1289,7 +1291,15 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                batch_size=batch_size,
                dtype=self.model_config.dtype,
                device=self.device)
-        self.execute_model(model_input, kv_caches, intermediate_tensors)
+
+        graph_batch_size = self.max_batchsize_to_capture
+        batch_size_capture_list = [
+            bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size
+        ]
+        if self.model_config.enforce_eager:
+            batch_size_capture_list = []
+        with set_compile_context(batch_size_capture_list):
+            self.execute_model(model_input, kv_caches, intermediate_tensors)
        torch.cuda.synchronize()
        return