[2/N][torch.compile] make compilation cfg part of vllm cfg (#10383)

Signed-off-by: youkaichao <youkaichao@gmail.com>

[2/N][torch.compile] make compilation cfg part of vllm cfg (#10383)
Signed-off-by: youkaichao <youkaichao@gmail.com>
4fd93750 · youkaichao · GitHub · 661a34fd · 4fd93750 · 4fd93750
Unverified commit 4fd93750, authored Nov 16, 2024 by youkaichao; committed by GitHub on Nov 16, 2024.
7 changed files
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -42,6 +42,7 @@ from vllm.model_executor.model_loader.weight_utils import (
    safetensors_weights_iterator)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
+from vllm.plugins import set_current_vllm_config
 from vllm.utils import is_pin_memory_available
@@ -97,7 +98,8 @@ def _initialize_model(vllm_config: VllmConfig, prefix: str = "") -> nn.Module:
    all_params = [param.name for param in signatures.parameters.values()]
    if "vllm_config" in all_params and "prefix" in all_params:
        # new-style model class
-        return model_class(vllm_config=vllm_config, prefix=prefix)
+        with set_current_vllm_config(vllm_config):
+            return model_class(vllm_config=vllm_config, prefix=prefix)
    msg = ("vLLM model class should accept `vllm_config` and `prefix` as "
           "input arguments. Possibly you have an old-style model class"
           " registered from out of tree and it is used for new vLLM version. "
@@ -121,7 +123,8 @@ def _initialize_model(vllm_config: VllmConfig, prefix: str = "") -> nn.Module:
        kwargs["lora_config"] = vllm_config.lora_config
    if "scheduler_config" in all_params:
        kwargs["scheduler_config"] = vllm_config.scheduler_config
-    return model_class(**kwargs)
+    with set_current_vllm_config(vllm_config):
+        return model_class(**kwargs)
 class BaseModelLoader(ABC):

--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
 import enum
 import random
-from typing import NamedTuple, Optional, Tuple, Union
+from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple, Union
 import numpy as np
 import torch
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+else:
+    VllmConfig = None
 class PlatformEnum(enum.Enum):
    CUDA = enum.auto()
@@ -129,6 +134,19 @@ class Platform:
        np.random.seed(seed)
        torch.manual_seed(seed)
+    @classmethod
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        """
+        Check and update the configuration for the current platform.
+        It can raise an exception if the configuration is not compatible with
+        the current platform, or it can update the configuration to make it
+        compatible with the current platform.
+        The config is passed by reference, so it can be modified in place.
+        """
+        pass
 class UnspecifiedPlatform(Platform):
    _enum = PlatformEnum.UNSPECIFIED
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
 import os
+from typing import TYPE_CHECKING
 import torch
-import vllm.envs as envs
-from vllm.compilation.levels import CompilationLevel
 from vllm.plugins import set_torch_compile_backend
 from .interface import Platform, PlatformEnum
-if "VLLM_TORCH_COMPILE_LEVEL" not in os.environ:
+if TYPE_CHECKING:
-    os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.DYNAMO_ONCE)
+    from vllm.config import VllmConfig
+else:
-assert envs.VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.PIECEWISE,\
+    VllmConfig = None
-     "TPU does not support Inductor."
 set_torch_compile_backend("openxla")
@@ -31,3 +29,12 @@ class TpuPlatform(Platform):
    @classmethod
    def inference_mode(cls):
        return torch.no_grad()
+    @classmethod
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        from vllm.config import CompilationLevel
+        compilation_config = vllm_config.compilation_config
+        if "VLLM_TORCH_COMPILE_LEVEL" not in os.environ:
+            compilation_config.level = CompilationLevel.DYNAMO_ONCE
+        assert compilation_config.level < CompilationLevel.PIECEWISE,\
+            "TPU does not support Inductor."
--- a/vllm/plugins/__init__.py
+++ b/vllm/plugins/__init__.py
 import logging
+from contextlib import contextmanager
 from typing import TYPE_CHECKING, Callable, Optional, Union
 import vllm.envs as envs
 if TYPE_CHECKING:
-    from vllm.compilation.config import CompilationConfig
+    from vllm.config import CompilationConfig, VllmConfig
-    from vllm.config import VllmConfig
 else:
    CompilationConfig = None
    VllmConfig = None
@@ -72,3 +72,29 @@ def set_compilation_config(config: Optional[CompilationConfig]):
 def get_compilation_config() -> Optional[CompilationConfig]:
    return _compilation_config
+_current_vllm_config: Optional[VllmConfig] = None
+@contextmanager
+def set_current_vllm_config(vllm_config: VllmConfig):
+    """
+    Temporarily set the current VLLM config.
+    Used during model initialization.
+    We save the current VLLM config in a global variable,
+    so that all modules can access it, e.g. custom ops
+    can access the VLLM config to determine how to dispatch.
+    """
+    global _current_vllm_config
+    old_vllm_config = _current_vllm_config
+    try:
+        _current_vllm_config = vllm_config
+        yield
+    finally:
+        _current_vllm_config = old_vllm_config
+def get_current_vllm_config() -> VllmConfig:
+    assert _current_vllm_config is not None, "Current VLLM config is not set."
+    return _current_vllm_config
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
-import os
 import time
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple
@@ -8,11 +7,8 @@ import torch
 import torch.distributed
 import torch.nn as nn
-from vllm import envs
 from vllm.compilation.compile_context import set_compile_context
-from vllm.compilation.config import CompilationConfig
+from vllm.config import CompilationConfig, CompilationLevel, VllmConfig
-from vllm.compilation.levels import CompilationLevel
-from vllm.config import VllmConfig
 from vllm.forward_context import set_forward_context
 from vllm.inputs import INPUT_REGISTRY, InputRegistry
 from vllm.logger import init_logger
@@ -99,7 +95,7 @@ class GPUModelRunner:
            pin_memory=self.pin_memory,
        )
-        self.use_cuda_graph = (envs.VLLM_TORCH_COMPILE_LEVEL
+        self.use_cuda_graph = (self.vllm_config.compilation_config.level
                               == CompilationLevel.PIECEWISE
                               and not self.model_config.enforce_eager)
        # TODO(woosuk): Provide an option to tune the max cudagraph batch size.
@@ -517,9 +513,9 @@ class GPUModelRunner:
            # CUDA graphs do not work properly with the custom CUDA kernels.
            # FIXME(woosuk): Disable inductor to reduce the compilation time
            # and avoid any potential issues with the inductor.
-            os.environ["VLLM_CUSTOM_OPS"] = "none"
            set_compilation_config(
                CompilationConfig(
+                    custom_ops=["none"],
                    use_cudagraph=True,
                    non_cudagraph_ops=["vllm.unified_v1_flash_attention"],
                    use_inductor=True,

--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -19,8 +19,7 @@ from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.attention.backends.abstract import AttentionState
 from vllm.attention.backends.utils import CommonAttentionState
 from vllm.compilation.compile_context import set_compile_context
-from vllm.compilation.levels import CompilationLevel
+from vllm.config import CompilationLevel, VllmConfig
-from vllm.config import VllmConfig
 from vllm.core.scheduler import SchedulerOutputs
 from vllm.distributed import get_pp_group
 from vllm.distributed.parallel_state import graph_capture
@@ -1142,8 +1141,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                    "provided. Defaulting to scaling factors of 1.0. "
                    "This may lead to less accurate results!")
-        if envs.VLLM_TORCH_COMPILE_LEVEL == CompilationLevel.DYNAMO_AS_IS \
+        if self.vllm_config.compilation_config.level ==\
-            and supports_dynamo():
+            CompilationLevel.DYNAMO_AS_IS and supports_dynamo():
            from vllm.plugins import get_torch_compile_backend
            backend = get_torch_compile_backend() or "eager"
            self.model = torch.compile(

--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -140,7 +140,7 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]):
            model = get_model(vllm_config=self.vllm_config)
        model = model.eval()
        xm.wait_device_ops()
-        self.model = ModelWrapper(model)
+        self.model = ModelWrapper(model, self.vllm_config)
    def _dummy_run(
        self,
@@ -669,13 +669,15 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]):
 class ModelWrapper(TorchCompileWrapperWithCustomDispatcher):
-    def __init__(self, model: nn.Module):
+    def __init__(self, model: nn.Module, vllm_config: VllmConfig):
        self.model = model
        compiled_callable = torch.compile(self.forward,
                                          backend="openxla",
                                          fullgraph=True,
                                          dynamic=False)
-        super().__init__(compiled_callable)
+        super().__init__(
+            compiled_callable,
+            compilation_level=vllm_config.compilation_config.level)
    def __call__(self, *args, is_prompt: bool, **kwargs):
        if len(self.compiled_codes) < 3 or not self.use_custom_dispatcher: