[Frontend][torch.compile] CompilationConfig Overhaul (#20283): name change ...

[Frontend][torch.compile] CompilationConfig Overhaul (#20283): name change compilation level to compilation mode, deprecation compilation level (#26355) Signed-off-by: morrison-turnansky <mturnans@redhat.com> Signed-off-by: Morrison Turnansky <mturnans@redhat.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>

[Frontend][torch.compile] CompilationConfig Overhaul (#20283): name change ...
[Frontend][torch.compile] CompilationConfig Overhaul (#20283): name change compilation level to compilation mode, deprecation compilation level (#26355) Signed-off-by: morrison-turnansky <mturnans@redhat.com> Signed-off-by: Morrison Turnansky <mturnans@redhat.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
96b9aa5a · Morrison Turnansky · GitHub · e66d787b · 96b9aa5a · 96b9aa5a
Unverified Commit 96b9aa5a authored Oct 14, 2025 by Morrison Turnansky Committed by GitHub Oct 15, 2025
20 changed files
--- a/tests/utils_/test_utils.py
+++ b/tests/utils_/test_utils.py
@@ -299,7 +299,7 @@ def test_dict_args(parser):
        "val2",
        "--hf-overrides.key2.key4",
        "val3",
-        # Test compile config and compilation level
+        # Test compile config and compilation mode
        "-O.use_inductor=true",
        "-O.backend",
        "custom",
@@ -352,7 +352,7 @@ def test_dict_args(parser):
        },
    }
    assert parsed_args.compilation_config == {
-        "level": 1,
+        "mode": 1,
        "use_inductor": True,
        "backend": "custom",
        "custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"],
@@ -367,7 +367,7 @@ def test_duplicate_dict_args(caplog_vllm, parser):
        "--hf-overrides.key1",
        "val2",
        "-O1",
-        "-O.level",
+        "-O.mode",
        "2",
        "-O3",
    ]
@@ -375,12 +375,12 @@ def test_duplicate_dict_args(caplog_vllm, parser):
    parsed_args = parser.parse_args(args)
    # Should be the last value
    assert parsed_args.hf_overrides == {"key1": "val2"}
-    assert parsed_args.compilation_config == {"level": 3}
+    assert parsed_args.compilation_config == {"mode": 3}

    assert len(caplog_vllm.records) == 1
    assert "duplicate" in caplog_vllm.text
    assert "--hf-overrides.key1" in caplog_vllm.text
-    assert "-O.level" in caplog_vllm.text
+    assert "-O.mode" in caplog_vllm.text


 @pytest.mark.parametrize(

--- a/tests/v1/cudagraph/test_cudagraph_dispatch.py
+++ b/tests/v1/cudagraph/test_cudagraph_dispatch.py
@@ -11,7 +11,7 @@ from vllm.compilation.cuda_graph import CUDAGraphWrapper
 from vllm.compilation.monitor import set_cudagraph_capturing_enabled
 from vllm.config import (
    CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
    CUDAGraphMode,
    ParallelConfig,
    SchedulerConfig,
@@ -42,7 +42,7 @@ def _create_vllm_config(
    mock_config.parallel_config = ParallelConfig()

    # Mimic the behavior of VllmConfig.__post_init__()
-    if compilation_config.level == CompilationLevel.PIECEWISE:
+    if compilation_config.mode == CompilationMode.VLLM_COMPILE:
        compilation_config.set_splitting_ops_for_v1()

    return mock_config
@@ -50,23 +50,23 @@ def _create_vllm_config(

 class TestCudagraphDispatcher:
    @pytest.mark.parametrize(
-        "case_id,cudagraph_mode_str,compilation_level",
+        "case_id,cudagraph_mode_str,compilation_mode",
        [
            # Test case 0: Full CG for mixed batches, no separate routine
-            (0, "FULL", CompilationLevel.NO_COMPILATION),
+            (0, "FULL", CompilationMode.NONE),
            # Test case 1: Full CG for uniform batches, piecewise for mixed
-            (1, "FULL_AND_PIECEWISE", CompilationLevel.NO_COMPILATION),
+            (1, "FULL_AND_PIECEWISE", CompilationMode.NONE),
            # Test case 2: Full CG for uniform batches, no CG for mixed
-            (2, "FULL_DECODE_ONLY", CompilationLevel.NO_COMPILATION),
-            # Test case 3: Piecewise for all
-            (3, "PIECEWISE", CompilationLevel.PIECEWISE),
+            (2, "FULL_DECODE_ONLY", CompilationMode.NONE),
+            # Test case 3: PIECEWISE for all
+            (3, "PIECEWISE", CompilationMode.VLLM_COMPILE),
        ],
    )
-    def test_dispatcher(self, cudagraph_mode_str, compilation_level):
+    def test_dispatcher(self, cudagraph_mode_str, compilation_mode):
        # Setup dispatcher
        comp_config = CompilationConfig(
            cudagraph_mode=cudagraph_mode_str,
-            level=compilation_level,
+            mode=compilation_mode,
            cudagraph_capture_sizes=[1, 8],
        )

@@ -242,7 +242,7 @@ class TestCudagraphIntegration:
    def setup_method(self):
        # only FULL mode for non-uniform batches
        self.comp_config = CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
            cudagraph_mode="FULL",
            cudagraph_capture_sizes=[10, 20],
        )

--- a/tests/v1/cudagraph/test_cudagraph_mode.py
+++ b/tests/v1/cudagraph/test_cudagraph_mode.py
@@ -10,7 +10,7 @@ import pytest
 from tests.utils import wait_for_gpu_memory_to_clear
 from tests.v1.attention.utils import full_cg_backend_configs as backend_configs
 from vllm import LLM
-from vllm.config import CompilationConfig
+from vllm.config import CompilationConfig, CompilationMode
 from vllm.platforms import current_platform


@@ -73,7 +73,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
            gpu_memory_utilization=0.45,
            max_model_len=1024,
            compilation_config=CompilationConfig(
-                level=3, cudagraph_mode=cudagraph_mode
+                mode=CompilationMode.VLLM_COMPILE, cudagraph_mode=cudagraph_mode
            ),
        )
        llm.generate(["Hello, my name is"] * 10)
@@ -90,32 +90,27 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
    )


-# test cudagraph_mode with different compilation level.
-# (backend_name, cudagraph_mode, compilation_level, supported)
+# test cudagraph_mode with different compilation mode.
+# (backend_name, cudagraph_mode, compilation_mode, supported)
 combo_cases_2 = [
-    ("FA2", "FULL", 0, True),  # no compilation + full cudagraph
-    ("FA2", "FULL", 3, True),  # piecewise compilation + full cudagraph
-    ("FA2", "PIECEWISE", 0, False),  # no compilation + piecewise cudagraph
-    ("FA2", "PIECEWISE", 3, True),  # piecewise compilation + piecewise cudagraph
-    (
-        "FA2",
-        "FULL_AND_PIECEWISE",
-        0,
-        False,
-    ),  # piecewise cudagraph not supported without piecewise compilation
-    ("FA2", "FULL_AND_PIECEWISE", 3, True),
-    ("FA2", "FULL_DECODE_ONLY", 0, True),
-    ("FA2", "FULL_DECODE_ONLY", 3, True),
-    ("FA2", "NONE", 0, True),  # no compilation + no cudagraph
-    ("FA2", "NONE", 3, True),  # piecewise compilation + no cudagraph
+    ("FA2", "FULL", CompilationMode.NONE, True),
+    ("FA2", "FULL", CompilationMode.VLLM_COMPILE, True),
+    ("FA2", "PIECEWISE", CompilationMode.NONE, False),
+    ("FA2", "PIECEWISE", CompilationMode.VLLM_COMPILE, True),
+    ("FA2", "FULL_AND_PIECEWISE", CompilationMode.NONE, False),
+    ("FA2", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True),
+    ("FA2", "FULL_DECODE_ONLY", CompilationMode.NONE, True),
+    ("FA2", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True),
+    ("FA2", "NONE", CompilationMode.NONE, True),
+    ("FA2", "NONE", CompilationMode.VLLM_COMPILE, True),
 ]


 @pytest.mark.parametrize(
-    "backend_name,cudagraph_mode,compilation_level,supported", combo_cases_2
+    "backend_name,cudagraph_mode,compilation_mode,supported", combo_cases_2
 )
 def test_cudagraph_compilation_combo(combo_case):
-    backend_name, cudagraph_mode, compilation_level, supported = combo_case
+    backend_name, cudagraph_mode, compilation_mode, supported = combo_case

    env_vars = backend_configs[backend_name].env_vars

@@ -130,7 +125,7 @@ def test_cudagraph_compilation_combo(combo_case):
            gpu_memory_utilization=0.45,
            max_model_len=1024,
            compilation_config=CompilationConfig(
-                level=compilation_level, cudagraph_mode=cudagraph_mode
+                mode=compilation_mode, cudagraph_mode=cudagraph_mode
            ),
        )
        llm.generate(["Hello, my name is"] * 10)

--- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py
+++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py
@@ -7,7 +7,7 @@ import pytest
 import torch

 from vllm import LLM, SamplingParams
-from vllm.config import CompilationConfig, CompilationLevel
+from vllm.config import CompilationConfig, CompilationMode
 from vllm.distributed import cleanup_dist_env_and_memory

 from ...utils import fork_new_process_for_each_test
@@ -75,9 +75,9 @@ def test_kv_sharing_fast_prefill(
        # This allows vLLM compilation backend to handle allocating and
        # managing buffers for cudagraph
        cudagraph_copy_inputs=True,
-        level=CompilationLevel.PIECEWISE
+        mode=CompilationMode.VLLM_COMPILE
        if not enforce_eager
-        else CompilationLevel.NO_COMPILATION,
+        else CompilationMode.NONE,
    )

    with monkeypatch.context() as m:

--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -56,7 +56,7 @@ def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface:
            return InductorAdaptor()
    else:
        assert compilation_config.backend == "eager", (
-            "Custom backends not supported with CompilationLevel.PIECEWISE"
+            "Custom backends not supported with CompilationMode.VLLM_COMPILE"
        )

        logger.debug("Using EagerAdaptor")
@@ -481,7 +481,7 @@ def set_model_tag(tag: str):

 class VllmBackend:
    """The compilation backend for `torch.compile` with vLLM.
-    It is used for compilation level of `CompilationLevel.PIECEWISE`,
+    It is used for compilation mode of `CompilationMode.VLLM_COMPILE`,
    where we customize the compilation.

    The major work of this backend is to split the graph into

--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -575,7 +575,7 @@ class InductorAdaptor(CompilerInterface):

        Because it is re-entrant, we always set it (even if entering via Dynamo
        and the context was already entered). We might want to revisit if it
-        should be set at a different level of compilation.
+        should be set at a different mode of compilation.

        This is likely a bug in PyTorch: public APIs should not rely on
        manually setting up internal contexts. But we also rely on non-public

--- a/vllm/compilation/counter.py
+++ b/vllm/compilation/counter.py
@@ -27,8 +27,8 @@ class CompilationCounter:
    num_cache_entries_updated: int = 0
    # The number of standalone_compile compiled artifacts saved
    num_compiled_artifacts_saved: int = 0
-    # Number of times a model was loaded with CompilationLevel.DYNAMO_AS_IS
-    dynamo_as_is_count: int = 0
+    # Number of times a model was loaded with CompilationMode.STOCK_TORCH_COMPILE
+    stock_torch_compile_count: int = 0

    def clone(self) -> "CompilationCounter":
        return copy.deepcopy(self)

--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -18,7 +18,7 @@ from torch._dynamo.symbolic_convert import InliningInstructionTranslator
 import vllm.envs as envs
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
-from vllm.config import CompilationLevel, VllmConfig, set_current_vllm_config
+from vllm.config import CompilationMode, VllmConfig, set_current_vllm_config
 from vllm.logger import init_logger
 from vllm.sequence import IntermediateTensors
 from vllm.utils import resolve_obj_by_qualname, supports_dynamo
@@ -233,11 +233,11 @@ def _support_torch_compile(
        old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs)
        self.vllm_config = vllm_config
        enable_compile = enable_if is None or enable_if(vllm_config)
-        # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner
+        # for CompilationMode.STOCK_TORCH_COMPILE , the upper level model runner
        # will handle the compilation, so we don't need to do anything here.
        self.do_not_compile = (
-            vllm_config.compilation_config.level
-            in [CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS]
+            vllm_config.compilation_config.mode
+            in [CompilationMode.NONE, CompilationMode.STOCK_TORCH_COMPILE]
            or not supports_dynamo()
            or _should_ignore_torch_compile(self.__class__)
            or not enable_compile
@@ -247,7 +247,7 @@ def _support_torch_compile(

        compilation_counter.num_models_seen += 1
        TorchCompileWrapperWithCustomDispatcher.__init__(
-            self, compilation_level=vllm_config.compilation_config.level
+            self, compilation_mode=vllm_config.compilation_config.mode
        )

    cls.__init__ = __init__

--- a/vllm/compilation/monitor.py
+++ b/vllm/compilation/monitor.py
@@ -3,7 +3,7 @@

 import time

-from vllm.config import CompilationConfig, CompilationLevel, VllmConfig
+from vllm.config import CompilationConfig, CompilationMode, VllmConfig
 from vllm.logger import init_logger

 logger = init_logger(__name__)
@@ -18,7 +18,7 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig):

    compilation_config: CompilationConfig = vllm_config.compilation_config
    path = vllm_config.compile_debug_dump_path()
-    if compilation_config.level == CompilationLevel.PIECEWISE and path:
+    if compilation_config.mode == CompilationMode.VLLM_COMPILE and path:
        import depyf

        path.mkdir(parents=True, exist_ok=True)
@@ -29,7 +29,7 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig):

 def end_monitoring_torch_compile(vllm_config: VllmConfig):
    compilation_config: CompilationConfig = vllm_config.compilation_config
-    if compilation_config.level == CompilationLevel.PIECEWISE:
+    if compilation_config.mode == CompilationMode.VLLM_COMPILE:
        logger.info(
            "torch.compile takes %.2f s in total", compilation_config.compilation_time
        )

--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -11,7 +11,7 @@ from types import CodeType
 import torch

 import vllm.envs as envs
-from vllm.config import CompilationLevel, CUDAGraphMode, get_current_vllm_config
+from vllm.config import CompilationMode, CUDAGraphMode, get_current_vllm_config
 from vllm.logger import init_logger

 logger = init_logger(__name__)
@@ -31,7 +31,7 @@ class TorchCompileWrapperWithCustomDispatcher:
    """

    def __init__(
-        self, compiled_callable: Callable | None = None, compilation_level: int = 0
+        self, compiled_callable: Callable | None = None, compilation_mode: int = 0
    ):
        vllm_config = get_current_vllm_config()
        self.vllm_config = vllm_config
@@ -72,7 +72,7 @@ class TorchCompileWrapperWithCustomDispatcher:
        # subclasses can use this to switch between the custom dispatcher
        # and the default Dynamo guard mechanism.
        self.use_custom_dispatcher: bool = (
-            compilation_level >= CompilationLevel.DYNAMO_ONCE
+            compilation_mode >= CompilationMode.DYNAMO_TRACE_ONCE
        )

    def aot_compile(self, *args, **kwargs):
@@ -85,7 +85,7 @@ class TorchCompileWrapperWithCustomDispatcher:
        return self.compiled_callable.aot_compile((args, kwargs))

    def __call__(self, *args, **kwargs):
-        """Implement the dispatch logic here, beyond the torch.compile level.
+        """Implement the dispatch logic here, beyond the torch.compile mode.
        NOTE: this function can have additional arguments beyond the forward
         method, for directly dispatching to the compiled code.
        """

--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -4,7 +4,7 @@
 from vllm.config.cache import CacheConfig
 from vllm.config.compilation import (
    CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
    CUDAGraphMode,
    PassConfig,
 )
@@ -49,7 +49,7 @@ __all__ = [
    "CacheConfig",
    # From vllm.config.compilation
    "CompilationConfig",
-    "CompilationLevel",
+    "CompilationMode",
    "CUDAGraphMode",
    "PassConfig",
    # From vllm.config.device

--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -26,12 +26,20 @@ else:
 logger = init_logger(__name__)


-class CompilationLevel:
-    # constants for the levels of the compilation process
-    NO_COMPILATION = 0
-    DYNAMO_AS_IS = 1
-    DYNAMO_ONCE = 2
-    PIECEWISE = 3
+class CompilationMode:
+    """The compilation approach used for torch.compile-based compilation of the
+    model."""
+
+    NONE = 0
+    """No torch.compile compilation is applied, model runs in fully eager pytorch mode.
+    The model runs as-is."""
+    STOCK_TORCH_COMPILE = 1
+    """The standard `torch.compile` compilation pipeline."""
+    DYNAMO_TRACE_ONCE = 2
+    """Single Dynamo trace through the model, avoiding recompilation."""
+    VLLM_COMPILE = 3
+    """Custom vLLM Inductor-based backend with caching, piecewise compilation,
+    shape specialization, and custom passes."""


 class CUDAGraphMode(enum.Enum):
@@ -134,7 +142,7 @@ class CompilationConfig:
    """Configuration for compilation. It has three parts:

    - Top-level Compilation control:
-        - [`level`][vllm.config.CompilationConfig.level]
+        - [`mode`][vllm.config.CompilationConfig.mode]
        - [`debug_dump_path`][vllm.config.CompilationConfig.debug_dump_path]
        - [`cache_dir`][vllm.config.CompilationConfig.cache_dir]
        - [`backend`][vllm.config.CompilationConfig.backend]
@@ -171,14 +179,26 @@ class CompilationConfig:

    # Top-level Compilation control
    level: int | None = None
-    """The level of compilation:
-
-    - None: If None, we will select the default compilation level.
-      For V1 engine this is 3, for V0 engine this is 0.
-    - 0: no compilation.
-    - 1: dynamo as is.
-    - 2: dynamo once.
-    - 3: piecewise compilation."""
+    """
+    `level` is deprecated and will be removed in the next release
+    (0.11.2 or 0.12.0, whichever comes first).
+    Use `mode` instead; a `level` value is used only if `mode` is unset.
+    """
+    # Top-level Compilation control
+    mode: int | None = None
+    """The compilation approach used for torch.compile-based compilation of the
+    model.
+
+    - None: If None, we will select the default compilation mode.
+      For V1 engine this is 3.
+    - 0: NONE: No torch.compile compilation is applied, model runs in fully
+         eager pytorch mode. The model runs as-is.
+    - 1: STOCK_TORCH_COMPILE: The standard `torch.compile` compilation pipeline.
+    - 2: DYNAMO_TRACE_ONCE: Single Dynamo trace through the model, avoiding
+         recompilation by removing guards.
+         Requires no dynamic-shape-dependent control-flow.
+    - 3: VLLM_COMPILE: Custom vLLM Inductor-based backend with caching,
+         piecewise compilation, shape specialization, and custom passes."""
    debug_dump_path: Path | None = None
    """The path to dump the debug information."""
    cache_dir: str = ""
@@ -195,11 +215,11 @@ class CompilationConfig:

    backend function.
    We use string to avoid serialization issues when using compilation in a
-    distributed setting. When the compilation level is 1 or 2, the backend is
+    distributed setting. When the compilation mode is 1 or 2, the backend is
    used for the compilation directly (it sees the whole graph). When the
-    compilation level is 3, the backend is used for the piecewise compilation
+    compilation mode is 3, the backend is used for the piecewise compilation
    (it sees a part of the graph). The backend can not be custom for compilation
-    level 3, i.e. the backend must be either eager or inductor. Furthermore,
+    mode 3, i.e. the backend must be either eager or inductor. Furthermore,
    compilation is only piecewise if splitting ops is set accordingly and
    use_inductor_graph_partition is off. Note that the default options for
    splitting ops are sufficient for piecewise compilation.
@@ -214,7 +234,7 @@ class CompilationConfig:
    - 'none,+op1,+op2' to enable only op1 and op2

    By default, all custom ops are enabled when running without Inductor and
-    disabled when running with Inductor: level>=PIECEWISE and use_inductor=True.
+    disabled when running with Inductor: mode>=VLLM_COMPILE and use_inductor=True.
    Inductor generates (fused) Triton kernels for disabled custom ops."""
    splitting_ops: list[str] | None = None
    """A list of ops to exclude from cudagraphs, used in piecewise compilation.
@@ -249,7 +269,7 @@ class CompilationConfig:
        One graph for symbolic shape and one graph per size in compile_sizes
        are compiled using configurations in inductor_compile_config.

-    This setting is ignored if level<PIECEWISE.
+    This setting is ignored if mode<VLLM_COMPILE.

    For future compatibility:
    If use_inductor is True, backend="inductor" otherwise backend="eager".
@@ -299,7 +319,7 @@ class CompilationConfig:
    Currently, the cudagraph mode is only used for the v1 engine.
    Note that the cudagraph logic is generally orthogonal to the 
    compilation logic. While piecewise cudagraphs require piecewise 
-    compilation (level=PIECEWISE and non-empty splitting_ops), full
+    compilation (mode=VLLM_COMPILE and non-empty splitting_ops), full
    cudagraphs are supported with and without compilation.
    
    Warning: This flag is new and subject to change in addition 
@@ -312,7 +332,7 @@ class CompilationConfig:
        that all input buffers have fixed addresses, and all
        splitting ops write their outputs to input buffers.
    In the vLLM V1 Engine, this flag only applies for
-    CompilationLevel.PIECEWISE (aka -O3).
+    CompilationMode.VLLM_COMPILE (aka -O3).
    Note that this is orthogonal to the cudagraph capture logic
    outside of compilation.
    Warning: This flag is deprecated and will be removed in the next major or
@@ -426,7 +446,7 @@ class CompilationConfig:
        the final hidden states.
        """
        factors: list[Any] = []
-        factors.append(self.level)
+        factors.append(self.mode)
        factors.append(self.backend)
        factors.append(self.custom_ops)
        factors.append(self.splitting_ops)
@@ -477,6 +497,17 @@ class CompilationConfig:
        return value

    def __post_init__(self) -> None:
+        if self.level is not None:
+            logger.warning(
+                "Level is deprecated and will be removed in the next release,"
+                "either 0.12.0 or 0.11.2 whichever is soonest."
+                "Use mode instead."
+                "If both level and mode are given,"
+                "only mode will be used."
+            )
+            if self.mode is None:
+                self.mode = self.level
+
        count_none = self.custom_ops.count("none")
        count_all = self.custom_ops.count("all")
        assert count_none + count_all <= 1, "Can only specify 'none' or 'all'"
@@ -574,7 +605,7 @@ class CompilationConfig:
        # Currently only eager and inductor backend are supported.
        # for piecewise compilation. Custom backends are not suppported for
        # piecewise compilation. Update when more backends are supported.
-        if self.level == CompilationLevel.PIECEWISE and self.backend not in [
+        if self.mode == CompilationMode.VLLM_COMPILE and self.backend not in [
            "",
            "eager",
            "inductor",
@@ -602,24 +633,27 @@ class CompilationConfig:
        Returns:
            The backend for the compilation config.
        """
-        if self.level is None:
+        if self.mode is None:
            raise ValueError(
-                "No compilation level is set. This method should only be \
+                "No compilation mode is set. This method should only be \
                called via vllm config where the level is set if none is \
                provided."
            )
-        if self.level == CompilationLevel.NO_COMPILATION:
-            raise ValueError("No compilation level is set.")
+        if self.mode == CompilationMode.NONE:
+            raise ValueError("No compilation mode is set.")

        from torch._dynamo.backends.registry import list_backends

        torch_backends = list_backends(exclude_tags=tuple())
-        if self.level in [CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE]:
+        if self.mode in [
+            CompilationMode.STOCK_TORCH_COMPILE,
+            CompilationMode.DYNAMO_TRACE_ONCE,
+        ]:
            if self.backend in torch_backends:
                return self.backend
            return resolve_obj_by_qualname(self.backend)

-        assert self.level == CompilationLevel.PIECEWISE
+        assert self.mode == CompilationMode.VLLM_COMPILE
        if self.backend not in ["eager", "inductor"]:
            raise ValueError(
                f"Invalid backend for piecewise compilation: {self.backend}"
@@ -684,11 +718,11 @@ class CompilationConfig:
        self.bs_to_padded_graph_size[self.max_capture_size] = self.max_capture_size

    def set_splitting_ops_for_v1(self):
-        # NOTE: this function needs to be called only when level is
-        # CompilationLevel.PIECEWISE
-        assert self.level == CompilationLevel.PIECEWISE, (
+        # NOTE: this function needs to be called only when mode is
+        # CompilationMode.VLLM_COMPILE
+        assert self.mode == CompilationMode.VLLM_COMPILE, (
            "set_splitting_ops_for_v1 should only be called when "
-            "level is CompilationLevel.PIECEWISE"
+            "mode is CompilationMode.VLLM_COMPILE"
        )

        if self.use_inductor_graph_partition:
@@ -769,12 +803,10 @@ class CompilationConfig:

        if not self.use_inductor_graph_partition:
            # Dynamo-level FX split case
-            return self.level == CompilationLevel.PIECEWISE
+            return self.mode == CompilationMode.VLLM_COMPILE

        # Inductor partition case
-        return (
-            self.backend == "inductor" and self.level > CompilationLevel.NO_COMPILATION
-        )
+        return self.backend == "inductor" and self.mode > CompilationMode.NONE

    def custom_op_log_check(self):
        """

--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -22,7 +22,7 @@ from vllm.transformers_utils.runai_utils import is_runai_obj_uri
 from vllm.utils import random_uuid

 from .cache import CacheConfig
-from .compilation import CompilationConfig, CompilationLevel, CUDAGraphMode
+from .compilation import CompilationConfig, CompilationMode, CUDAGraphMode
 from .device import DeviceConfig
 from .kv_events import KVEventsConfig
 from .kv_transfer import KVTransferConfig
@@ -84,17 +84,11 @@ class VllmConfig:
    compilation_config: CompilationConfig = Field(default_factory=CompilationConfig)
    """`torch.compile` and cudagraph capture configuration for the model.

-    As a shorthand, `-O<n>` can be used to directly specify the compilation
-    level `n`: `-O3` is equivalent to `-O.level=3` (same as `-O='{"level":3}'`).
-    Currently, -O <n> and -O=<n> are supported as well but this will likely be
-    removed in favor of clearer -O<n> syntax in the future.
-
-    NOTE: level 0 is the default level without any optimization. level 1 and 2
-    are for internal testing only. level 3 is the recommended level for
-    production, also default in V1.
+    As a shorthand, one can append compilation arguments via
+    `-O.parameter=argument`, such as `-O.mode=3` (same as `-O='{"mode":3}'`).

    You can specify the full compilation config like so:
-    `{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
+    `{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
    """
    kv_transfer_config: KVTransferConfig | None = None
    """The configurations for distributed KV cache transfer."""
@@ -305,33 +299,33 @@ class VllmConfig:
                "precision for chunked prefill triton kernels."
            )

-        # If the user does not explicitly set a compilation level, then
-        # we use the default level. The default level depends on other
+        # If the user does not explicitly set a compilation mode, then
+        # we use the default mode. The default mode depends on other
        # settings (see the below code).
-        if self.compilation_config.level is None:
+        if self.compilation_config.mode is None:
            if envs.VLLM_USE_V1:
                if (
                    self.model_config is not None
                    and not self.model_config.enforce_eager
                ):
-                    self.compilation_config.level = CompilationLevel.PIECEWISE
+                    self.compilation_config.mode = CompilationMode.VLLM_COMPILE
                else:
-                    self.compilation_config.level = CompilationLevel.NO_COMPILATION
+                    self.compilation_config.mode = CompilationMode.NONE

            else:
-                # NB: Passing both --enforce-eager and a compilation level
-                # in V0 means the compilation level wins out.
-                self.compilation_config.level = CompilationLevel.NO_COMPILATION
+                # NB: Passing both --enforce-eager and a compilation mode
+                # in V0 means the compilation mode wins out.
+                self.compilation_config.mode = CompilationMode.NONE
        else:
-            assert self.compilation_config.level >= CompilationLevel.NO_COMPILATION
-            assert self.compilation_config.level <= CompilationLevel.PIECEWISE
+            assert self.compilation_config.mode >= CompilationMode.NONE
+            assert self.compilation_config.mode <= CompilationMode.VLLM_COMPILE

        # If user does not set custom ops via none or all set it here based on
-        # compilation level and backend.
+        # compilation mode and backend.
        if all(s not in self.compilation_config.custom_ops for s in ("all", "none")):
            if (
                self.compilation_config.backend == "inductor"
-                and self.compilation_config.level > CompilationLevel.NO_COMPILATION
+                and self.compilation_config.mode > CompilationMode.NONE
            ):
                self.compilation_config.custom_ops.append("none")
            else:
@@ -350,7 +344,7 @@ class VllmConfig:
            if self.compilation_config.cudagraph_mode is None:
                if (
                    envs.VLLM_USE_V1
-                    and self.compilation_config.level == CompilationLevel.PIECEWISE
+                    and self.compilation_config.mode == CompilationMode.VLLM_COMPILE
                ):
                    # default to full and piecewise for most models
                    self.compilation_config.cudagraph_mode = (
@@ -486,10 +480,10 @@ class VllmConfig:
            )
        current_platform.check_and_update_config(self)

-        # Do this after all the updates to compilation_config.level
+        # Do this after all the updates to compilation_config.mode
        if (
            envs.VLLM_USE_V1
-            and self.compilation_config.level == CompilationLevel.PIECEWISE
+            and self.compilation_config.mode == CompilationMode.VLLM_COMPILE
        ):
            self.compilation_config.set_splitting_ops_for_v1()

@@ -508,8 +502,8 @@ class VllmConfig:
                )

            if self.compilation_config.cudagraph_mode.requires_piecewise_compilation():
-                assert self.compilation_config.level == CompilationLevel.PIECEWISE, (
-                    "Compilation level should be CompilationLevel.PIECEWISE "
+                assert self.compilation_config.mode == CompilationMode.VLLM_COMPILE, (
+                    "Compilation mode should be CompilationMode.VLLM_COMPILE "
                    "when cudagraph_mode piecewise cudagraphs is used, "
                    f"cudagraph_mode={self.compilation_config.cudagraph_mode}"
                )
@@ -837,7 +831,7 @@ def set_current_vllm_config(

        if (
            check_compile
-            and vllm_config.compilation_config.level == CompilationLevel.PIECEWISE
+            and vllm_config.compilation_config.mode == CompilationMode.VLLM_COMPILE
            and compilation_counter.num_models_seen == num_models_seen
        ):
            # If the model supports compilation,

--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -176,7 +176,7 @@ class LLM:
            argument is deprecated and will be removed in v0.12.0 or v1.0.0,
            whichever is sooner.
        compilation_config: Either an integer or a dictionary. If it is an
-            integer, it is used as the level of compilation optimization. If it
+            integer, it is used as the mode of compilation optimization. If it
            is a dictionary, it can specify the full compilation configuration.
        **kwargs: Arguments for [`EngineArgs`][vllm.EngineArgs].

@@ -257,9 +257,7 @@ class LLM:

        if compilation_config is not None:
            if isinstance(compilation_config, int):
-                compilation_config_instance = CompilationConfig(
-                    level=compilation_config
-                )
+                compilation_config_instance = CompilationConfig(mode=compilation_config)
            elif isinstance(compilation_config, dict):
                compilation_config_instance = CompilationConfig(
                    **{

--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -8,7 +8,7 @@ from packaging import version

 from vllm import _custom_ops as ops
 from vllm import envs
-from vllm.config import CompilationLevel, get_current_vllm_config
+from vllm.config import CompilationMode, get_current_vllm_config
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.platforms import current_platform
@@ -419,7 +419,7 @@ class Fp8LinearOp:
        if pad_output is None:
            config = get_current_vllm_config().compilation_config
            pad_output = (
-                config.level < CompilationLevel.PIECEWISE
+                config.mode < CompilationMode.VLLM_COMPILE
                and self.preferred_backend == "torch"
            )


--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -247,12 +247,12 @@ class CpuPlatform(Platform):
            parallel_config.enable_dbo = False

        # Note: workaround for v1 gpu_model_runner
-        from vllm.config import CompilationLevel
+        from vllm.config import CompilationMode

        vllm_config.compilation_config.cudagraph_capture_sizes = []

        compilation_config = vllm_config.compilation_config
-        if vllm_config.compilation_config.level == CompilationLevel.PIECEWISE:
+        if vllm_config.compilation_config.mode == CompilationMode.VLLM_COMPILE:
            # Note: vLLM V1 is using PIECEWISE level compilation, which will
            # take time to compile kernels just-in-time with the inductor
            # backend. For CPU CI tests, most of them are executed fast and
@@ -265,7 +265,7 @@ class CpuPlatform(Platform):
            else:
                backend = "inductor"

-            compilation_config.level = CompilationLevel.DYNAMO_ONCE
+            compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE
            compilation_config.backend = backend
            compilation_config.inductor_compile_config.update(
                {
@@ -277,7 +277,7 @@ class CpuPlatform(Platform):
            )

        if vllm_config.lora_config is not None:
-            compilation_config.level = CompilationLevel.NO_COMPILATION
+            compilation_config.mode = CompilationMode.NONE

        assert vllm_config.device_config.device_type == "cpu"


--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -114,7 +114,7 @@ class TpuPlatform(Platform):

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
-        from vllm.config import CompilationLevel, CUDAGraphMode
+        from vllm.config import CompilationMode, CUDAGraphMode

        cache_config = vllm_config.cache_config
        # For v0, the default block size is 16.
@@ -122,12 +122,13 @@ class TpuPlatform(Platform):
            cache_config.block_size = cast(BlockSize, 16)
        compilation_config = vllm_config.compilation_config

-        # TPU only supports DYNAMO_ONCE compilation level
-        if compilation_config.level != CompilationLevel.DYNAMO_ONCE:
+        # TPU only supports DYNAMO_TRACE_ONCE compilation mode
+        if compilation_config.mode != CompilationMode.DYNAMO_TRACE_ONCE:
            logger.info(
-                "[TPU] Forcing DYNAMO_ONCE compilation level, and disabling cudagraph."
+                "[TPU] Forcing DYNAMO_TRACE_ONCE compilation mode, and "
+                "disabling cudagraph."
            )
-            compilation_config.level = CompilationLevel.DYNAMO_ONCE
+            compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE

        if (
            compilation_config.cudagraph_mode is None

--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -144,7 +144,7 @@ class XPUPlatform(Platform):
            cache_config.block_size = 64

        # lazy import to avoid circular import
-        from vllm.config import CompilationLevel, CUDAGraphMode
+        from vllm.config import CompilationMode, CUDAGraphMode

        compilation_config = vllm_config.compilation_config
        if compilation_config.compile_sizes is None:
@@ -155,7 +155,7 @@ class XPUPlatform(Platform):
        )

        if vllm_config.lora_config is not None:
-            compilation_config.level = CompilationLevel.NO_COMPILATION
+            compilation_config.mode = CompilationMode.NONE

        # check and update parallel config
        parallel_config = vllm_config.parallel_config

--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -1686,16 +1686,16 @@ class FlexibleArgumentParser(ArgumentParser):
            elif arg.startswith("-O") and arg != "-O" and arg[2] != ".":
                # allow -O flag to be used without space, e.g. -O3 or -Odecode
                # -O.<...> handled later
-                # also handle -O=<level> here
-                level = arg[3:] if arg[2] == "=" else arg[2:]
-                processed_args.append(f"-O.level={level}")
+                # also handle -O=<mode> here
+                mode = arg[3:] if arg[2] == "=" else arg[2:]
+                processed_args.append(f"-O.mode={mode}")
            elif (
                arg == "-O"
                and i + 1 < len(args)
                and args[i + 1] in {"0", "1", "2", "3"}
            ):
-                # Convert -O <n> to -O.level <n>
-                processed_args.append("-O.level")
+                # Convert -O <n> to -O.mode <n>
+                processed_args.append("-O.mode")
            else:
                processed_args.append(arg)


--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -43,12 +43,12 @@ class CudagraphDispatcher:
            not_use_piecewise_compilation
            or self.compilation_config.is_attention_compiled_piecewise()
        ), (
-            "Compilation level should be CompilationLevel.PIECEWISE when "
+            "Compilation mode should be CompilationMode.VLLM_COMPILE when "
            "cudagraph_mode piecewise cudagraphs is used, "
            "and attention should be in splitting_ops or "
            "inductor splitting should be used. "
            f"cudagraph_mode={self.cudagraph_mode}, "
-            f"compilation_level={self.compilation_config.level}, "
+            f"compilation_mode={self.compilation_config.mode}, "
            f"splitting_ops={self.compilation_config.splitting_ops}"
        )