[Frontend][torch.compile] CompilationConfig Overhaul (#20283): name change ...

[Frontend][torch.compile] CompilationConfig Overhaul (#20283): name change compilation level to compilation mode, deprecation compilation level (#26355) Signed-off-by: morrison-turnansky <mturnans@redhat.com> Signed-off-by: Morrison Turnansky <mturnans@redhat.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>

[Frontend][torch.compile] CompilationConfig Overhaul (#20283): name change ...
[Frontend][torch.compile] CompilationConfig Overhaul (#20283): name change compilation level to compilation mode, deprecation compilation level (#26355) Signed-off-by: morrison-turnansky <mturnans@redhat.com> Signed-off-by: Morrison Turnansky <mturnans@redhat.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
96b9aa5a · Morrison Turnansky · GitHub · e66d787b · 96b9aa5a · 96b9aa5a
Unverified Commit 96b9aa5a authored Oct 14, 2025 by Morrison Turnansky Committed by GitHub Oct 15, 2025
20 changed files
--- a/docs/configuration/conserving_memory.md
+++ b/docs/configuration/conserving_memory.md
@@ -58,12 +58,12 @@ You can adjust `compilation_config` to achieve a better balance between inferenc
    ```python
    from vllm import LLM
-    from vllm.config import CompilationConfig, CompilationLevel
+    from vllm.config import CompilationConfig, CompilationMode
    llm = LLM(
        model="meta-llama/Llama-3.1-8B-Instruct",
        compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
            # By default, it goes up to max_num_seqs
            cudagraph_capture_sizes=[1, 2, 4, 8, 16],
        ),

--- a/docs/design/cuda_graphs.md
+++ b/docs/design/cuda_graphs.md
@@ -167,7 +167,7 @@ class AttentionCGSupport(enum.Enum):
    """NO CUDA Graphs support"""
 ```
-Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation level. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
+Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation mode. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
 The following table lists backends that support full CUDA Graphs at the time of writing.
@@ -202,7 +202,7 @@ os.environ.setdefault("VLLM_LOGGING_LEVEL", "DEBUG")
 import vllm
 from vllm.config import CUDAGraphMode
-compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
+compilation_config = {"mode": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
 model = vllm.LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    dtype="auto",

--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
@@ -95,7 +95,7 @@ def parse_args():
    parser.add_argument(
        "--compilation-config",
        type=int,
-        help=("Compilation optimization (O) level 0-3."),
+        help=("Compilation optimization (O) mode 0-3."),
    )
    parser.add_argument(
        "--quantization",

--- a/tests/compile/piecewise/test_multiple_graphs.py
+++ b/tests/compile/piecewise/test_multiple_graphs.py
@@ -14,7 +14,7 @@ from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import ignore_torch_compile, support_torch_compile
 from vllm.config import (
    CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
    CUDAGraphMode,
    VllmConfig,
    set_current_vllm_config,
@@ -199,10 +199,10 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
    outputs = []
-    # piecewise compile
+    # VLLM_COMPILE mode with CUDA graph
    vllm_config = VllmConfig(
        compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
            use_cudagraph=True,
            splitting_ops=["silly::attention"],
            cudagraph_capture_sizes=[1, 2],
@@ -251,7 +251,7 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
    # no compile or cudagraph
    vllm_config = VllmConfig(
        compilation_config=CompilationConfig(
-            level=CompilationLevel.NO_COMPILATION,
+            mode=CompilationMode.NONE,
        )
    )
    cudagraph_runtime_mode = CUDAGraphMode.NONE
@@ -280,7 +280,7 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
    # piecewise compile without CUDA graph
    vllm_config = VllmConfig(
        compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
            use_cudagraph=False,
            splitting_ops=["silly::attention"],
            use_inductor_graph_partition=use_inductor_graph_partition,

--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
@@ -13,7 +13,7 @@ from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (
    CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
    CUDAGraphMode,
    VllmConfig,
    set_current_vllm_config,
@@ -61,7 +61,7 @@ def _run_simple_model(
 ):
    vllm_config = VllmConfig(
        compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
            use_cudagraph=True,
            use_inductor=use_inductor,
            splitting_ops=splitting_ops,

--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -21,7 +21,7 @@ from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (
    CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
    CUDAGraphMode,
    VllmConfig,
    set_current_vllm_config,
@@ -356,13 +356,13 @@ def test_toy_llama(
    )
    compile_config_no_compile = CompilationConfig(
-        level=CompilationLevel.NO_COMPILATION,
+        level=CompilationMode.NONE,
        cudagraph_mode=CUDAGraphMode.NONE,
        backend="eager",
    )
    compile_config_no_split = CompilationConfig(
-        level=CompilationLevel.PIECEWISE,
+        level=CompilationMode.VLLM_COMPILE,
        use_inductor_graph_partition=use_inductor_graph_partition,
        cudagraph_mode=CUDAGraphMode.PIECEWISE,
        backend=backend,
@@ -458,14 +458,14 @@ def benchmark():
    for piecewise in [False, True]:
        if piecewise:
            compilation_config = CompilationConfig(
-                level=CompilationLevel.PIECEWISE,
+                mode=CompilationMode.VLLM_COMPILE,
                use_cudagraph=True,
                splitting_ops=["silly::attention"],
                cudagraph_capture_sizes=cudagraph_sizes,
            )
        else:
            compilation_config = CompilationConfig(
-                level=CompilationLevel.PIECEWISE,
+                mode=CompilationMode.VLLM_COMPILE,
                cudagraph_capture_sizes=cudagraph_sizes,
            )

--- a/tests/compile/test_aot_compile.py
+++ b/tests/compile/test_aot_compile.py
@@ -10,7 +10,7 @@ import torch
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (
    CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
    VllmConfig,
    set_current_vllm_config,
 )
@@ -38,7 +38,7 @@ class CompiledMod(torch.nn.Module):
 def make_vllm_config() -> VllmConfig:
    return VllmConfig(
        compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            level=CompilationMode.VLLM_COMPILE,
        )
    )

--- a/tests/compile/test_async_tp.py
+++ b/tests/compile/test_async_tp.py
@@ -10,6 +10,7 @@ import vllm.envs as envs
 from vllm.compilation.collective_fusion import AsyncTPPass
 from vllm.config import (
    CompilationConfig,
+    CompilationMode,
    DeviceConfig,
    ModelConfig,
    PassConfig,
@@ -400,7 +401,7 @@ def test_async_tp_pass_correctness(
        common_args.append("--enforce-eager")
    compilation_config = {
-        "level": 3,
+        "mode": CompilationMode.VLLM_COMPILE,
        "compile_sizes": [2, 4, 8],
        "splitting_ops": [],
        "pass_config": {"enable_async_tp": async_tp_enabled},

--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -4,7 +4,7 @@ import dataclasses
 import pytest
-from vllm.config import CompilationLevel
+from vllm.config import CompilationMode
 from vllm.utils import cuda_device_count_stateless
 from ..utils import compare_all_settings
@@ -21,7 +21,7 @@ class TestSetting:
 # we cannot afford testing the full Cartesian product
-# of all models and all levels
+# of all models and all modes
 @pytest.mark.parametrize(
    "test_setting",
    [
@@ -121,15 +121,13 @@ def test_compile_correctness(
        all_args: list[list[str]] = []
        all_envs: list[dict[str, str] | None] = []
-        for comp_level in [
+        for comp_mode in [
-            CompilationLevel.DYNAMO_AS_IS,
+            CompilationMode.STOCK_TORCH_COMPILE,
-            CompilationLevel.DYNAMO_ONCE,
+            CompilationMode.DYNAMO_TRACE_ONCE,
-            CompilationLevel.PIECEWISE,
+            CompilationMode.VLLM_COMPILE,
        ]:
-            for level in [CompilationLevel.NO_COMPILATION, comp_level]:
+            for mode in [CompilationMode.NONE, comp_mode]:
-                all_args.append(
+                all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=inductor"])
-                    final_args + [f"-O.level={level}", "-O.backend=inductor"]
-                )
            # inductor will change the output, so we only compare if the output
            # is close, not exactly the same.
@@ -142,13 +140,13 @@ def test_compile_correctness(
            all_envs.clear()
            all_args.clear()
-        for level in [
+        for mode in [
-            CompilationLevel.NO_COMPILATION,
+            CompilationMode.NONE,
-            CompilationLevel.DYNAMO_AS_IS,
+            CompilationMode.STOCK_TORCH_COMPILE,
-            CompilationLevel.DYNAMO_ONCE,
+            CompilationMode.DYNAMO_TRACE_ONCE,
-            CompilationLevel.PIECEWISE,
+            CompilationMode.VLLM_COMPILE,
        ]:
-            all_args.append(final_args + [f"-O.level={level}", "-O.backend=eager"])
+            all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=eager"])
            all_envs.append({})
            all_envs.append({})

--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
@@ -4,7 +4,7 @@ import pytest
 from vllm.compilation.counter import compilation_counter
 from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
-from vllm.config.compilation import CompilationLevel
+from vllm.config.compilation import CompilationMode
 from vllm.utils import _is_torch_equal_or_newer, is_torch_equal_or_newer
@@ -90,16 +90,16 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
 # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
 @pytest.mark.forked
-def test_dynamo_as_is(vllm_runner, monkeypatch):
+def test_stock_torch_compile(vllm_runner, monkeypatch):
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
    with (
-        compilation_counter.expect(dynamo_as_is_count=1),
+        compilation_counter.expect(stock_torch_compile_count=1),
        # loading the model causes compilation (if enabled) to happen
        vllm_runner(
            "facebook/opt-125m",
-            compilation_config={"level": 1},
+            compilation_config={"mode": CompilationMode.STOCK_TORCH_COMPILE},
            gpu_memory_utilization=0.4,
        ) as _,
    ):
@@ -112,11 +112,11 @@ def test_no_compilation(vllm_runner, monkeypatch):
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
    with (
-        compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0),
+        compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0),
        # loading the model causes compilation (if enabled) to happen
        vllm_runner(
            "facebook/opt-125m",
-            compilation_config={"level": 0},
+            compilation_config={"mode": CompilationMode.NONE},
            gpu_memory_utilization=0.4,
        ) as _,
    ):
@@ -130,7 +130,7 @@ def test_enforce_eager(vllm_runner, monkeypatch):
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
    with (
-        compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0),
+        compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0),
        # loading the model causes compilation (if enabled) to happen
        vllm_runner(
            "facebook/opt-125m", enforce_eager=True, gpu_memory_utilization=0.4
@@ -151,7 +151,7 @@ def test_splitting_ops_dynamic():
    if is_torch_equal_or_newer("2.9.0.dev"):
        config = VllmConfig(
            compilation_config=CompilationConfig(
-                level=CompilationLevel.PIECEWISE,
+                level=CompilationMode.VLLM_COMPILE,
                use_inductor_graph_partition=True,
                splitting_ops=["vllm::unified_attention"],
            )
@@ -163,7 +163,7 @@ def test_splitting_ops_dynamic():
    # When attn_fusion pass enabled, splitting_ops now default to attention ops.
    config = VllmConfig(
        compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            level=CompilationMode.VLLM_COMPILE,
            pass_config={"enable_attn_fusion": True, "enable_noop": True},
            custom_ops=["+quant_fp8"],
            cudagraph_mode=CUDAGraphMode.PIECEWISE,
@@ -178,7 +178,7 @@ def test_splitting_ops_dynamic():
    if is_torch_equal_or_newer("2.9.0.dev"):
        config = VllmConfig(
            compilation_config=CompilationConfig(
-                level=CompilationLevel.PIECEWISE,
+                level=CompilationMode.VLLM_COMPILE,
                use_inductor_graph_partition=True,
                pass_config={"enable_attn_fusion": True, "enable_noop": True},
                custom_ops=["+quant_fp8"],

--- a/tests/compile/test_decorator.py
+++ b/tests/compile/test_decorator.py
@@ -8,7 +8,7 @@ from vllm.compilation.decorators import ignore_torch_compile, support_torch_comp
 from vllm.config import (
    CacheConfig,
    CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
    CUDAGraphMode,
    VllmConfig,
    set_current_vllm_config,
@@ -66,10 +66,10 @@ def run_model(
 def test_ignore_torch_compile_decorator():
-    # piecewise
+    # VLLM_COMPILE mode
    vllm_config = VllmConfig(
        compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
            use_cudagraph=True,
            splitting_ops=["silly::attention"],
            cudagraph_capture_sizes=[1, 2],
@@ -185,7 +185,7 @@ def test_conditional_compile_enable_if():
            kv_sharing_fast_prefill=True,
        ),
        compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
            use_cudagraph=True,
            splitting_ops=["silly::attention"],
            cudagraph_capture_sizes=[1, 2],
@@ -218,7 +218,7 @@ def test_conditional_compile_enable_if():
            kv_sharing_fast_prefill=False,
        ),
        compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
            use_cudagraph=True,
            splitting_ops=["silly::attention"],
            cudagraph_capture_sizes=[1, 2],

--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -12,7 +12,7 @@ from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
 from vllm.attention.backends.registry import _Backend
 from vllm.attention.selector import global_force_attn_backend_context_manager
-from vllm.config import CompilationConfig, CompilationLevel, CUDAGraphMode, PassConfig
+from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
 from vllm.platforms import current_platform
 from vllm.utils import is_torch_equal_or_newer
@@ -80,22 +80,22 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None):
 @pytest.mark.parametrize(
-    "optimization_level",
+    "compilation_mode",
-    [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE],
+    [CompilationMode.DYNAMO_TRACE_ONCE, CompilationMode.VLLM_COMPILE],
 )
 @pytest.mark.parametrize("model_info", models_list(all=True))
 @create_new_process_for_each_test()
 def test_full_graph(
    monkeypatch: pytest.MonkeyPatch,
    model_info: tuple[str, dict[str, Any]],
-    optimization_level: int,
+    compilation_mode: int,
 ):
    model, model_kwargs = model_info
    with monkeypatch.context():
        print(f"MODEL={model}")
-        run_model(optimization_level, model, model_kwargs)
+        run_model(compilation_mode, model, model_kwargs)
 # TODO(luka) add other supported compilation config scenarios here
@@ -104,7 +104,7 @@ def test_full_graph(
    [
        # additional compile sizes, only some of the models
        (
-            CompilationConfig(level=CompilationLevel.PIECEWISE, compile_sizes=[1, 2]),
+            CompilationConfig(mode=CompilationMode.VLLM_COMPILE, compile_sizes=[1, 2]),
            model,
        )
        for model in models_list(all=False)
@@ -113,7 +113,7 @@ def test_full_graph(
        # RMSNorm + quant fusion, only 8-bit quant models
        (
            CompilationConfig(
-                level=CompilationLevel.PIECEWISE,
+                mode=CompilationMode.VLLM_COMPILE,
                custom_ops=["+rms_norm"],
                pass_config=PassConfig(enable_fusion=True, enable_noop=True),
            ),
@@ -125,7 +125,8 @@ def test_full_graph(
        # Test depyf integration works
        (
            CompilationConfig(
-                level=CompilationLevel.PIECEWISE, debug_dump_path=tempfile.gettempdir()
+                mode=CompilationMode.VLLM_COMPILE,
+                debug_dump_path=tempfile.gettempdir(),
            ),
            ("facebook/opt-125m", {}),
        ),
@@ -134,7 +135,7 @@ def test_full_graph(
        # graph inductor partition
        (
            CompilationConfig(
-                level=CompilationLevel.PIECEWISE,
+                mode=CompilationMode.VLLM_COMPILE,
                # inductor graph partition uses
                # torch._C.Tag.cudagraph_unsafe to specify splitting ops
                use_inductor_graph_partition=True,
@@ -164,10 +165,10 @@ def test_custom_compile_config(
 @pytest.mark.parametrize(
-    "optimization_level",
+    "compilation_mode",
-    [CompilationLevel.NO_COMPILATION, CompilationLevel.PIECEWISE],
+    [CompilationMode.NONE, CompilationMode.VLLM_COMPILE],
 )
-def test_fp8_kv_scale_compile(optimization_level: int):
+def test_fp8_kv_scale_compile(compilation_mode: int):
    model = "Qwen/Qwen2-0.5B"
    model_kwargs = {
        "quantization": "fp8",
@@ -175,7 +176,7 @@ def test_fp8_kv_scale_compile(optimization_level: int):
        "calculate_kv_scales": True,
        "max_model_len": 512,
    }
-    run_model(optimization_level, model, model_kwargs)
+    run_model(compilation_mode, model, model_kwargs)
 def test_inductor_graph_partition_attn_fusion(caplog_vllm):
@@ -184,7 +185,7 @@ def test_inductor_graph_partition_attn_fusion(caplog_vllm):
    model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
    compilation_config = CompilationConfig(
-        level=CompilationLevel.PIECEWISE,
+        mode=CompilationMode.VLLM_COMPILE,
        use_inductor_graph_partition=True,
        cudagraph_mode=CUDAGraphMode.PIECEWISE,
        custom_ops=["+quant_fp8"],

--- a/tests/compile/test_fusion.py
+++ b/tests/compile/test_fusion.py
@@ -13,7 +13,7 @@ from vllm.compilation.fusion import (
 )
 from vllm.compilation.noop_elimination import NoOpEliminationPass
 from vllm.compilation.post_cleanup import PostCleanupPass
-from vllm.config import CompilationConfig, CompilationLevel, PassConfig, VllmConfig
+from vllm.config import CompilationConfig, CompilationMode, PassConfig, VllmConfig
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
    GroupShape,
@@ -114,7 +114,7 @@ def test_fusion_rmsnorm_quant(
    vllm_config = VllmConfig(
        compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
            custom_ops=["+rms_norm", "+quant_fp8"],
            pass_config=PassConfig(enable_fusion=True, enable_noop=True),
        )

--- a/tests/compile/test_fusion_all_reduce.py
+++ b/tests/compile/test_fusion_all_reduce.py
@@ -12,7 +12,7 @@ from vllm.compilation.noop_elimination import NoOpEliminationPass
 from vllm.compilation.post_cleanup import PostCleanupPass
 from vllm.config import (
    CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
    DeviceConfig,
    ModelConfig,
    PassConfig,
@@ -219,7 +219,7 @@ def all_reduce_fusion_pass_on_test_model(
    vllm_config = VllmConfig(
        compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE, custom_ops=["+rms_norm", "+quant_fp8"]
+            mode=CompilationMode.VLLM_COMPILE, custom_ops=["+rms_norm", "+quant_fp8"]
        )
    )
    vllm_config.compilation_config.pass_config = PassConfig(

--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -19,7 +19,7 @@ from vllm.compilation.post_cleanup import PostCleanupPass
 from vllm.config import (
    CacheConfig,
    CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
    ModelConfig,
    PassConfig,
    SchedulerConfig,
@@ -321,7 +321,7 @@ def test_attention_quant_pattern(
        ),
        scheduler_config=SchedulerConfig(max_num_seqs=1024),
        compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
            custom_ops=["+quant_fp8"],
            use_inductor_graph_partition=use_inductor_graph_partition,
        ),

--- a/tests/compile/test_noop_elimination.py
+++ b/tests/compile/test_noop_elimination.py
@@ -6,7 +6,7 @@ import torch
 import vllm
 from vllm.compilation.noop_elimination import NoOpEliminationPass
-from vllm.config import CompilationConfig, CompilationLevel, PassConfig, VllmConfig
+from vllm.config import CompilationConfig, CompilationMode, PassConfig, VllmConfig
 from .backend import TestBackend
@@ -50,7 +50,7 @@ def test_noop_elimination(dtype, num_tokens, hidden_size, buffer_size):
    vllm_config = VllmConfig(
        compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
            pass_config=PassConfig(enable_noop=True),
        )
    )
@@ -98,7 +98,7 @@ def test_non_noop_slice_preserved():
    vllm_config = VllmConfig(
        compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
            pass_config=PassConfig(enable_noop=True),
        )
    )

--- a/tests/compile/test_wrapper.py
+++ b/tests/compile/test_wrapper.py
@@ -5,7 +5,7 @@
 import torch
 from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
-from vllm.config import CompilationLevel
+from vllm.config import CompilationMode
 class MyMod(torch.nn.Module):
@@ -20,7 +20,7 @@ class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
        self.model = model
        compiled_callable = torch.compile(self.forward, backend="eager")
        super().__init__(
-            compiled_callable, compilation_level=CompilationLevel.DYNAMO_ONCE
+            compiled_callable, compilation_mode=CompilationMode.DYNAMO_TRACE_ONCE
        )
    def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):

--- a/tests/distributed/test_sequence_parallel.py
+++ b/tests/distributed/test_sequence_parallel.py
@@ -15,6 +15,7 @@ from typing import Literal, NamedTuple
 import pytest
+from vllm.config.compilation import CompilationMode
 from vllm.config.model import RunnerOption
 from vllm.logger import init_logger
@@ -234,7 +235,7 @@ def _compare_sp(
        common_args.append("--skip-tokenizer-init")
    compilation_config = {
-        "level": 3,
+        "mode": CompilationMode.VLLM_COMPILE,
        "custom_ops": ["+rms_norm"],
        "compile_sizes": [4, 8],
        "pass_config": {

--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -226,30 +226,30 @@ def test_compilation_config():
    # set to O0
    args = parser.parse_args(["-O0"])
-    assert args.compilation_config.level == 0
+    assert args.compilation_config.mode == 0
    # set to O 1 (space)
    args = parser.parse_args(["-O", "1"])
-    assert args.compilation_config.level == 1
+    assert args.compilation_config.mode == 1
    # set to O=2 (equals)
    args = parser.parse_args(["-O=2"])
-    assert args.compilation_config.level == 2
+    assert args.compilation_config.mode == 2
-    # set to O.level 3
+    # set to O.mode 3
-    args = parser.parse_args(["-O.level", "3"])
+    args = parser.parse_args(["-O.mode", "3"])
-    assert args.compilation_config.level == 3
+    assert args.compilation_config.mode == 3
    # set to string form of a dict
    args = parser.parse_args(
        [
            "-O",
-            '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
+            '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
            '"use_inductor": false}',
        ]
    )
    assert (
-        args.compilation_config.level == 3
+        args.compilation_config.mode == 3
        and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
        and not args.compilation_config.use_inductor
    )
@@ -258,12 +258,12 @@ def test_compilation_config():
    args = parser.parse_args(
        [
            "--compilation-config="
-            '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
+            '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
            '"use_inductor": true}',
        ]
    )
    assert (
-        args.compilation_config.level == 3
+        args.compilation_config.mode == 3
        and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
        and args.compilation_config.use_inductor
    )

--- a/tests/tpu/test_custom_dispatcher.py
+++ b/tests/tpu/test_custom_dispatcher.py
@@ -3,7 +3,7 @@
 import pytest
-from vllm.config import CompilationLevel
+from vllm.config import CompilationMode
 from ..utils import compare_two_settings
@@ -21,13 +21,13 @@ def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
                "--max-model-len=256",
                "--max-num-seqs=32",
                "--enforce-eager",
-                f"-O{CompilationLevel.DYNAMO_ONCE}",
+                f"-O{CompilationMode.DYNAMO_TRACE_ONCE}",
            ],
            arg2=[
                "--max-model-len=256",
                "--max-num-seqs=32",
                "--enforce-eager",
-                f"-O{CompilationLevel.DYNAMO_AS_IS}",
+                f"-O{CompilationMode.STOCK_TORCH_COMPILE}",
            ],
            env1={},
            env2={},