Unverified commit 96b9aa5a, authored by Morrison Turnansky, committed by GitHub
Browse files

[Frontend][torch.compile] CompilationConfig Overhaul (#20283): name change ...


[Frontend][torch.compile] CompilationConfig Overhaul (#20283): name change  compilation level to compilation mode, deprecation compilation level (#26355)
Signed-off-by: morrison-turnansky <mturnans@redhat.com>
Signed-off-by: Morrison Turnansky <mturnans@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
parent e66d787b
...@@ -58,12 +58,12 @@ You can adjust `compilation_config` to achieve a better balance between inferenc ...@@ -58,12 +58,12 @@ You can adjust `compilation_config` to achieve a better balance between inferenc
```python ```python
from vllm import LLM from vllm import LLM
from vllm.config import CompilationConfig, CompilationLevel from vllm.config import CompilationConfig, CompilationMode
llm = LLM( llm = LLM(
model="meta-llama/Llama-3.1-8B-Instruct", model="meta-llama/Llama-3.1-8B-Instruct",
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
# By default, it goes up to max_num_seqs # By default, it goes up to max_num_seqs
cudagraph_capture_sizes=[1, 2, 4, 8, 16], cudagraph_capture_sizes=[1, 2, 4, 8, 16],
), ),
......
...@@ -167,7 +167,7 @@ class AttentionCGSupport(enum.Enum): ...@@ -167,7 +167,7 @@ class AttentionCGSupport(enum.Enum):
"""NO CUDA Graphs support""" """NO CUDA Graphs support"""
``` ```
Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation level. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture]. Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation mode. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
The following table lists backends that support full CUDA Graphs at the time of writing. The following table lists backends that support full CUDA Graphs at the time of writing.
...@@ -202,7 +202,7 @@ os.environ.setdefault("VLLM_LOGGING_LEVEL", "DEBUG") ...@@ -202,7 +202,7 @@ os.environ.setdefault("VLLM_LOGGING_LEVEL", "DEBUG")
import vllm import vllm
from vllm.config import CUDAGraphMode from vllm.config import CUDAGraphMode
compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"} compilation_config = {"mode": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
model = vllm.LLM( model = vllm.LLM(
model="meta-llama/Llama-3.1-8B-Instruct", model="meta-llama/Llama-3.1-8B-Instruct",
dtype="auto", dtype="auto",
......
...@@ -95,7 +95,7 @@ def parse_args(): ...@@ -95,7 +95,7 @@ def parse_args():
parser.add_argument( parser.add_argument(
"--compilation-config", "--compilation-config",
type=int, type=int,
help=("Compilation optimization (O) level 0-3."), help=("Compilation optimization (O) mode 0-3."),
) )
parser.add_argument( parser.add_argument(
"--quantization", "--quantization",
......
...@@ -14,7 +14,7 @@ from vllm.compilation.counter import compilation_counter ...@@ -14,7 +14,7 @@ from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import ignore_torch_compile, support_torch_compile from vllm.compilation.decorators import ignore_torch_compile, support_torch_compile
from vllm.config import ( from vllm.config import (
CompilationConfig, CompilationConfig,
CompilationLevel, CompilationMode,
CUDAGraphMode, CUDAGraphMode,
VllmConfig, VllmConfig,
set_current_vllm_config, set_current_vllm_config,
...@@ -199,10 +199,10 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool): ...@@ -199,10 +199,10 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
outputs = [] outputs = []
# piecewise compile # vllmcompile compile
vllm_config = VllmConfig( vllm_config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
use_cudagraph=True, use_cudagraph=True,
splitting_ops=["silly::attention"], splitting_ops=["silly::attention"],
cudagraph_capture_sizes=[1, 2], cudagraph_capture_sizes=[1, 2],
...@@ -251,7 +251,7 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool): ...@@ -251,7 +251,7 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
# no compile or cudagraph # no compile or cudagraph
vllm_config = VllmConfig( vllm_config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.NO_COMPILATION, mode=CompilationMode.NONE,
) )
) )
cudagraph_runtime_mode = CUDAGraphMode.NONE cudagraph_runtime_mode = CUDAGraphMode.NONE
...@@ -280,7 +280,7 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool): ...@@ -280,7 +280,7 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
# piecewise compile without CUDA graph # piecewise compile without CUDA graph
vllm_config = VllmConfig( vllm_config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
use_cudagraph=False, use_cudagraph=False,
splitting_ops=["silly::attention"], splitting_ops=["silly::attention"],
use_inductor_graph_partition=use_inductor_graph_partition, use_inductor_graph_partition=use_inductor_graph_partition,
......
...@@ -13,7 +13,7 @@ from vllm.compilation.counter import compilation_counter ...@@ -13,7 +13,7 @@ from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import support_torch_compile from vllm.compilation.decorators import support_torch_compile
from vllm.config import ( from vllm.config import (
CompilationConfig, CompilationConfig,
CompilationLevel, CompilationMode,
CUDAGraphMode, CUDAGraphMode,
VllmConfig, VllmConfig,
set_current_vllm_config, set_current_vllm_config,
...@@ -61,7 +61,7 @@ def _run_simple_model( ...@@ -61,7 +61,7 @@ def _run_simple_model(
): ):
vllm_config = VllmConfig( vllm_config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
use_cudagraph=True, use_cudagraph=True,
use_inductor=use_inductor, use_inductor=use_inductor,
splitting_ops=splitting_ops, splitting_ops=splitting_ops,
......
...@@ -21,7 +21,7 @@ from vllm.compilation.counter import compilation_counter ...@@ -21,7 +21,7 @@ from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import support_torch_compile from vllm.compilation.decorators import support_torch_compile
from vllm.config import ( from vllm.config import (
CompilationConfig, CompilationConfig,
CompilationLevel, CompilationMode,
CUDAGraphMode, CUDAGraphMode,
VllmConfig, VllmConfig,
set_current_vllm_config, set_current_vllm_config,
...@@ -356,13 +356,13 @@ def test_toy_llama( ...@@ -356,13 +356,13 @@ def test_toy_llama(
) )
compile_config_no_compile = CompilationConfig( compile_config_no_compile = CompilationConfig(
level=CompilationLevel.NO_COMPILATION, level=CompilationMode.NONE,
cudagraph_mode=CUDAGraphMode.NONE, cudagraph_mode=CUDAGraphMode.NONE,
backend="eager", backend="eager",
) )
compile_config_no_split = CompilationConfig( compile_config_no_split = CompilationConfig(
level=CompilationLevel.PIECEWISE, level=CompilationMode.VLLM_COMPILE,
use_inductor_graph_partition=use_inductor_graph_partition, use_inductor_graph_partition=use_inductor_graph_partition,
cudagraph_mode=CUDAGraphMode.PIECEWISE, cudagraph_mode=CUDAGraphMode.PIECEWISE,
backend=backend, backend=backend,
...@@ -458,14 +458,14 @@ def benchmark(): ...@@ -458,14 +458,14 @@ def benchmark():
for piecewise in [False, True]: for piecewise in [False, True]:
if piecewise: if piecewise:
compilation_config = CompilationConfig( compilation_config = CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
use_cudagraph=True, use_cudagraph=True,
splitting_ops=["silly::attention"], splitting_ops=["silly::attention"],
cudagraph_capture_sizes=cudagraph_sizes, cudagraph_capture_sizes=cudagraph_sizes,
) )
else: else:
compilation_config = CompilationConfig( compilation_config = CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
cudagraph_capture_sizes=cudagraph_sizes, cudagraph_capture_sizes=cudagraph_sizes,
) )
......
...@@ -10,7 +10,7 @@ import torch ...@@ -10,7 +10,7 @@ import torch
from vllm.compilation.decorators import support_torch_compile from vllm.compilation.decorators import support_torch_compile
from vllm.config import ( from vllm.config import (
CompilationConfig, CompilationConfig,
CompilationLevel, CompilationMode,
VllmConfig, VllmConfig,
set_current_vllm_config, set_current_vllm_config,
) )
...@@ -38,7 +38,7 @@ class CompiledMod(torch.nn.Module): ...@@ -38,7 +38,7 @@ class CompiledMod(torch.nn.Module):
def make_vllm_config() -> VllmConfig: def make_vllm_config() -> VllmConfig:
return VllmConfig( return VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, level=CompilationMode.VLLM_COMPILE,
) )
) )
......
...@@ -10,6 +10,7 @@ import vllm.envs as envs ...@@ -10,6 +10,7 @@ import vllm.envs as envs
from vllm.compilation.collective_fusion import AsyncTPPass from vllm.compilation.collective_fusion import AsyncTPPass
from vllm.config import ( from vllm.config import (
CompilationConfig, CompilationConfig,
CompilationMode,
DeviceConfig, DeviceConfig,
ModelConfig, ModelConfig,
PassConfig, PassConfig,
...@@ -400,7 +401,7 @@ def test_async_tp_pass_correctness( ...@@ -400,7 +401,7 @@ def test_async_tp_pass_correctness(
common_args.append("--enforce-eager") common_args.append("--enforce-eager")
compilation_config = { compilation_config = {
"level": 3, "mode": CompilationMode.VLLM_COMPILE,
"compile_sizes": [2, 4, 8], "compile_sizes": [2, 4, 8],
"splitting_ops": [], "splitting_ops": [],
"pass_config": {"enable_async_tp": async_tp_enabled}, "pass_config": {"enable_async_tp": async_tp_enabled},
......
...@@ -4,7 +4,7 @@ import dataclasses ...@@ -4,7 +4,7 @@ import dataclasses
import pytest import pytest
from vllm.config import CompilationLevel from vllm.config import CompilationMode
from vllm.utils import cuda_device_count_stateless from vllm.utils import cuda_device_count_stateless
from ..utils import compare_all_settings from ..utils import compare_all_settings
...@@ -21,7 +21,7 @@ class TestSetting: ...@@ -21,7 +21,7 @@ class TestSetting:
# we cannot afford testing the full Cartesian product # we cannot afford testing the full Cartesian product
# of all models and all levels # of all models and all modes
@pytest.mark.parametrize( @pytest.mark.parametrize(
"test_setting", "test_setting",
[ [
...@@ -121,15 +121,13 @@ def test_compile_correctness( ...@@ -121,15 +121,13 @@ def test_compile_correctness(
all_args: list[list[str]] = [] all_args: list[list[str]] = []
all_envs: list[dict[str, str] | None] = [] all_envs: list[dict[str, str] | None] = []
for comp_level in [ for comp_mode in [
CompilationLevel.DYNAMO_AS_IS, CompilationMode.STOCK_TORCH_COMPILE,
CompilationLevel.DYNAMO_ONCE, CompilationMode.DYNAMO_TRACE_ONCE,
CompilationLevel.PIECEWISE, CompilationMode.VLLM_COMPILE,
]: ]:
for level in [CompilationLevel.NO_COMPILATION, comp_level]: for mode in [CompilationMode.NONE, comp_mode]:
all_args.append( all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=inductor"])
final_args + [f"-O.level={level}", "-O.backend=inductor"]
)
# inductor will change the output, so we only compare if the output # inductor will change the output, so we only compare if the output
# is close, not exactly the same. # is close, not exactly the same.
...@@ -142,13 +140,13 @@ def test_compile_correctness( ...@@ -142,13 +140,13 @@ def test_compile_correctness(
all_envs.clear() all_envs.clear()
all_args.clear() all_args.clear()
for level in [ for mode in [
CompilationLevel.NO_COMPILATION, CompilationMode.NONE,
CompilationLevel.DYNAMO_AS_IS, CompilationMode.STOCK_TORCH_COMPILE,
CompilationLevel.DYNAMO_ONCE, CompilationMode.DYNAMO_TRACE_ONCE,
CompilationLevel.PIECEWISE, CompilationMode.VLLM_COMPILE,
]: ]:
all_args.append(final_args + [f"-O.level={level}", "-O.backend=eager"]) all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=eager"])
all_envs.append({}) all_envs.append({})
all_envs.append({}) all_envs.append({})
......
...@@ -4,7 +4,7 @@ import pytest ...@@ -4,7 +4,7 @@ import pytest
from vllm.compilation.counter import compilation_counter from vllm.compilation.counter import compilation_counter
from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
from vllm.config.compilation import CompilationLevel from vllm.config.compilation import CompilationMode
from vllm.utils import _is_torch_equal_or_newer, is_torch_equal_or_newer from vllm.utils import _is_torch_equal_or_newer, is_torch_equal_or_newer
...@@ -90,16 +90,16 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled): ...@@ -90,16 +90,16 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073 # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked @pytest.mark.forked
def test_dynamo_as_is(vllm_runner, monkeypatch): def test_stock_torch_compile(vllm_runner, monkeypatch):
# Disable multiprocessing so that the counter is in the same process # Disable multiprocessing so that the counter is in the same process
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
with ( with (
compilation_counter.expect(dynamo_as_is_count=1), compilation_counter.expect(stock_torch_compile_count=1),
# loading the model causes compilation (if enabled) to happen # loading the model causes compilation (if enabled) to happen
vllm_runner( vllm_runner(
"facebook/opt-125m", "facebook/opt-125m",
compilation_config={"level": 1}, compilation_config={"mode": CompilationMode.STOCK_TORCH_COMPILE},
gpu_memory_utilization=0.4, gpu_memory_utilization=0.4,
) as _, ) as _,
): ):
...@@ -112,11 +112,11 @@ def test_no_compilation(vllm_runner, monkeypatch): ...@@ -112,11 +112,11 @@ def test_no_compilation(vllm_runner, monkeypatch):
# Disable multiprocessing so that the counter is in the same process # Disable multiprocessing so that the counter is in the same process
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
with ( with (
compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0), compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0),
# loading the model causes compilation (if enabled) to happen # loading the model causes compilation (if enabled) to happen
vllm_runner( vllm_runner(
"facebook/opt-125m", "facebook/opt-125m",
compilation_config={"level": 0}, compilation_config={"mode": CompilationMode.NONE},
gpu_memory_utilization=0.4, gpu_memory_utilization=0.4,
) as _, ) as _,
): ):
...@@ -130,7 +130,7 @@ def test_enforce_eager(vllm_runner, monkeypatch): ...@@ -130,7 +130,7 @@ def test_enforce_eager(vllm_runner, monkeypatch):
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
with ( with (
compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0), compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0),
# loading the model causes compilation (if enabled) to happen # loading the model causes compilation (if enabled) to happen
vllm_runner( vllm_runner(
"facebook/opt-125m", enforce_eager=True, gpu_memory_utilization=0.4 "facebook/opt-125m", enforce_eager=True, gpu_memory_utilization=0.4
...@@ -151,7 +151,7 @@ def test_splitting_ops_dynamic(): ...@@ -151,7 +151,7 @@ def test_splitting_ops_dynamic():
if is_torch_equal_or_newer("2.9.0.dev"): if is_torch_equal_or_newer("2.9.0.dev"):
config = VllmConfig( config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, level=CompilationMode.VLLM_COMPILE,
use_inductor_graph_partition=True, use_inductor_graph_partition=True,
splitting_ops=["vllm::unified_attention"], splitting_ops=["vllm::unified_attention"],
) )
...@@ -163,7 +163,7 @@ def test_splitting_ops_dynamic(): ...@@ -163,7 +163,7 @@ def test_splitting_ops_dynamic():
# When attn_fusion pass enabled, splitting_ops now default to attention ops. # When attn_fusion pass enabled, splitting_ops now default to attention ops.
config = VllmConfig( config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, level=CompilationMode.VLLM_COMPILE,
pass_config={"enable_attn_fusion": True, "enable_noop": True}, pass_config={"enable_attn_fusion": True, "enable_noop": True},
custom_ops=["+quant_fp8"], custom_ops=["+quant_fp8"],
cudagraph_mode=CUDAGraphMode.PIECEWISE, cudagraph_mode=CUDAGraphMode.PIECEWISE,
...@@ -178,7 +178,7 @@ def test_splitting_ops_dynamic(): ...@@ -178,7 +178,7 @@ def test_splitting_ops_dynamic():
if is_torch_equal_or_newer("2.9.0.dev"): if is_torch_equal_or_newer("2.9.0.dev"):
config = VllmConfig( config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, level=CompilationMode.VLLM_COMPILE,
use_inductor_graph_partition=True, use_inductor_graph_partition=True,
pass_config={"enable_attn_fusion": True, "enable_noop": True}, pass_config={"enable_attn_fusion": True, "enable_noop": True},
custom_ops=["+quant_fp8"], custom_ops=["+quant_fp8"],
......
...@@ -8,7 +8,7 @@ from vllm.compilation.decorators import ignore_torch_compile, support_torch_comp ...@@ -8,7 +8,7 @@ from vllm.compilation.decorators import ignore_torch_compile, support_torch_comp
from vllm.config import ( from vllm.config import (
CacheConfig, CacheConfig,
CompilationConfig, CompilationConfig,
CompilationLevel, CompilationMode,
CUDAGraphMode, CUDAGraphMode,
VllmConfig, VllmConfig,
set_current_vllm_config, set_current_vllm_config,
...@@ -66,10 +66,10 @@ def run_model( ...@@ -66,10 +66,10 @@ def run_model(
def test_ignore_torch_compile_decorator(): def test_ignore_torch_compile_decorator():
# piecewise # vllmcompile
vllm_config = VllmConfig( vllm_config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
use_cudagraph=True, use_cudagraph=True,
splitting_ops=["silly::attention"], splitting_ops=["silly::attention"],
cudagraph_capture_sizes=[1, 2], cudagraph_capture_sizes=[1, 2],
...@@ -185,7 +185,7 @@ def test_conditional_compile_enable_if(): ...@@ -185,7 +185,7 @@ def test_conditional_compile_enable_if():
kv_sharing_fast_prefill=True, kv_sharing_fast_prefill=True,
), ),
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
use_cudagraph=True, use_cudagraph=True,
splitting_ops=["silly::attention"], splitting_ops=["silly::attention"],
cudagraph_capture_sizes=[1, 2], cudagraph_capture_sizes=[1, 2],
...@@ -218,7 +218,7 @@ def test_conditional_compile_enable_if(): ...@@ -218,7 +218,7 @@ def test_conditional_compile_enable_if():
kv_sharing_fast_prefill=False, kv_sharing_fast_prefill=False,
), ),
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
use_cudagraph=True, use_cudagraph=True,
splitting_ops=["silly::attention"], splitting_ops=["silly::attention"],
cudagraph_capture_sizes=[1, 2], cudagraph_capture_sizes=[1, 2],
......
...@@ -12,7 +12,7 @@ from tests.quantization.utils import is_quant_method_supported ...@@ -12,7 +12,7 @@ from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.attention.backends.registry import _Backend from vllm.attention.backends.registry import _Backend
from vllm.attention.selector import global_force_attn_backend_context_manager from vllm.attention.selector import global_force_attn_backend_context_manager
from vllm.config import CompilationConfig, CompilationLevel, CUDAGraphMode, PassConfig from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import is_torch_equal_or_newer from vllm.utils import is_torch_equal_or_newer
...@@ -80,22 +80,22 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None): ...@@ -80,22 +80,22 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"optimization_level", "compilation_mode",
[CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE], [CompilationMode.DYNAMO_TRACE_ONCE, CompilationMode.VLLM_COMPILE],
) )
@pytest.mark.parametrize("model_info", models_list(all=True)) @pytest.mark.parametrize("model_info", models_list(all=True))
@create_new_process_for_each_test() @create_new_process_for_each_test()
def test_full_graph( def test_full_graph(
monkeypatch: pytest.MonkeyPatch, monkeypatch: pytest.MonkeyPatch,
model_info: tuple[str, dict[str, Any]], model_info: tuple[str, dict[str, Any]],
optimization_level: int, compilation_mode: int,
): ):
model, model_kwargs = model_info model, model_kwargs = model_info
with monkeypatch.context(): with monkeypatch.context():
print(f"MODEL={model}") print(f"MODEL={model}")
run_model(optimization_level, model, model_kwargs) run_model(compilation_mode, model, model_kwargs)
# TODO(luka) add other supported compilation config scenarios here # TODO(luka) add other supported compilation config scenarios here
...@@ -104,7 +104,7 @@ def test_full_graph( ...@@ -104,7 +104,7 @@ def test_full_graph(
[ [
# additional compile sizes, only some of the models # additional compile sizes, only some of the models
( (
CompilationConfig(level=CompilationLevel.PIECEWISE, compile_sizes=[1, 2]), CompilationConfig(mode=CompilationMode.VLLM_COMPILE, compile_sizes=[1, 2]),
model, model,
) )
for model in models_list(all=False) for model in models_list(all=False)
...@@ -113,7 +113,7 @@ def test_full_graph( ...@@ -113,7 +113,7 @@ def test_full_graph(
# RMSNorm + quant fusion, only 8-bit quant models # RMSNorm + quant fusion, only 8-bit quant models
( (
CompilationConfig( CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
custom_ops=["+rms_norm"], custom_ops=["+rms_norm"],
pass_config=PassConfig(enable_fusion=True, enable_noop=True), pass_config=PassConfig(enable_fusion=True, enable_noop=True),
), ),
...@@ -125,7 +125,8 @@ def test_full_graph( ...@@ -125,7 +125,8 @@ def test_full_graph(
# Test depyf integration works # Test depyf integration works
( (
CompilationConfig( CompilationConfig(
level=CompilationLevel.PIECEWISE, debug_dump_path=tempfile.gettempdir() mode=CompilationMode.VLLM_COMPILE,
debug_dump_path=tempfile.gettempdir(),
), ),
("facebook/opt-125m", {}), ("facebook/opt-125m", {}),
), ),
...@@ -134,7 +135,7 @@ def test_full_graph( ...@@ -134,7 +135,7 @@ def test_full_graph(
# graph inductor partition # graph inductor partition
( (
CompilationConfig( CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
# inductor graph partition uses # inductor graph partition uses
# torch._C.Tag.cudagraph_unsafe to specify splitting ops # torch._C.Tag.cudagraph_unsafe to specify splitting ops
use_inductor_graph_partition=True, use_inductor_graph_partition=True,
...@@ -164,10 +165,10 @@ def test_custom_compile_config( ...@@ -164,10 +165,10 @@ def test_custom_compile_config(
@pytest.mark.parametrize( @pytest.mark.parametrize(
"optimization_level", "compilation_mode",
[CompilationLevel.NO_COMPILATION, CompilationLevel.PIECEWISE], [CompilationMode.NONE, CompilationMode.VLLM_COMPILE],
) )
def test_fp8_kv_scale_compile(optimization_level: int): def test_fp8_kv_scale_compile(compilation_mode: int):
model = "Qwen/Qwen2-0.5B" model = "Qwen/Qwen2-0.5B"
model_kwargs = { model_kwargs = {
"quantization": "fp8", "quantization": "fp8",
...@@ -175,7 +176,7 @@ def test_fp8_kv_scale_compile(optimization_level: int): ...@@ -175,7 +176,7 @@ def test_fp8_kv_scale_compile(optimization_level: int):
"calculate_kv_scales": True, "calculate_kv_scales": True,
"max_model_len": 512, "max_model_len": 512,
} }
run_model(optimization_level, model, model_kwargs) run_model(compilation_mode, model, model_kwargs)
def test_inductor_graph_partition_attn_fusion(caplog_vllm): def test_inductor_graph_partition_attn_fusion(caplog_vllm):
...@@ -184,7 +185,7 @@ def test_inductor_graph_partition_attn_fusion(caplog_vllm): ...@@ -184,7 +185,7 @@ def test_inductor_graph_partition_attn_fusion(caplog_vllm):
model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8" model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
compilation_config = CompilationConfig( compilation_config = CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
use_inductor_graph_partition=True, use_inductor_graph_partition=True,
cudagraph_mode=CUDAGraphMode.PIECEWISE, cudagraph_mode=CUDAGraphMode.PIECEWISE,
custom_ops=["+quant_fp8"], custom_ops=["+quant_fp8"],
......
...@@ -13,7 +13,7 @@ from vllm.compilation.fusion import ( ...@@ -13,7 +13,7 @@ from vllm.compilation.fusion import (
) )
from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.compilation.post_cleanup import PostCleanupPass from vllm.compilation.post_cleanup import PostCleanupPass
from vllm.config import CompilationConfig, CompilationLevel, PassConfig, VllmConfig from vllm.config import CompilationConfig, CompilationMode, PassConfig, VllmConfig
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.quant_utils import (
GroupShape, GroupShape,
...@@ -114,7 +114,7 @@ def test_fusion_rmsnorm_quant( ...@@ -114,7 +114,7 @@ def test_fusion_rmsnorm_quant(
vllm_config = VllmConfig( vllm_config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
custom_ops=["+rms_norm", "+quant_fp8"], custom_ops=["+rms_norm", "+quant_fp8"],
pass_config=PassConfig(enable_fusion=True, enable_noop=True), pass_config=PassConfig(enable_fusion=True, enable_noop=True),
) )
......
...@@ -12,7 +12,7 @@ from vllm.compilation.noop_elimination import NoOpEliminationPass ...@@ -12,7 +12,7 @@ from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.compilation.post_cleanup import PostCleanupPass from vllm.compilation.post_cleanup import PostCleanupPass
from vllm.config import ( from vllm.config import (
CompilationConfig, CompilationConfig,
CompilationLevel, CompilationMode,
DeviceConfig, DeviceConfig,
ModelConfig, ModelConfig,
PassConfig, PassConfig,
...@@ -219,7 +219,7 @@ def all_reduce_fusion_pass_on_test_model( ...@@ -219,7 +219,7 @@ def all_reduce_fusion_pass_on_test_model(
vllm_config = VllmConfig( vllm_config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, custom_ops=["+rms_norm", "+quant_fp8"] mode=CompilationMode.VLLM_COMPILE, custom_ops=["+rms_norm", "+quant_fp8"]
) )
) )
vllm_config.compilation_config.pass_config = PassConfig( vllm_config.compilation_config.pass_config = PassConfig(
......
...@@ -19,7 +19,7 @@ from vllm.compilation.post_cleanup import PostCleanupPass ...@@ -19,7 +19,7 @@ from vllm.compilation.post_cleanup import PostCleanupPass
from vllm.config import ( from vllm.config import (
CacheConfig, CacheConfig,
CompilationConfig, CompilationConfig,
CompilationLevel, CompilationMode,
ModelConfig, ModelConfig,
PassConfig, PassConfig,
SchedulerConfig, SchedulerConfig,
...@@ -321,7 +321,7 @@ def test_attention_quant_pattern( ...@@ -321,7 +321,7 @@ def test_attention_quant_pattern(
), ),
scheduler_config=SchedulerConfig(max_num_seqs=1024), scheduler_config=SchedulerConfig(max_num_seqs=1024),
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
custom_ops=["+quant_fp8"], custom_ops=["+quant_fp8"],
use_inductor_graph_partition=use_inductor_graph_partition, use_inductor_graph_partition=use_inductor_graph_partition,
), ),
......
...@@ -6,7 +6,7 @@ import torch ...@@ -6,7 +6,7 @@ import torch
import vllm import vllm
from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.config import CompilationConfig, CompilationLevel, PassConfig, VllmConfig from vllm.config import CompilationConfig, CompilationMode, PassConfig, VllmConfig
from .backend import TestBackend from .backend import TestBackend
...@@ -50,7 +50,7 @@ def test_noop_elimination(dtype, num_tokens, hidden_size, buffer_size): ...@@ -50,7 +50,7 @@ def test_noop_elimination(dtype, num_tokens, hidden_size, buffer_size):
vllm_config = VllmConfig( vllm_config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
pass_config=PassConfig(enable_noop=True), pass_config=PassConfig(enable_noop=True),
) )
) )
...@@ -98,7 +98,7 @@ def test_non_noop_slice_preserved(): ...@@ -98,7 +98,7 @@ def test_non_noop_slice_preserved():
vllm_config = VllmConfig( vllm_config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
pass_config=PassConfig(enable_noop=True), pass_config=PassConfig(enable_noop=True),
) )
) )
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
import torch import torch
from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
from vllm.config import CompilationLevel from vllm.config import CompilationMode
class MyMod(torch.nn.Module): class MyMod(torch.nn.Module):
...@@ -20,7 +20,7 @@ class MyWrapper(TorchCompileWrapperWithCustomDispatcher): ...@@ -20,7 +20,7 @@ class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
self.model = model self.model = model
compiled_callable = torch.compile(self.forward, backend="eager") compiled_callable = torch.compile(self.forward, backend="eager")
super().__init__( super().__init__(
compiled_callable, compilation_level=CompilationLevel.DYNAMO_ONCE compiled_callable, compilation_mode=CompilationMode.DYNAMO_TRACE_ONCE
) )
def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None): def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
......
...@@ -15,6 +15,7 @@ from typing import Literal, NamedTuple ...@@ -15,6 +15,7 @@ from typing import Literal, NamedTuple
import pytest import pytest
from vllm.config.compilation import CompilationMode
from vllm.config.model import RunnerOption from vllm.config.model import RunnerOption
from vllm.logger import init_logger from vllm.logger import init_logger
...@@ -234,7 +235,7 @@ def _compare_sp( ...@@ -234,7 +235,7 @@ def _compare_sp(
common_args.append("--skip-tokenizer-init") common_args.append("--skip-tokenizer-init")
compilation_config = { compilation_config = {
"level": 3, "mode": CompilationMode.VLLM_COMPILE,
"custom_ops": ["+rms_norm"], "custom_ops": ["+rms_norm"],
"compile_sizes": [4, 8], "compile_sizes": [4, 8],
"pass_config": { "pass_config": {
......
...@@ -226,30 +226,30 @@ def test_compilation_config(): ...@@ -226,30 +226,30 @@ def test_compilation_config():
# set to O0 # set to O0
args = parser.parse_args(["-O0"]) args = parser.parse_args(["-O0"])
assert args.compilation_config.level == 0 assert args.compilation_config.mode == 0
# set to O 1 (space) # set to O 1 (space)
args = parser.parse_args(["-O", "1"]) args = parser.parse_args(["-O", "1"])
assert args.compilation_config.level == 1 assert args.compilation_config.mode == 1
# set to O 2 (equals) # set to O 2 (equals)
args = parser.parse_args(["-O=2"]) args = parser.parse_args(["-O=2"])
assert args.compilation_config.level == 2 assert args.compilation_config.mode == 2
# set to O.level 3 # set to O.mode 3
args = parser.parse_args(["-O.level", "3"]) args = parser.parse_args(["-O.mode", "3"])
assert args.compilation_config.level == 3 assert args.compilation_config.mode == 3
# set to string form of a dict # set to string form of a dict
args = parser.parse_args( args = parser.parse_args(
[ [
"-O", "-O",
'{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
'"use_inductor": false}', '"use_inductor": false}',
] ]
) )
assert ( assert (
args.compilation_config.level == 3 args.compilation_config.mode == 3
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
and not args.compilation_config.use_inductor and not args.compilation_config.use_inductor
) )
...@@ -258,12 +258,12 @@ def test_compilation_config(): ...@@ -258,12 +258,12 @@ def test_compilation_config():
args = parser.parse_args( args = parser.parse_args(
[ [
"--compilation-config=" "--compilation-config="
'{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
'"use_inductor": true}', '"use_inductor": true}',
] ]
) )
assert ( assert (
args.compilation_config.level == 3 args.compilation_config.mode == 3
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
and args.compilation_config.use_inductor and args.compilation_config.use_inductor
) )
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
import pytest import pytest
from vllm.config import CompilationLevel from vllm.config import CompilationMode
from ..utils import compare_two_settings from ..utils import compare_two_settings
...@@ -21,13 +21,13 @@ def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch): ...@@ -21,13 +21,13 @@ def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
"--max-model-len=256", "--max-model-len=256",
"--max-num-seqs=32", "--max-num-seqs=32",
"--enforce-eager", "--enforce-eager",
f"-O{CompilationLevel.DYNAMO_ONCE}", f"-O{CompilationMode.DYNAMO_TRACE_ONCE}",
], ],
arg2=[ arg2=[
"--max-model-len=256", "--max-model-len=256",
"--max-num-seqs=32", "--max-num-seqs=32",
"--enforce-eager", "--enforce-eager",
f"-O{CompilationLevel.DYNAMO_AS_IS}", f"-O{CompilationMode.STOCK_TORCH_COMPILE}",
], ],
env1={}, env1={},
env2={}, env2={},
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment