Unverified commit 51fc9e01, authored by Harry Mellor, committed by GitHub
Browse files

Scheduled removal of `CompilationConfig.use_inductor` (#29323)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent bf0c75cd
...@@ -55,7 +55,7 @@ class SillyModel(nn.Module): ...@@ -55,7 +55,7 @@ class SillyModel(nn.Module):
def _run_simple_model( def _run_simple_model(
splitting_ops, splitting_ops,
use_inductor_graph_partition, use_inductor_graph_partition,
use_inductor, backend,
expected_num_piecewise_graphs_seen, expected_num_piecewise_graphs_seen,
expected_num_piecewise_capturable_graphs_seen, expected_num_piecewise_capturable_graphs_seen,
expected_num_backend_compilations, expected_num_backend_compilations,
...@@ -64,7 +64,7 @@ def _run_simple_model( ...@@ -64,7 +64,7 @@ def _run_simple_model(
vllm_config = VllmConfig( vllm_config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE, mode=CompilationMode.VLLM_COMPILE,
use_inductor=use_inductor, backend=backend,
splitting_ops=splitting_ops, splitting_ops=splitting_ops,
use_inductor_graph_partition=use_inductor_graph_partition, use_inductor_graph_partition=use_inductor_graph_partition,
cudagraph_copy_inputs=True, cudagraph_copy_inputs=True,
...@@ -124,14 +124,14 @@ def _run_simple_model( ...@@ -124,14 +124,14 @@ def _run_simple_model(
assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0])) assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0]))
@pytest.mark.parametrize("use_inductor", [True, False]) @pytest.mark.parametrize("backend", ["inductor", "eager"])
@torch.inference_mode() @torch.inference_mode()
@create_new_process_for_each_test("spawn") @create_new_process_for_each_test("spawn")
def test_simple_piecewise_compile(use_inductor): def test_simple_piecewise_compile(backend):
_run_simple_model( _run_simple_model(
splitting_ops=["silly::attention"], splitting_ops=["silly::attention"],
use_inductor_graph_partition=False, use_inductor_graph_partition=False,
use_inductor=use_inductor, backend=backend,
# 2 * num_layers + 1 # 2 * num_layers + 1
expected_num_piecewise_graphs_seen=5, expected_num_piecewise_graphs_seen=5,
# 1 + num_layers # 1 + num_layers
...@@ -155,7 +155,7 @@ def test_simple_inductor_graph_partition(monkeypatch): ...@@ -155,7 +155,7 @@ def test_simple_inductor_graph_partition(monkeypatch):
_run_simple_model( _run_simple_model(
splitting_ops=["silly::attention"], splitting_ops=["silly::attention"],
use_inductor_graph_partition=True, use_inductor_graph_partition=True,
use_inductor=True, backend="inductor",
# Since not splitting at fx graph level # Since not splitting at fx graph level
expected_num_piecewise_graphs_seen=1, expected_num_piecewise_graphs_seen=1,
# Since not splitting at fx graph level # Since not splitting at fx graph level
......
...@@ -249,14 +249,13 @@ def test_compilation_config(): ...@@ -249,14 +249,13 @@ def test_compilation_config():
args = parser.parse_args( args = parser.parse_args(
[ [
"-O", "-O",
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], "backend": "eager"}',
'"use_inductor": false}',
] ]
) )
assert ( assert (
args.compilation_config.mode == 3 args.compilation_config.mode == 3
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
and not args.compilation_config.use_inductor and args.compilation_config.backend == "eager"
) )
# set to string form of a dict # set to string form of a dict
...@@ -264,13 +263,13 @@ def test_compilation_config(): ...@@ -264,13 +263,13 @@ def test_compilation_config():
[ [
"--compilation-config=" "--compilation-config="
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
'"use_inductor": true}', '"backend": "inductor"}',
] ]
) )
assert ( assert (
args.compilation_config.mode == 3 args.compilation_config.mode == 3
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
and args.compilation_config.use_inductor and args.compilation_config.backend == "inductor"
) )
......
...@@ -166,7 +166,7 @@ def test_dict_args(parser): ...@@ -166,7 +166,7 @@ def test_dict_args(parser):
"--hf-overrides.key2.key4", "--hf-overrides.key2.key4",
"val3", "val3",
# Test compile config and compilation mode # Test compile config and compilation mode
"-O.use_inductor=true", "-O.use_inductor_graph_partition=true",
"-O.backend", "-O.backend",
"custom", "custom",
"-O1", "-O1",
...@@ -219,7 +219,7 @@ def test_dict_args(parser): ...@@ -219,7 +219,7 @@ def test_dict_args(parser):
} }
assert parsed_args.compilation_config == { assert parsed_args.compilation_config == {
"mode": 1, "mode": 1,
"use_inductor": True, "use_inductor_graph_partition": True,
"backend": "custom", "backend": "custom",
"custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"], "custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"],
} }
......
...@@ -264,7 +264,6 @@ class CompilationConfig: ...@@ -264,7 +264,6 @@ class CompilationConfig:
- [`cudagraph_copy_inputs`] - [`cudagraph_copy_inputs`]
[vllm.config.CompilationConfig.cudagraph_copy_inputs] [vllm.config.CompilationConfig.cudagraph_copy_inputs]
- Inductor compilation: - Inductor compilation:
- [`use_inductor`][vllm.config.CompilationConfig.use_inductor]
- [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes] - [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes]
- [`inductor_compile_config`] - [`inductor_compile_config`]
[vllm.config.CompilationConfig.inductor_compile_config] [vllm.config.CompilationConfig.inductor_compile_config]
...@@ -348,7 +347,7 @@ class CompilationConfig: ...@@ -348,7 +347,7 @@ class CompilationConfig:
- 'none,+op1,+op2' to enable only op1 and op2 - 'none,+op1,+op2' to enable only op1 and op2
By default, all custom ops are enabled when running without Inductor and By default, all custom ops are enabled when running without Inductor and
disabled when running with Inductor: mode>=VLLM_COMPILE and use_inductor=True. disabled when running with Inductor: mode>=VLLM_COMPILE and backend="inductor".
Inductor generates (fused) Triton kernels for disabled custom ops.""" Inductor generates (fused) Triton kernels for disabled custom ops."""
splitting_ops: list[str] | None = None splitting_ops: list[str] | None = None
"""A list of ops to exclude from cudagraphs, used in piecewise compilation. """A list of ops to exclude from cudagraphs, used in piecewise compilation.
...@@ -374,24 +373,6 @@ class CompilationConfig: ...@@ -374,24 +373,6 @@ class CompilationConfig:
Disabled by default until more models are supported/tested to work.""" Disabled by default until more models are supported/tested to work."""
# Inductor capture # Inductor capture
use_inductor: bool | None = None
"""
Whether to use inductor compilation.
This flag is deprecated and will be removed in the next release 0.12.0.
Please use the 'backend' option instead.
- False: inductor compilation is not used. graph runs in eager
(custom_ops enabled by default).
- True: inductor compilation is used (custom_ops disabled by default).
One graph for symbolic shape and one graph per size in compile_sizes
are compiled using configurations in inductor_compile_config.
This setting is ignored if mode<VLLM_COMPILE.
For future compatibility:
If use_inductor is True, backend="inductor" otherwise backend="eager".
"""
compile_sizes: list[int | str] | None = None compile_sizes: list[int | str] | None = None
"""Sizes to compile for inductor. In addition """Sizes to compile for inductor. In addition
to integers, it also supports "cudagraph_capture_sizes" to to integers, it also supports "cudagraph_capture_sizes" to
...@@ -759,14 +740,6 @@ class CompilationConfig: ...@@ -759,14 +740,6 @@ class CompilationConfig:
f"Invalid backend for piecewise compilation: {self.backend}" f"Invalid backend for piecewise compilation: {self.backend}"
) )
if self.use_inductor is not None:
logger.warning_once(
"The 'use_inductor' flag is deprecated and will be "
"removed in the next release (v0.12.0). "
"Please use the 'backend' option instead.",
)
self.backend = "inductor" if self.use_inductor else "eager"
if self.backend == "": if self.backend == "":
self.backend = current_platform.get_compile_backend() self.backend = current_platform.get_compile_backend()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment