Unverified Commit eaa2e510 authored by Richard Zou's avatar Richard Zou Committed by GitHub
Browse files

[Bugfix] Re-enable use_cudagraph in vLLM v1 (#19299)


Signed-off-by: default avatarRichard Zou <zou3519@gmail.com>
parent d77f7fb8
......@@ -95,7 +95,7 @@ def _test_simple_piecewise_compile(*, use_inductor):
num_piecewise_graphs_seen=5, # 2 * num_layers + 1
num_piecewise_capturable_graphs_seen=3, # 1 + num_layers
num_backend_compilations=3, # num_piecewise_capturable_graphs_seen
num_cudagraph_caputured=
num_cudagraph_captured=
6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
):
......
......@@ -327,7 +327,7 @@ def _test_toy_llama(*, use_inductor):
num_piecewise_graphs_seen=0,
num_piecewise_capturable_graphs_seen=0,
num_backend_compilations=0,
num_cudagraph_caputured=0,
num_cudagraph_captured=0,
):
outputs.append(
run_model(llama_config, use_inductor=False, use_compile=False))
......@@ -343,7 +343,7 @@ def _test_toy_llama(*, use_inductor):
num_piecewise_graphs_seen=1,
num_piecewise_capturable_graphs_seen=1,
num_backend_compilations=1, # num_piecewise_capturable_graphs_seen
num_cudagraph_caputured=
num_cudagraph_captured=
2, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
**kwargs,
):
......@@ -361,7 +361,7 @@ def _test_toy_llama(*, use_inductor):
llama_config.num_layers, # 1 + num_layers
num_backend_compilations=1 +
llama_config.num_layers, # num_piecewise_capturable_graphs_seen
num_cudagraph_caputured=2 *
num_cudagraph_captured=2 *
(1 + llama_config.num_layers
), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
):
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
import vllm
from vllm.compilation.counter import compilation_counter
from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
set_current_vllm_config)
from .piecewise.test_simple import SillyModel
@pytest.fixture(scope="function", autouse=True)
def use_v1(monkeypatch):
"""
TODO(rzou): The rest of tests/compile runs VLLM_USE_V1=0 right now,
I'll switch them over later.
"""
monkeypatch.setenv('VLLM_USE_V1', '1')
@pytest.mark.parametrize("enabled", [True, False])
def test_use_cudagraphs(enabled):
assert vllm.envs.VLLM_USE_V1
vllm_config = VllmConfig(compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE,
use_cudagraph=enabled,
cudagraph_capture_sizes=[100],
))
with set_current_vllm_config(vllm_config):
model = SillyModel(vllm_config=vllm_config, prefix='')
inputs = torch.randn(100, device="cuda")
with compilation_counter.expect(
num_graphs_seen=1, # one graph for the model
num_cudagraph_captured=1 if enabled else 0,
):
# first run is warmup
model(inputs)
# second run does CUDAGraphs recording (if enabled)
model(inputs)
......@@ -15,7 +15,7 @@ class CompilationCounter:
# not including the splitting ops
num_piecewise_capturable_graphs_seen: int = 0
num_backend_compilations: int = 0
num_cudagraph_caputured: int = 0
num_cudagraph_captured: int = 0
# InductorAdapter.compile calls
num_inductor_compiles: int = 0
# EagerAdapter.compile calls
......
......@@ -193,7 +193,7 @@ class CUDAPiecewiseBackend:
entry.output = weak_ref_tensors(output)
entry.cudagraph = cudagraph
compilation_counter.num_cudagraph_caputured += 1
compilation_counter.num_cudagraph_captured += 1
# important: we need to return the output, rather than
# the weak ref of the output, so that pytorch can correctly
......
......@@ -3918,12 +3918,14 @@ class CompilationConfig:
constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""
# CudaGraph compilation
use_cudagraph: bool = False
use_cudagraph: bool = envs.VLLM_USE_V1
"""Whether to use cudagraph inside compilation.
- False: cudagraph inside compilation is not used.
- True: cudagraph inside compilation is used. It requires
that all input buffers have fixed addresses, and all
splitting ops write their outputs to input buffers.
In the vLLM V1 Engine, this flag only applies for
CompilationLevel.PIECEWISE (aka -O3).
Note that this is orthogonal to the cudagraph capture logic
outside of compilation.
TODO: move outside cudagraph logic into compilation.
......@@ -4425,7 +4427,6 @@ class VllmConfig:
# FIXME(rob): Add function to set all of these.
if not self.compilation_config.custom_ops:
self.compilation_config.custom_ops = ["none"]
self.compilation_config.use_cudagraph = True
self.compilation_config.cudagraph_num_of_warmups = 1
self.compilation_config.pass_config.enable_fusion = False
self.compilation_config.pass_config.enable_noop = False
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment