Unverified Commit 82580b10 authored by Terry Gao's avatar Terry Gao Committed by GitHub
Browse files

[Perf] Disable inductor runtime asserts by default for serving performance (#37485)


Signed-off-by: tianrengao <terrygao87@gmail.com>
Co-authored-by: Tianren Gao <tianren@fb.com>
parent a0d487b2
...@@ -233,6 +233,26 @@ that may call 1+ triton kernels. On rare (but unfortunate) occasions, it may ...@@ -233,6 +233,26 @@ that may call 1+ triton kernels. On rare (but unfortunate) occasions, it may
produce an incorrect triton kernel. This may manifest as silent incorrectness,
CUDA illegal memory accesses, or loud errors.
### Inductor runtime assertions
By default on torch < 2.12, vLLM disables Inductor's runtime assertions
(`assert_size_stride`, `assert_alignment`) by defaulting the `size_asserts`,
`alignment_asserts`, and `scalar_asserts` Inductor config options to `false`.
This avoids roughly 2 ms of overhead per forward pass on large models. Setting
`VLLM_LOGGING_LEVEL=DEBUG` automatically re-enables them, so debugging
sessions get full shape/stride validation:
```sh
VLLM_LOGGING_LEVEL=DEBUG vllm serve <model>
```
You can also override them explicitly via `--compilation-config`:
```sh
vllm serve <model> -cc.inductor_compile_config='{"size_asserts": true, "alignment_asserts": true, "scalar_asserts": true}'
```
On torch >= 2.12, PyTorch uses an efficient assert-once strategy and these
flags are no longer suppressed by vLLM.
To check whether TorchInductor is at fault, you can disable it by passing `backend='eager'`
to the compilation config:
......
...@@ -5,6 +5,7 @@ from contextlib import nullcontext ...@@ -5,6 +5,7 @@ from contextlib import nullcontext
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
import pytest import pytest
import torch
from pydantic import ValidationError from pydantic import ValidationError
from vllm.compilation.counter import compilation_counter from vllm.compilation.counter import compilation_counter
...@@ -612,3 +613,58 @@ def test_adjust_cudagraph_sizes_for_mamba_cache( ...@@ -612,3 +613,58 @@ def test_adjust_cudagraph_sizes_for_mamba_cache(
# Invariant: last element == max_cudagraph_capture_size # Invariant: last element == max_cudagraph_capture_size
if expected_sizes: if expected_sizes:
assert config.cudagraph_capture_sizes[-1] == config.max_cudagraph_capture_size assert config.cudagraph_capture_sizes[-1] == config.max_cudagraph_capture_size
def test_inductor_asserts_default_disabled(monkeypatch):
    """Inductor runtime asserts default to off at INFO logging on torch < 2.12.

    With ``VLLM_LOGGING_LEVEL=INFO`` and no explicit
    ``inductor_compile_config``, a freshly built ``CompilationConfig`` must
    have ``size_asserts``, ``alignment_asserts`` and ``scalar_asserts`` all
    set to ``False``.

    The test is skipped on torch >= 2.12, where these defaults are not
    checked.
    """
    if _is_torch_equal_or_newer(torch.__version__, "2.12.0.dev"):
        # Report a skip instead of passing without asserting anything.
        pytest.skip("inductor assert defaults are only checked on torch < 2.12")

    monkeypatch.setenv("VLLM_LOGGING_LEVEL", "INFO")
    import importlib

    import vllm.envs

    # Reload once the environment variable is in place; presumably so that
    # vllm.envs observes the patched value -- confirm against vllm.envs.
    importlib.reload(vllm.envs)

    config = CompilationConfig()
    for key in ("size_asserts", "alignment_asserts", "scalar_asserts"):
        assert config.inductor_compile_config.get(key) is False, key
def test_inductor_asserts_enabled_in_debug(monkeypatch):
    """``VLLM_LOGGING_LEVEL=DEBUG`` turns inductor runtime asserts on (torch < 2.12).

    With the logging level set to ``DEBUG`` and no explicit
    ``inductor_compile_config``, a freshly built ``CompilationConfig`` must
    have ``size_asserts``, ``alignment_asserts`` and ``scalar_asserts`` all
    set to ``True``.

    The test is skipped on torch >= 2.12, where these defaults are not
    checked.
    """
    if _is_torch_equal_or_newer(torch.__version__, "2.12.0.dev"):
        # Report a skip instead of passing without asserting anything.
        pytest.skip("inductor assert defaults are only checked on torch < 2.12")

    monkeypatch.setenv("VLLM_LOGGING_LEVEL", "DEBUG")
    import importlib

    import vllm.envs

    # Reload once the environment variable is in place; presumably so that
    # vllm.envs observes the patched value -- confirm against vllm.envs.
    importlib.reload(vllm.envs)

    config = CompilationConfig()
    for key in ("size_asserts", "alignment_asserts", "scalar_asserts"):
        assert config.inductor_compile_config.get(key) is True, key
def test_inductor_asserts_user_override(monkeypatch):
    """A user-supplied ``inductor_compile_config`` entry wins over the default.

    The logging level is ``INFO``, yet ``size_asserts`` is passed explicitly
    as ``True`` and must stay ``True``. On torch < 2.12 the key the user did
    not set, ``alignment_asserts``, must still be ``False``.
    """
    monkeypatch.setenv("VLLM_LOGGING_LEVEL", "INFO")
    import importlib

    import vllm.envs

    importlib.reload(vllm.envs)

    user_overrides = {"size_asserts": True}
    config = CompilationConfig(inductor_compile_config=user_overrides)
    inductor_cfg = config.inductor_compile_config

    # The explicit user value is checked on every torch version.
    assert inductor_cfg.get("size_asserts") is True

    if _is_torch_equal_or_newer(torch.__version__, "2.12.0.dev"):
        return
    # Keys the user left alone are only checked on torch < 2.12.
    assert inductor_cfg.get("alignment_asserts") is False
...@@ -858,6 +858,25 @@ class CompilationConfig: ...@@ -858,6 +858,25 @@ class CompilationConfig:
if KEY not in self.inductor_compile_config: if KEY not in self.inductor_compile_config:
self.inductor_compile_config[KEY] = False self.inductor_compile_config[KEY] = False
# Tie inductor runtime assertions to debug logging mode.
# These assertions add ~2ms overhead per forward pass on large
# models (e.g., DeepSeek-R1 671B: ~340 assert_size_stride + ~60
# assert_alignment calls per forward). PyTorch >= 2.12 has a
# native fix (assert-once), so we only apply this workaround on
# older versions. On torch < 2.12, enable asserts only when
# VLLM_LOGGING_LEVEL=DEBUG. Users can still override explicitly
# via --compilation-config '{"inductor_compile_config":
# {"size_asserts": true, ...}}'.
# See: https://github.com/pytorch/pytorch/issues/177719
if not is_torch_equal_or_newer("2.12.0.dev"):
enable_asserts = envs.VLLM_LOGGING_LEVEL == "DEBUG"
for key in (
"size_asserts",
"alignment_asserts",
"scalar_asserts",
):
self.inductor_compile_config.setdefault(key, enable_asserts)
for k, v in self.inductor_passes.items(): for k, v in self.inductor_passes.items():
if not isinstance(v, str): if not isinstance(v, str):
assert callable(v), f"pass {k} should be callable or a qualified name" assert callable(v), f"pass {k} should be callable or a qualified name"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment