[Perf] Disable inductor runtime asserts by default for serving performance (#37485)

Signed-off-by: tianrengao <terrygao87@gmail.com>
Co-authored-by: Tianren Gao <tianren@fb.com>

[Perf] Disable inductor runtime asserts by default for serving performance (#37485)
Signed-off-by: tianrengao <terrygao87@gmail.com>
Co-authored-by: Tianren Gao <tianren@fb.com>
82580b10 · Terry Gao · GitHub · a0d487b2 · 82580b10 · 82580b10
Unverified Commit 82580b10 authored Mar 24, 2026 by Terry Gao Committed by GitHub Mar 24, 2026
Showing with 95 additions and 0 deletions

docs/design/debug_vllm_compile.md docs/design/debug_vllm_compile.md +20 -0

tests/compile/test_config.py tests/compile/test_config.py +56 -0

vllm/config/compilation.py vllm/config/compilation.py +19 -0

No files found.
--- a/docs/design/debug_vllm_compile.md
+++ b/docs/design/debug_vllm_compile.md
@@ -233,6 +233,26 @@ that may call 1+ triton kernels. On rare (but unfortunate) occasions, it may
 produce an incorrect triton kernel. This may manifest as silent incorrectness,
 CUDA illegal memory accesses, or loud errors.
+### Inductor runtime assertions
+By default (on torch < 2.12), vLLM disables Inductor's runtime assertions
+(`assert_size_stride`, `assert_alignment`) to avoid ~2ms overhead per forward
+pass on large models. Setting `VLLM_LOGGING_LEVEL=DEBUG` automatically
+re-enables them so debugging sessions get full shape/stride validation:
+```sh
+VLLM_LOGGING_LEVEL=DEBUG vllm serve <model>
+```
+You can also override them explicitly via `--compilation-config`:
+```sh
+vllm serve <model> -cc.inductor_compile_config='{"size_asserts": true, "alignment_asserts": true, "scalar_asserts": true}'
+```
+On torch >= 2.12, PyTorch uses an efficient assert-once strategy and these
+flags are no longer suppressed by vLLM.
 To debug if TorchInductor is at fault, you can disable it by passing `backend='eager'`
 to the compilation config:

--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
@@ -5,6 +5,7 @@ from contextlib import nullcontext
 from unittest.mock import MagicMock, patch
 import pytest
+import torch
 from pydantic import ValidationError
 from vllm.compilation.counter import compilation_counter
@@ -612,3 +613,58 @@ def test_adjust_cudagraph_sizes_for_mamba_cache(
    # Invariant: last element == max_cudagraph_capture_size
    if expected_sizes:
        assert config.cudagraph_capture_sizes[-1] == config.max_cudagraph_capture_size
+def test_inductor_asserts_default_disabled(monkeypatch):
+    """Test that inductor runtime asserts are disabled by default
+    (INFO logging level) on torch < 2.12.
+    Sets VLLM_LOGGING_LEVEL=INFO, reloads vllm.envs, builds a default
+    CompilationConfig, and checks that the "size_asserts",
+    "alignment_asserts" and "scalar_asserts" entries of
+    inductor_compile_config are all False. The checks only run when
+    _is_torch_equal_or_newer(torch.__version__, "2.12.0.dev") is falsy;
+    otherwise the test makes no assertions."""
+    monkeypatch.setenv("VLLM_LOGGING_LEVEL", "INFO")
+    import importlib
+    import vllm.envs
+    importlib.reload(vllm.envs)  # presumably so envs re-reads the patched variable -- TODO confirm
+    config = CompilationConfig()
+    if not _is_torch_equal_or_newer(torch.__version__, "2.12.0.dev"):
+        assert config.inductor_compile_config.get("size_asserts") is False
+        assert config.inductor_compile_config.get("alignment_asserts") is False
+        assert config.inductor_compile_config.get("scalar_asserts") is False
+def test_inductor_asserts_enabled_in_debug(monkeypatch):
+    """Test that VLLM_LOGGING_LEVEL=DEBUG enables inductor runtime asserts
+    on torch < 2.12.
+    Sets VLLM_LOGGING_LEVEL=DEBUG, reloads vllm.envs, builds a default
+    CompilationConfig, and checks that the "size_asserts",
+    "alignment_asserts" and "scalar_asserts" entries of
+    inductor_compile_config are all True. The checks only run when
+    _is_torch_equal_or_newer(torch.__version__, "2.12.0.dev") is falsy;
+    otherwise the test makes no assertions."""
+    monkeypatch.setenv("VLLM_LOGGING_LEVEL", "DEBUG")
+    import importlib
+    import vllm.envs
+    importlib.reload(vllm.envs)  # presumably so envs re-reads the patched variable -- TODO confirm
+    config = CompilationConfig()
+    if not _is_torch_equal_or_newer(torch.__version__, "2.12.0.dev"):
+        assert config.inductor_compile_config.get("size_asserts") is True
+        assert config.inductor_compile_config.get("alignment_asserts") is True
+        assert config.inductor_compile_config.get("scalar_asserts") is True
+def test_inductor_asserts_user_override(monkeypatch):
+    """Test that explicit inductor_compile_config overrides the
+    debug-logging default.
+    With VLLM_LOGGING_LEVEL=INFO, a CompilationConfig is built with
+    inductor_compile_config={"size_asserts": True}. The user-supplied
+    "size_asserts" value must still be True afterwards (checked on every
+    torch version). "alignment_asserts", which the user did not set, must
+    be False, but that check only runs when
+    _is_torch_equal_or_newer(torch.__version__, "2.12.0.dev") is falsy."""
+    monkeypatch.setenv("VLLM_LOGGING_LEVEL", "INFO")
+    import importlib
+    import vllm.envs
+    importlib.reload(vllm.envs)  # presumably so envs re-reads the patched variable -- TODO confirm
+    config = CompilationConfig(
+        inductor_compile_config={"size_asserts": True},
+    )
+    assert config.inductor_compile_config.get("size_asserts") is True  # explicit user value is kept
+    if not _is_torch_equal_or_newer(torch.__version__, "2.12.0.dev"):
+        assert config.inductor_compile_config.get("alignment_asserts") is False
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -858,6 +858,25 @@ class CompilationConfig:
        if KEY not in self.inductor_compile_config:
            self.inductor_compile_config[KEY] = False
+        # Tie inductor runtime assertions to debug logging mode.
+        # These assertions add ~2ms overhead per forward pass on large
+        # models (e.g., DeepSeek-R1 671B: ~340 assert_size_stride + ~60
+        # assert_alignment calls per forward). PyTorch >= 2.12 has a
+        # native fix (assert-once), so we only apply this workaround on
+        # older versions. On torch < 2.12, enable asserts only when
+        # VLLM_LOGGING_LEVEL=DEBUG. Users can still override explicitly
+        # via --compilation-config '{"inductor_compile_config":
+        # {"size_asserts": true, ...}}'.
+        # See: https://github.com/pytorch/pytorch/issues/177719
+        if not is_torch_equal_or_newer("2.12.0.dev"):
+            enable_asserts = envs.VLLM_LOGGING_LEVEL == "DEBUG"
+            for key in (
+                "size_asserts",
+                "alignment_asserts",
+                "scalar_asserts",
+            ):
+                self.inductor_compile_config.setdefault(key, enable_asserts)
        for k, v in self.inductor_passes.items():
            if not isinstance(v, str):
                assert callable(v), f"pass {k} should be callable or a qualified name"