[Perf] Disable inductor runtime asserts by default for serving performance (#37485)

Signed-off-by: tianrengao <terrygao87@gmail.com>
Co-authored-by: Tianren Gao <tianren@fb.com>
This commit is contained in:
Terry Gao
2026-03-24 16:37:51 -07:00
committed by GitHub
parent a0d487b2e1
commit 82580b10ac
3 changed files with 95 additions and 0 deletions

View File

@@ -233,6 +233,26 @@ that may call 1+ triton kernels. On rare (but unfortunate) occasions, it may
produce an incorrect triton kernel. This may manifest as silent incorrectness,
CUDA illegal memory accesses, or loud errors.
### Inductor runtime assertions
By default (on torch < 2.12), vLLM disables Inductor's runtime assertions
(`assert_size_stride`, `assert_alignment`; controlled by the `size_asserts`,
`alignment_asserts`, and `scalar_asserts` Inductor config flags) to avoid
~2ms overhead per forward pass on large models. Setting
`VLLM_LOGGING_LEVEL=DEBUG` automatically re-enables them so debugging
sessions get full shape/stride validation:
```sh
VLLM_LOGGING_LEVEL=DEBUG vllm serve <model>
```
You can also override them explicitly via `--compilation-config` (shown here in its `-cc` short form):
```sh
vllm serve <model> -cc.inductor_compile_config='{"size_asserts": true, "alignment_asserts": true, "scalar_asserts": true}'
```
On torch >= 2.12, PyTorch uses an efficient assert-once strategy and these
flags are no longer suppressed by vLLM.
To debug if TorchInductor is at fault, you can disable it by passing `backend='eager'`
to the compilation config:

View File

@@ -5,6 +5,7 @@ from contextlib import nullcontext
from unittest.mock import MagicMock, patch
import pytest
import torch
from pydantic import ValidationError
from vllm.compilation.counter import compilation_counter
@@ -612,3 +613,58 @@ def test_adjust_cudagraph_sizes_for_mamba_cache(
# Invariant: last element == max_cudagraph_capture_size
if expected_sizes:
assert config.cudagraph_capture_sizes[-1] == config.max_cudagraph_capture_size
def test_inductor_asserts_default_disabled(monkeypatch):
    """Check the default (INFO logging level) inductor assert flags.

    On torch < 2.12 a default ``CompilationConfig`` must carry
    ``size_asserts``, ``alignment_asserts`` and ``scalar_asserts`` set to
    ``False``. On newer torch versions nothing is asserted.
    """
    import importlib

    import vllm.envs

    monkeypatch.setenv("VLLM_LOGGING_LEVEL", "INFO")
    # Re-execute vllm.envs after patching the environment variable.
    importlib.reload(vllm.envs)

    inductor_cfg = CompilationConfig().inductor_compile_config
    if _is_torch_equal_or_newer(torch.__version__, "2.12.0.dev"):
        return

    for flag in ("size_asserts", "alignment_asserts", "scalar_asserts"):
        assert inductor_cfg.get(flag) is False
def test_inductor_asserts_enabled_in_debug(monkeypatch):
    """Check the inductor assert flags under ``VLLM_LOGGING_LEVEL=DEBUG``.

    On torch < 2.12 a default ``CompilationConfig`` must carry
    ``size_asserts``, ``alignment_asserts`` and ``scalar_asserts`` set to
    ``True``. On newer torch versions nothing is asserted.
    """
    import importlib

    import vllm.envs

    monkeypatch.setenv("VLLM_LOGGING_LEVEL", "DEBUG")
    # Re-execute vllm.envs after patching the environment variable.
    importlib.reload(vllm.envs)

    inductor_cfg = CompilationConfig().inductor_compile_config
    if _is_torch_equal_or_newer(torch.__version__, "2.12.0.dev"):
        return

    for flag in ("size_asserts", "alignment_asserts", "scalar_asserts"):
        assert inductor_cfg.get(flag) is True
def test_inductor_asserts_user_override(monkeypatch):
    """Check that an explicit ``inductor_compile_config`` entry is kept.

    With INFO logging, a user-supplied ``size_asserts=True`` must survive
    config construction on every torch version. On torch < 2.12 the flag
    the user did not set (``alignment_asserts``) must be ``False``.
    """
    import importlib

    import vllm.envs

    monkeypatch.setenv("VLLM_LOGGING_LEVEL", "INFO")
    # Re-execute vllm.envs after patching the environment variable.
    importlib.reload(vllm.envs)

    user_settings = {"size_asserts": True}
    inductor_cfg = CompilationConfig(
        inductor_compile_config=user_settings,
    ).inductor_compile_config

    # The explicit user value is checked regardless of torch version.
    assert inductor_cfg.get("size_asserts") is True

    if _is_torch_equal_or_newer(torch.__version__, "2.12.0.dev"):
        return
    assert inductor_cfg.get("alignment_asserts") is False

View File

@@ -858,6 +858,25 @@ class CompilationConfig:
if KEY not in self.inductor_compile_config:
self.inductor_compile_config[KEY] = False
# Tie inductor runtime assertions to debug logging mode.
# These assertions add ~2ms overhead per forward pass on large
# models (e.g., DeepSeek-R1 671B: ~340 assert_size_stride + ~60
# assert_alignment calls per forward). PyTorch >= 2.12 has a
# native fix (assert-once), so we only apply this workaround on
# older versions. On torch < 2.12, enable asserts only when
# VLLM_LOGGING_LEVEL=DEBUG. Users can still override explicitly
# via --compilation-config '{"inductor_compile_config":
# {"size_asserts": true, ...}}'.
# See: https://github.com/pytorch/pytorch/issues/177719
if not is_torch_equal_or_newer("2.12.0.dev"):
enable_asserts = envs.VLLM_LOGGING_LEVEL == "DEBUG"
for key in (
"size_asserts",
"alignment_asserts",
"scalar_asserts",
):
self.inductor_compile_config.setdefault(key, enable_asserts)
for k, v in self.inductor_passes.items():
if not isinstance(v, str):
assert callable(v), f"pass {k} should be callable or a qualified name"