[Perf] Disable inductor runtime asserts by default for serving perfor… (#37485)
Signed-off-by: tianrengao <terrygao87@gmail.com> Co-authored-by: Tianren Gao <tianren@fb.com>
This commit is contained in:
@@ -233,6 +233,26 @@ that may call 1+ triton kernels. On rare (but unfortunate) occasions, it may
|
||||
produce an incorrect triton kernel. This may manifest as silent incorrectness,
|
||||
CUDA illegal memory accesses, or loud errors.
|
||||
|
||||
### Inductor runtime assertions
|
||||
|
||||
By default (on torch < 2.12), vLLM disables Inductor's runtime assertions
|
||||
(`assert_size_stride`, `assert_alignment`) to avoid ~2ms overhead per forward
|
||||
pass on large models. Setting `VLLM_LOGGING_LEVEL=DEBUG` automatically
|
||||
re-enables them so debugging sessions get full shape/stride validation:
|
||||
|
||||
```sh
|
||||
VLLM_LOGGING_LEVEL=DEBUG vllm serve <model>
|
||||
```
|
||||
|
||||
You can also override them explicitly via `--compilation-config`:
|
||||
|
||||
```sh
|
||||
vllm serve <model> -cc.inductor_compile_config='{"size_asserts": true, "alignment_asserts": true, "scalar_asserts": true}'
|
||||
```
|
||||
|
||||
On torch >= 2.12, PyTorch uses an efficient assert-once strategy and these
|
||||
flags are no longer suppressed by vLLM.
|
||||
|
||||
To determine whether TorchInductor is at fault, you can disable it by passing `backend='eager'`
|
||||
to the compilation config:
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ from contextlib import nullcontext
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from pydantic import ValidationError
|
||||
|
||||
from vllm.compilation.counter import compilation_counter
|
||||
@@ -612,3 +613,58 @@ def test_adjust_cudagraph_sizes_for_mamba_cache(
|
||||
# Invariant: last element == max_cudagraph_capture_size
|
||||
if expected_sizes:
|
||||
assert config.cudagraph_capture_sizes[-1] == config.max_cudagraph_capture_size
|
||||
|
||||
|
||||
def test_inductor_asserts_default_disabled(monkeypatch):
    """Check the default (INFO logging) state of inductor runtime asserts.

    On torch < 2.12 a default ``CompilationConfig`` is expected to carry
    ``False`` for ``size_asserts``, ``alignment_asserts`` and
    ``scalar_asserts``. On newer torch versions nothing is asserted.
    """
    monkeypatch.setenv("VLLM_LOGGING_LEVEL", "INFO")

    import importlib

    import vllm.envs

    # Reload the envs module after changing the environment variable
    # (presumably so the new logging level is picked up -- confirm).
    importlib.reload(vllm.envs)

    config = CompilationConfig()
    if _is_torch_equal_or_newer(torch.__version__, "2.12.0.dev"):
        # The expectations below only apply to torch < 2.12.
        return

    inductor_flags = config.inductor_compile_config
    for flag in ("size_asserts", "alignment_asserts", "scalar_asserts"):
        assert inductor_flags.get(flag) is False
|
||||
|
||||
|
||||
def test_inductor_asserts_enabled_in_debug(monkeypatch):
    """Check that DEBUG logging turns inductor runtime asserts on.

    With ``VLLM_LOGGING_LEVEL=DEBUG`` on torch < 2.12, a default
    ``CompilationConfig`` is expected to carry ``True`` for
    ``size_asserts``, ``alignment_asserts`` and ``scalar_asserts``.
    On newer torch versions nothing is asserted.
    """
    monkeypatch.setenv("VLLM_LOGGING_LEVEL", "DEBUG")

    import importlib

    import vllm.envs

    # Reload the envs module after changing the environment variable
    # (presumably so the new logging level is picked up -- confirm).
    importlib.reload(vllm.envs)

    config = CompilationConfig()
    if _is_torch_equal_or_newer(torch.__version__, "2.12.0.dev"):
        # The expectations below only apply to torch < 2.12.
        return

    inductor_flags = config.inductor_compile_config
    for flag in ("size_asserts", "alignment_asserts", "scalar_asserts"):
        assert inductor_flags.get(flag) is True
|
||||
|
||||
|
||||
def test_inductor_asserts_user_override(monkeypatch):
    """Check that an explicit ``inductor_compile_config`` entry is kept.

    Under INFO logging, a user-supplied ``size_asserts=True`` must survive
    on every torch version. On torch < 2.12 the flag the user did not set
    (``alignment_asserts``) is still expected to be ``False``.
    """
    monkeypatch.setenv("VLLM_LOGGING_LEVEL", "INFO")

    import importlib

    import vllm.envs

    # Reload the envs module after changing the environment variable
    # (presumably so the new logging level is picked up -- confirm).
    importlib.reload(vllm.envs)

    config = CompilationConfig(inductor_compile_config={"size_asserts": True})
    inductor_flags = config.inductor_compile_config

    # The explicit user value is checked regardless of torch version.
    assert inductor_flags.get("size_asserts") is True

    older_torch = not _is_torch_equal_or_newer(torch.__version__, "2.12.0.dev")
    if older_torch:
        assert inductor_flags.get("alignment_asserts") is False
|
||||
|
||||
@@ -858,6 +858,25 @@ class CompilationConfig:
|
||||
if KEY not in self.inductor_compile_config:
|
||||
self.inductor_compile_config[KEY] = False
|
||||
|
||||
# Tie inductor runtime assertions to debug logging mode.
|
||||
# These assertions add ~2ms overhead per forward pass on large
|
||||
# models (e.g., DeepSeek-R1 671B: ~340 assert_size_stride + ~60
|
||||
# assert_alignment calls per forward). PyTorch >= 2.12 has a
|
||||
# native fix (assert-once), so we only apply this workaround on
|
||||
# older versions. On torch < 2.12, enable asserts only when
|
||||
# VLLM_LOGGING_LEVEL=DEBUG. Users can still override explicitly
|
||||
# via --compilation-config '{"inductor_compile_config":
|
||||
# {"size_asserts": true, ...}}'.
|
||||
# See: https://github.com/pytorch/pytorch/issues/177719
|
||||
if not is_torch_equal_or_newer("2.12.0.dev"):
|
||||
enable_asserts = envs.VLLM_LOGGING_LEVEL == "DEBUG"
|
||||
for key in (
|
||||
"size_asserts",
|
||||
"alignment_asserts",
|
||||
"scalar_asserts",
|
||||
):
|
||||
self.inductor_compile_config.setdefault(key, enable_asserts)
|
||||
|
||||
for k, v in self.inductor_passes.items():
|
||||
if not isinstance(v, str):
|
||||
assert callable(v), f"pass {k} should be callable or a qualified name"
|
||||
|
||||
Reference in New Issue
Block a user