diff --git a/docs/design/debug_vllm_compile.md b/docs/design/debug_vllm_compile.md index af4a9ea10..fbee9f4c3 100644 --- a/docs/design/debug_vllm_compile.md +++ b/docs/design/debug_vllm_compile.md @@ -233,6 +233,26 @@ that may call 1+ triton kernels. On rare (but unfortunate) occasions, it may produce an incorrect triton kernel. This may manifest as silent incorrectness, CUDA illegal memory accesses, or loud errors. +### Inductor runtime assertions + +By default (on torch < 2.12), vLLM disables Inductor's runtime assertions +(`assert_size_stride`, `assert_alignment`, and scalar asserts) to avoid ~2ms overhead per forward +pass on large models. Setting `VLLM_LOGGING_LEVEL=DEBUG` automatically +re-enables them so debugging sessions get full shape/stride validation: + +```sh +VLLM_LOGGING_LEVEL=DEBUG vllm serve +``` + +You can also override them explicitly via `--compilation-config`: + +```sh +vllm serve -cc.inductor_compile_config='{"size_asserts": true, "alignment_asserts": true, "scalar_asserts": true}' +``` + +On torch >= 2.12, PyTorch uses an efficient assert-once strategy and these +flags are no longer suppressed by vLLM. 
+ To debug if TorchInductor is at fault, you can disable it by passing `backend='eager'` to the compilation config: diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index c22a4be50..6b8f3b60b 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -5,6 +5,7 @@ from contextlib import nullcontext from unittest.mock import MagicMock, patch import pytest +import torch from pydantic import ValidationError from vllm.compilation.counter import compilation_counter @@ -612,3 +613,58 @@ def test_adjust_cudagraph_sizes_for_mamba_cache( # Invariant: last element == max_cudagraph_capture_size if expected_sizes: assert config.cudagraph_capture_sizes[-1] == config.max_cudagraph_capture_size + + +def test_inductor_asserts_default_disabled(monkeypatch): + """Test that inductor runtime asserts are disabled by default + (INFO logging level) on torch < 2.12.""" + monkeypatch.setenv("VLLM_LOGGING_LEVEL", "INFO") + + import importlib + + import vllm.envs + + importlib.reload(vllm.envs) + + config = CompilationConfig() + if not _is_torch_equal_or_newer(torch.__version__, "2.12.0.dev"): + assert config.inductor_compile_config.get("size_asserts") is False + assert config.inductor_compile_config.get("alignment_asserts") is False + assert config.inductor_compile_config.get("scalar_asserts") is False + + +def test_inductor_asserts_enabled_in_debug(monkeypatch): + """Test that VLLM_LOGGING_LEVEL=DEBUG enables inductor runtime asserts + on torch < 2.12.""" + monkeypatch.setenv("VLLM_LOGGING_LEVEL", "DEBUG") + + import importlib + + import vllm.envs + + importlib.reload(vllm.envs) + + config = CompilationConfig() + if not _is_torch_equal_or_newer(torch.__version__, "2.12.0.dev"): + assert config.inductor_compile_config.get("size_asserts") is True + assert config.inductor_compile_config.get("alignment_asserts") is True + assert config.inductor_compile_config.get("scalar_asserts") is True + + +def test_inductor_asserts_user_override(monkeypatch): + 
"""Test that explicit inductor_compile_config overrides the + debug-logging default.""" + monkeypatch.setenv("VLLM_LOGGING_LEVEL", "INFO") + + import importlib + + import vllm.envs + + importlib.reload(vllm.envs) + + config = CompilationConfig( + inductor_compile_config={"size_asserts": True}, + ) + assert config.inductor_compile_config.get("size_asserts") is True + if not _is_torch_equal_or_newer(torch.__version__, "2.12.0.dev"): + assert config.inductor_compile_config.get("alignment_asserts") is False diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 1c102582f..9e4196a44 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -858,6 +858,25 @@ class CompilationConfig: if KEY not in self.inductor_compile_config: self.inductor_compile_config[KEY] = False + # Tie inductor runtime assertions to debug logging mode. + # These assertions add ~2ms overhead per forward pass on large + # models (e.g., DeepSeek-R1 671B: ~340 assert_size_stride + ~60 + # assert_alignment calls per forward). PyTorch >= 2.12 has a + # native fix (assert-once), so we only apply this workaround on + # older versions. On torch < 2.12, enable asserts only when + # VLLM_LOGGING_LEVEL=DEBUG. Users can still override explicitly + # via --compilation-config '{"inductor_compile_config": + # {"size_asserts": true, ...}}'. + # See: https://github.com/pytorch/pytorch/issues/177719 + if not is_torch_equal_or_newer("2.12.0.dev"): + enable_asserts = envs.VLLM_LOGGING_LEVEL == "DEBUG" + for key in ( + "size_asserts", + "alignment_asserts", + "scalar_asserts", + ): + self.inductor_compile_config.setdefault(key, enable_asserts) + for k, v in self.inductor_passes.items(): if not isinstance(v, str): assert callable(v), f"pass {k} should be callable or a qualified name"