[BugFix] VLLM_DISABLE_COMPILE_CACHE=1 should disable all reads and writes from the cache (#20942)

Signed-off-by: Richard Zou <zou3519@gmail.com>
This commit is contained in:
Richard Zou
2025-07-14 21:26:18 -04:00
committed by GitHub
parent 8cdc371217
commit ba8c300018
4 changed files with 33 additions and 2 deletions

View File

@@ -26,6 +26,30 @@ def test_use_cudagraphs_dynamic(monkeypatch):
assert not vllm_config.compilation_config.use_cudagraph
# NB: We don't test VLLM_DISABLE_COMPILE_CACHE=0 because that depends
# on the state of the cache directory on the current machine, which
# may be influenced by other tests.
@pytest.mark.parametrize("val", ["1"])
def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
assert vllm.envs.VLLM_USE_V1
# spawn means that the counters are in the same process.
monkeypatch.setenv('VLLM_WORKER_MULTIPROC_METHOD', "spawn")
monkeypatch.setenv('VLLM_DISABLE_COMPILE_CACHE', val)
compilation_config = {
"use_cudagraph": False, # speed things up a bit
}
with (
compilation_counter.expect(num_cache_entries_updated=0,
num_compiled_artifacts_saved=0),
# loading the model causes compilation (if enabled) to happen
vllm_runner('facebook/opt-125m',
compilation_config=compilation_config,
gpu_memory_utilization=0.4) as _):
pass
@pytest.mark.parametrize("enabled", [True, False])
def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
assert vllm.envs.VLLM_USE_V1