[Bugfix] Fix Whisper/encoder-decoder GPU memory leak (#32789)

Signed-off-by: NickLucche <nlucches@redhat.com>
This commit is contained in:
Nicolò Lucchesi
2026-01-22 11:50:37 +01:00
committed by GitHub
parent 328cbb2773
commit ea6102b85d
2 changed files with 54 additions and 5 deletions

View File

@@ -176,3 +176,46 @@ def test_models_distributed(
distributed_executor_backend=distributed_executor_backend,
enforce_eager=False,
)
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
def test_encoder_cache_cleanup(
vllm_runner,
model: str,
input_audios,
monkeypatch,
) -> None:
"""Test that encoder cache is properly cleaned up after requests complete.
This is a regression test for a bug where encoder cache entries were freed
in the same scheduling step they were allocated, before the model could use
them.
"""
# Set single-process mode to access the model runner's encoder cache directly
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
check_model_available(model)
with vllm_runner(
model,
dtype="half",
max_model_len=448,
tensor_parallel_size=1,
limit_mm_per_prompt={"audio": 2},
enforce_eager=True,
) as vllm_model:
engine_core = vllm_model.llm.llm_engine.engine_core.engine_core
model_runner = engine_core.model_executor.driver_worker.worker.model_runner
encoder_cache = model_runner.encoder_cache
# Run multiple sequential requests to ensure cache is properly managed
for vllm_prompts, _, audios in input_audios:
vllm_model.generate_greedy(vllm_prompts, max_tokens=50, audios=audios)
# After all requests complete, encoder cache should be empty
cache_size = len(encoder_cache)
assert cache_size == 0, (
f"Encoder cache should be empty after all requests complete, "
f"but has {cache_size} entries. This indicates encoder cache "
f"entries are not being properly freed."
)

View File

@@ -357,7 +357,8 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
def __init__(self, cache_size: int):
self.cache_size = cache_size
self.num_free_slots = cache_size
self.freed: list[str] = []
self.allocated: list[str] = []
self.to_free: list[str] = []
def check_and_update_cache(self, request: Request, input_id: int) -> bool:
return False
@@ -383,7 +384,7 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
self.num_free_slots -= num_encoder_embeds
mm_hash = request.mm_features[input_id].identifier
self.freed.append(mm_hash)
self.allocated.append(mm_hash)
def free(self, request: Request) -> None:
for input_id in range(len(request.mm_features)):
@@ -393,9 +394,14 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
return set(range(len(request.mm_features)))
def get_freed_mm_hashes(self) -> list[str]:
freed = self.freed
self.freed = []
return freed
# As encoder cache is not used for enc-dec models, we can free the entries here
# The actual free happens in the runner, *before* the model is executed.
# Therefore, `freeable` acts as a buffer to free the entries only after the
# model is executed, mimicking the state transition of `EncoderCacheManager`.
to_free = self.to_free
self.to_free = self.allocated
self.allocated = []
return to_free
def free_encoder_input(self, request: Request, input_id: int) -> None:
num_encoder_embeds = request.get_num_encoder_embeds(input_id)