[Bugfix] Fix Whisper/encoder-decoder GPU memory leak (#32789)
Signed-off-by: NickLucche <nlucches@redhat.com>
(cherry picked from commit ea6102b85d)
@@ -176,3 +176,46 @@ def test_models_distributed(
         distributed_executor_backend=distributed_executor_backend,
         enforce_eager=False,
     )
+
+
+@pytest.mark.core_model
+@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
+def test_encoder_cache_cleanup(
+    vllm_runner,
+    model: str,
+    input_audios,
+    monkeypatch,
+) -> None:
+    """Test that encoder cache is properly cleaned up after requests complete.
+
+    This is a regression test for a bug where encoder cache entries were freed
+    in the same scheduling step they were allocated, before the model could use
+    them.
+    """
+    # Set single-process mode to access the model runner's encoder cache directly
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+    check_model_available(model)
+
+    with vllm_runner(
+        model,
+        dtype="half",
+        max_model_len=448,
+        tensor_parallel_size=1,
+        limit_mm_per_prompt={"audio": 2},
+        enforce_eager=True,
+    ) as vllm_model:
+        engine_core = vllm_model.llm.llm_engine.engine_core.engine_core
+        model_runner = engine_core.model_executor.driver_worker.worker.model_runner
+        encoder_cache = model_runner.encoder_cache
+
+        # Run multiple sequential requests to ensure cache is properly managed
+        for vllm_prompts, _, audios in input_audios:
+            vllm_model.generate_greedy(vllm_prompts, max_tokens=50, audios=audios)
+
+        # After all requests complete, encoder cache should be empty
+        cache_size = len(encoder_cache)
+        assert cache_size == 0, (
+            f"Encoder cache should be empty after all requests complete, "
+            f"but has {cache_size} entries. This indicates encoder cache "
+            f"entries are not being properly freed."
+        )
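For context, the sketch below (plain Python, not vLLM code) models the step ordering the new test guards against, under the assumption stated in the comments of this diff: the runner frees the hashes reported by the scheduler *before* executing the model. Reporting a hash in the same step it is allocated therefore frees nothing, and the embeddings cached later in that step are never released.

encoder_cache: dict[str, str] = {}  # stands in for the runner's mm_hash -> embeddings map

def runner_step(freed_hashes: list[str], hashes_to_compute: list[str]) -> None:
    # 1) Free whatever the scheduler reported (a no-op if the entry does not exist yet).
    for mm_hash in freed_hashes:
        encoder_cache.pop(mm_hash, None)
    # 2) Only then run the encoder and cache its output.
    for mm_hash in hashes_to_compute:
        encoder_cache[mm_hash] = f"embeds({mm_hash})"

# Old behaviour: the hash is reported as freed in the very step it is allocated.
runner_step(freed_hashes=["audio-0"], hashes_to_compute=["audio-0"])
assert encoder_cache  # the entry survives every later step -> GPU memory leak

# Fixed behaviour: the free signal is deferred by one step (the `to_free` buffer below).
encoder_cache.clear()
runner_step(freed_hashes=[], hashes_to_compute=["audio-0"])
runner_step(freed_hashes=["audio-0"], hashes_to_compute=[])
assert not encoder_cache  # the entry is released on the following step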
@@ -357,7 +357,8 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
     def __init__(self, cache_size: int):
         self.cache_size = cache_size
         self.num_free_slots = cache_size
-        self.freed: list[str] = []
+        self.allocated: list[str] = []
+        self.to_free: list[str] = []
 
     def check_and_update_cache(self, request: Request, input_id: int) -> bool:
         return False
@@ -383,7 +384,7 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
         self.num_free_slots -= num_encoder_embeds
 
         mm_hash = request.mm_features[input_id].identifier
-        self.freed.append(mm_hash)
+        self.allocated.append(mm_hash)
 
     def free(self, request: Request) -> None:
         for input_id in range(len(request.mm_features)):
@@ -393,9 +394,14 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
         return set(range(len(request.mm_features)))
 
     def get_freed_mm_hashes(self) -> list[str]:
-        freed = self.freed
-        self.freed = []
-        return freed
+        # As the encoder cache is not reused for enc-dec models, entries could be
+        # freed here, but the actual free happens in the runner *before* the model
+        # is executed. `to_free` therefore buffers the entries so they are freed
+        # only after execution, mimicking the state transition of `EncoderCacheManager`.
+        to_free = self.to_free
+        self.to_free = self.allocated
+        self.allocated = []
+        return to_free
 
     def free_encoder_input(self, request: Request, input_id: int) -> None:
         num_encoder_embeds = request.get_num_encoder_embeds(input_id)
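The change above can be read as a one-step deferral implemented with two lists. The toy class below is a sketch, not vLLM code; only the names `allocated`, `to_free`, and `get_freed_mm_hashes` mirror the diff, and `allocate` is a hypothetical helper. It shows the rotation: hashes recorded while scheduling a step are only returned, and thus freed, on the next call.

class DeferredFreeBuffer:
    """Toy model of the two-list rotation used by EncoderDecoderCacheManager."""

    def __init__(self) -> None:
        self.allocated: list[str] = []  # hashes recorded since the last call
        self.to_free: list[str] = []    # hashes recorded one call earlier

    def allocate(self, mm_hash: str) -> None:  # hypothetical helper for this sketch
        self.allocated.append(mm_hash)

    def get_freed_mm_hashes(self) -> list[str]:
        to_free = self.to_free
        self.to_free = self.allocated
        self.allocated = []
        return to_free

buf = DeferredFreeBuffer()
buf.allocate("audio-0")
print(buf.get_freed_mm_hashes())  # [] -- nothing is freed in the allocation step
print(buf.get_freed_mm_hashes())  # ['audio-0'] -- freed on the next step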