[Bugfix] Fix Whisper/encoder-decoder GPU memory leak (#32789)

Signed-off-by: NickLucche <nlucches@redhat.com>
2026-01-22 11:50:37 +01:00
parent 328cbb2773
commit ea6102b85d
2 changed files with 54 additions and 5 deletions
--- a/vllm/v1/core/encoder_cache_manager.py
+++ b/vllm/v1/core/encoder_cache_manager.py
@@ -357,7 +357,8 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
    def __init__(self, cache_size: int):
        self.cache_size = cache_size
        self.num_free_slots = cache_size
-        self.freed: list[str] = []
+        self.allocated: list[str] = []
+        self.to_free: list[str] = []

    def check_and_update_cache(self, request: Request, input_id: int) -> bool:
        return False
@@ -383,7 +384,7 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
        self.num_free_slots -= num_encoder_embeds

        mm_hash = request.mm_features[input_id].identifier
-        self.freed.append(mm_hash)
+        self.allocated.append(mm_hash)

    def free(self, request: Request) -> None:
        for input_id in range(len(request.mm_features)):
@@ -393,9 +394,14 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
        return set(range(len(request.mm_features)))

    def get_freed_mm_hashes(self) -> list[str]:
-        freed = self.freed
-        self.freed = []
-        return freed
+        # As encoder cache is not used for enc-dec models, we can free the entries here
+        # The actual free happens in the runner, *before* the model is executed.
+        # Therefore, `freeable` acts as a buffer to free the entries only after the
+        # model is executed, mimicking the state transition of `EncoderCacheManager`.
+        to_free = self.to_free
+        self.to_free = self.allocated
+        self.allocated = []
+        return to_free

    def free_encoder_input(self, request: Request, input_id: int) -> None:
        num_encoder_embeds = request.get_num_encoder_embeds(input_id)