[Core][MM] Optimize encoder cache manager by operating with embeddings only (#30475)

Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Sun Kim <sunytokki@gmail.com>
2025-12-16 14:18:17 -08:00
parent 9fec0e13d5
commit f5f51e5931
14 changed files with 306 additions and 130 deletions
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -209,10 +209,10 @@ class Request:
    def get_finished_reason(self) -> FinishReason | None:
        return RequestStatus.get_finished_reason(self.status)

-    def get_num_encoder_tokens(self, input_id: int) -> int:
+    def get_num_encoder_embeds(self, input_id: int) -> int:
        assert input_id < len(self.mm_features)
-        num_tokens = self.mm_features[input_id].mm_position.length
-        return num_tokens
+        num_embeds = self.mm_features[input_id].mm_position.get_num_embeds
+        return num_embeds

    def record_event(
        self,