[Metrics] Log multi-modal cache stats and fix reset (#26285)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-10 16:45:55 +08:00
parent 6f0f570c43
commit ad430a67ca
25 changed files with 586 additions and 235 deletions
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -19,6 +19,7 @@ from vllm.multimodal.inputs import (
 from vllm.multimodal.processing import BaseMultiModalProcessor
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils.jsontree import json_iter_leaves
+from vllm.v1.metrics.stats import MultiModalCacheStats

 from .data import (
    DecoderOnlyInputs,
@@ -56,6 +57,8 @@ class InputPreprocessor:
        self.mm_registry = mm_registry
        self.mm_processor_cache = mm_processor_cache

+        self.mm_cache_stats = MultiModalCacheStats() if mm_processor_cache else None
+
    def get_tokenizer(self) -> AnyTokenizer:
        if self.tokenizer is None:
            raise ValueError(
@@ -664,14 +667,13 @@ class InputPreprocessor:

        return self._build_decoder_only_llm_inputs(prompt_comps)

-    def preprocess(
+    def _preprocess(
        self,
        prompt: PromptType,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        *,
        mm_uuids: Optional[MultiModalUUIDDict] = None,
    ) -> ProcessorInputs:
-        """Preprocess the input prompt."""
        if self.model_config.is_encoder_decoder:
            # Encoder-decoder model requires special mapping of
            # input prompts to encoder & decoder.
@@ -694,6 +696,40 @@ class InputPreprocessor:
            mm_uuids=mm_uuids,
        )

-    def clear_cache(self) -> None:
+    def preprocess(
+        self,
+        prompt: PromptType,
+        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        *,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
+    ) -> ProcessorInputs:
+        """Preprocess the input prompt and record multi-modal cache stats.
+
+        Delegates to `_preprocess`; when stats tracking is enabled, counts one
+        request and adds the cache's query/hit delta to `mm_cache_stats`.
+        """
+        res = self._preprocess(prompt, tokenization_kwargs, mm_uuids=mm_uuids)
+        # Use "is not None" (as clear_mm_cache does), not object truthiness.
+        if self.mm_processor_cache is not None and self.mm_cache_stats is not None:
+            delta = self.mm_processor_cache.make_stats(delta=True)
+            self.mm_cache_stats.requests += 1
+            self.mm_cache_stats.queries += delta.total
+            self.mm_cache_stats.hits += delta.hits
+
+        return res
+
+    def stat_mm_cache(self) -> Optional[MultiModalCacheStats]:
+        """Return the stats gathered so far and start a fresh accumulator."""
+        snapshot = self.mm_cache_stats
+        if snapshot is not None:
+            # Swap in an empty accumulator; the old one goes to the caller.
+            self.mm_cache_stats = MultiModalCacheStats()
+
+        return snapshot
+
+    def clear_mm_cache(self) -> None:
+        """Clear the multi-modal processor cache and flag its stats as reset."""
        if self.mm_processor_cache is not None:
            self.mm_processor_cache.clear_cache()
+        if self.mm_cache_stats is not None:
+            self.mm_cache_stats.reset = True