[Metrics] Log multi-modal cache stats and fix reset (#26285)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-10-10 16:45:55 +08:00
committed by GitHub
parent 6f0f570c43
commit ad430a67ca
25 changed files with 586 additions and 235 deletions

View File

@@ -21,6 +21,7 @@ from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import length_from_prompt_token_ids_or_embeds
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.metrics.stats import MultiModalCacheStats
from vllm.v1.structured_output.backend_guidance import validate_guidance_grammar
from vllm.v1.structured_output.backend_lm_format_enforcer import (
validate_structured_output_request_lm_format_enforcer,
@@ -573,5 +574,8 @@ class Processor:
# check that chunked prefill does not truncate them
# max_batch_len = self.scheduler_config.max_num_batched_tokens
def clear_cache(self) -> None:
    """Clear the processor's input-preprocessing cache.

    Thin delegation to ``self.input_preprocessor.clear_cache()``; any
    cache-eviction semantics live in the preprocessor itself.
    """
    self.input_preprocessor.clear_cache()
def stat_mm_cache(self) -> Optional[MultiModalCacheStats]:
    """Return multi-modal cache statistics, or ``None`` if unavailable.

    Thin delegation to ``self.input_preprocessor.stat_mm_cache()``.
    NOTE(review): ``None`` presumably indicates no multi-modal cache is
    in use — confirm against the preprocessor implementation.
    """
    return self.input_preprocessor.stat_mm_cache()
def clear_mm_cache(self) -> None:
    """Clear the multi-modal input cache.

    Thin delegation to ``self.input_preprocessor.clear_mm_cache()``.
    """
    self.input_preprocessor.clear_mm_cache()