[Core] Store only the keys for multi-modal data in P0 (#22198)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author: Cyrus Leung
Date: 2025-08-07 16:45:04 +08:00
parent 289b18e670
commit 766bc8162c
17 changed files with 325 additions and 234 deletions
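
For context: the P0-side `MirroredProcessingCache` previously mirrored the fully processed multi-modal inputs, while the new `MultiModalInputCacheClient` tracks only their content hashes so that already-sent payloads can be elided. A minimal sketch of that key-only pattern follows, assuming an LRU of hashes; the names (`KeyOnlyCache`, `MAX_KEYS`) are hypothetical stand-ins, not vLLM's actual implementation.

# Minimal sketch of a key-only client cache, assuming an LRU of hashes.
# KeyOnlyCache and MAX_KEYS are hypothetical, not vLLM's API.
from collections import OrderedDict
from typing import Any, Optional

MAX_KEYS = 256

class KeyOnlyCache:
    """P0 side: remember which hashes were already sent to the engine core."""

    def __init__(self) -> None:
        self._keys: OrderedDict[str, None] = OrderedDict()

    def get_and_update(
        self,
        inputs: list[Any],
        hashes: list[str],
    ) -> list[Optional[Any]]:
        out: list[Optional[Any]] = []
        for item, key in zip(inputs, hashes):
            if key in self._keys:
                # Already sent once: transmit only the key (None placeholder).
                self._keys.move_to_end(key)
                out.append(None)
            else:
                self._keys[key] = None
                if len(self._keys) > MAX_KEYS:
                    self._keys.popitem(last=False)  # evict the oldest key
                out.append(item)  # first time: send the full payload
        return out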


@@ -19,7 +19,7 @@ from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup
 from vllm.v1.engine import EngineCoreRequest
-from vllm.v1.engine.mm_input_cache import MirroredProcessingCache
+from vllm.v1.engine.mm_input_cache import MultiModalInputCacheClient
 from vllm.v1.structured_output.backend_guidance import (
     validate_guidance_grammar)
 from vllm.v1.structured_output.backend_outlines import (
@@ -50,11 +50,8 @@ class Processor:
             self.tokenizer,
             mm_registry)
-        self.mm_input_cache_client = MirroredProcessingCache(self.model_config)
-
-        # Multi-modal hasher (for images)
-        self.use_hash = self.mm_input_cache_client.use_cache or \
-            self.cache_config.enable_prefix_caching
+        self.mm_input_cache_client = MultiModalInputCacheClient(
+            self.model_config)
 
     @property
     def mm_registry(self):
@@ -256,11 +253,13 @@ class Processor:
         # 1. Tokenize text prompt, with LoRA request if one exists.
         # 2. For multimodal models with a merged preprocessor, preprocess
         #    multimodal data and expand prompt token ids accordingly.
+        return_mm_hashes = (self.model_config.processor_return_mm_hashes
+                            or bool(self.cache_config.enable_prefix_caching))
         processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
             prompt,
             tokenization_kwargs=tokenization_kwargs,
             lora_request=lora_request,
-            return_mm_hashes=self.use_hash,
+            return_mm_hashes=return_mm_hashes,
         )
 
         from vllm.platforms import current_platform
         current_platform.validate_request(
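
With `self.use_hash` removed from the constructor, the decision to request hashes is now made per request. A hedged restatement of that flag, using only the config attributes visible in the hunk above (`needs_mm_hashes` is a hypothetical helper):

# Hashes are needed when the processor is configured to return them,
# or when prefix caching will hash blocks containing multi-modal items.
def needs_mm_hashes(model_config, cache_config) -> bool:
    return (model_config.processor_return_mm_hashes
            or bool(cache_config.enable_prefix_caching))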
@@ -312,7 +311,7 @@ class Processor:
             sorted_mm_hashes,
         ) = merge_and_sort_multimodal_metadata(
             decoder_inputs["mm_placeholders"],
-            decoder_inputs["mm_hashes"] if self.use_hash else None,
+            decoder_inputs["mm_hashes"] if return_mm_hashes else None,
         )
 
         # The output of merged multi-modal processor (`decoder_mm_inputs`)
@@ -339,7 +338,7 @@ class Processor:
         ]
 
         if sorted_mm_hashes is not None:
-            sorted_mm_inputs = self.mm_input_cache_client.get_and_update_p0(
+            sorted_mm_inputs = self.mm_input_cache_client.get_and_update(
                 orig_sorted_mm_inputs, sorted_mm_hashes)
         else:
             sorted_mm_inputs = orig_sorted_mm_inputs
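
A usage sketch for the `KeyOnlyCache` stand-in from above, mirroring the renamed `get_and_update` call: on a repeat request the full payload is replaced by a placeholder and only the key travels to the engine core. All values here are hypothetical:

cache = KeyOnlyCache()
inputs = [{"pixel_values": "<tensor>"}]  # stand-in for a processed image input
hashes = ["sha256:abc123"]               # stand-in for its content hash

first = cache.get_and_update(inputs, hashes)   # miss -> [full payload] is sent
second = cache.get_and_update(inputs, hashes)  # hit  -> [None]; key-only resend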