[Multimodal] Always enable hashing mm data (#23308)
Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
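Multimodal hashing is now unconditional: preprocess() no longer takes a return_mm_hashes flag derived from processor_return_mm_hashes or enable_prefix_caching, mm_hashes is read as an always-present key of the decoder inputs, and the None fallback around the multimodal input cache (along with the now-unused is_list_of import) is removed.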
@@ -17,7 +17,6 @@ from vllm.multimodal.utils import argsort_mm_positions
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup
-from vllm.utils import is_list_of
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.mm_input_cache import MultiModalInputCacheClient
 from vllm.v1.structured_output.backend_guidance import (
@@ -253,13 +252,10 @@ class Processor:
         # 1. Tokenize text prompt, with LoRA request if one exists.
         # 2. For multimodal models with a merged preprocessor, preprocess
         #    multimodal data and expand prompt token ids accordingly.
-        return_mm_hashes = (self.model_config.processor_return_mm_hashes
-                            or bool(self.cache_config.enable_prefix_caching))
         processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
             prompt,
             tokenization_kwargs=tokenization_kwargs,
             lora_request=lora_request,
-            return_mm_hashes=return_mm_hashes,
         )
         from vllm.platforms import current_platform
         current_platform.validate_request(
@@ -302,7 +298,7 @@ class Processor:
         if decoder_inputs["type"] == "multimodal":
             decoder_mm_inputs = decoder_inputs["mm_kwargs"]
             decoder_mm_positions = decoder_inputs["mm_placeholders"]
-            decoder_mm_hashes = decoder_inputs.get("mm_hashes")
+            decoder_mm_hashes = decoder_inputs["mm_hashes"]
 
             # Merge and flatten multimodal placeholders, hashes and inputs
             # from dictionaries to lists, and sort them by each item's position
@@ -317,19 +313,15 @@ class Processor:
                 decoder_mm_positions[modality][idx]
                 for modality, idx in sorted_mm_idxs
             ]
-            sorted_mm_hashes = None if decoder_mm_hashes is None else [
+            sorted_mm_hashes = [
                 decoder_mm_hashes[modality][idx]
                 for modality, idx in sorted_mm_idxs
             ]
 
-            if sorted_mm_hashes is not None:
-                sorted_mm_inputs = self.mm_input_cache_client.get_and_update(
-                    orig_sorted_mm_inputs,
-                    sorted_mm_hashes,
-                )
-            else:
-                assert is_list_of(orig_sorted_mm_inputs, MultiModalKwargsItem)
-                sorted_mm_inputs = orig_sorted_mm_inputs
+            sorted_mm_inputs = self.mm_input_cache_client.get_and_update(
+                orig_sorted_mm_inputs,
+                sorted_mm_hashes,
+            )
 
         return decoder_inputs.get("prompt"), EngineCoreRequest(
             request_id=request_id,
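For context, the last hunk relies on every multimodal item now carrying a content hash, so the input cache can be keyed unconditionally. Below is a minimal sketch of that hash-keyed caching pattern; MMInputCache, its capacity parameter, and the generic payload type are illustrative assumptions, not vLLM's actual MultiModalInputCacheClient implementation.

from collections import OrderedDict
from typing import Any

class MMInputCache:
    """Toy LRU cache keyed by per-item content hashes (illustrative only)."""

    def __init__(self, capacity: int = 256) -> None:
        self._cache: "OrderedDict[str, Any]" = OrderedDict()
        self._capacity = capacity

    def get_and_update(self, items: list[Any], hashes: list[str]) -> list[Any]:
        # One hash per item: after this change the caller always supplies both.
        assert len(items) == len(hashes)
        out = []
        for item, key in zip(items, hashes):
            if key in self._cache:
                # Cache hit: reuse the stored inputs and mark them recently used.
                self._cache.move_to_end(key)
                out.append(self._cache[key])
            else:
                # Cache miss: store the new inputs, evicting the oldest entry.
                self._cache[key] = item
                if len(self._cache) > self._capacity:
                    self._cache.popitem(last=False)
                out.append(item)
        return out

Calling get_and_update() twice with the same hashes returns the cached entries on the second call, which mirrors why the None branch could be deleted: with hashes guaranteed present, the cache path is the only path.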