[Refactor] Define MultiModalKwargsItems separate from MultiModalKwargs (#23053)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-08-18 17:52:00 +08:00
committed by GitHub
parent 5c79b0d648
commit 27e8d1ea3e
77 changed files with 431 additions and 383 deletions

View File

@@ -34,7 +34,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs)
MultiModalKwargsItems)
from vllm.multimodal.parse import ImageSize, MultiModalDataItems
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, ProcessingCache,
@@ -295,7 +295,7 @@ class HCXVisionMultiModalProcessor(
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]:
hf_config = self.info.get_hf_config()
placeholder = {
@@ -306,21 +306,22 @@ class HCXVisionMultiModalProcessor(
def get_replacement_hyperclovax(
    item_idx: int,
    modality: str,
    out_mm_kwargs: MultiModalKwargsItems,
):
    """Build the placeholder token list for one multimodal item.

    Args:
        item_idx: Position of the item within its modality.
        modality: Either ``"image"`` or ``"video"``.
        out_mm_kwargs: Processed multimodal kwargs, indexed first by
            modality and then by item position.

    Returns:
        A list holding ``placeholder[modality]`` repeated once per token
        reported by ``self.info`` for this item.

    Raises:
        NotImplementedError: If ``modality`` is neither ``"image"`` nor
            ``"video"``.
    """
    # ``self`` and ``placeholder`` are captured from the enclosing method.
    # Per-item lookup: select the modality first, then the item, instead of
    # indexing a flat kwargs mapping by field name.
    out_item = out_mm_kwargs[modality][item_idx]

    if modality == "image":
        # The field value is read through ``.data``.
        lens = out_item["vision_query_lengths_images"].data
        num_tokens = self.info.get_num_image_tokens(
            vision_query_length=lens)
    elif modality == "video":
        lens = out_item["vision_query_lengths_videos"].data
        num_tokens = self.info.get_num_video_tokens(
            vision_query_length=lens)
    else:
        # Fail loudly on an unknown modality rather than carrying a
        # ``None`` token count forward.
        raise NotImplementedError(modality)

    return [placeholder[modality]] * num_tokens
return [
PromptReplacement(