[Chore] Deprecate merge_by_field_config arg (#30035)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -27,7 +27,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import (
|
||||
MultiModalDataDict,
|
||||
MultiModalFieldConfig,
|
||||
-MultiModalKwargs,
|
||||
+MultiModalKwargsItems,
|
||||
NestedTensors,
|
||||
)
|
||||
from vllm.multimodal.parse import (
|
||||
@@ -305,7 +305,7 @@ class DeepseekOCRMultiModalProcessor(
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
-out_mm_kwargs: MultiModalKwargs,
|
||||
+out_mm_kwargs: MultiModalKwargsItems,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
|
||||
|
||||
|
||||
@@ -78,7 +78,7 @@ class SupportsMultiModal(Protocol):
|
||||
`multimodal_config.mm_encoder_tp_mode="data"`.
|
||||
"""
|
||||
|
||||
-merge_by_field_config: ClassVar[bool] = False
|
||||
+merge_by_field_config: ClassVar[bool] = True
|
||||
"""
|
||||
A flag that indicates which implementation of
|
||||
`vllm.multimodal.utils.group_mm_kwargs_by_modality` to use.
|
||||
|
||||
@@ -28,7 +28,7 @@ from vllm.model_executor.models.utils import (
|
||||
)
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.cache import BaseMultiModalProcessorCache
|
||||
-from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
|
||||
+from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems
|
||||
from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems
|
||||
from vllm.multimodal.processing import (
|
||||
BaseMultiModalProcessor,
|
||||
@@ -103,7 +103,7 @@ class LightOnOCRMultiModalProcessor(BaseMultiModalProcessor[Mistral3ProcessingIn
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
-out_mm_kwargs: MultiModalKwargs,
|
||||
+out_mm_kwargs: MultiModalKwargsItems,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
hf_config = self.info.get_hf_config()
|
||||
image_token_id = hf_config.image_token_index
|
||||
|
||||
@@ -52,7 +52,6 @@ from vllm.multimodal.evs import (
|
||||
from vllm.multimodal.inputs import (
|
||||
MultiModalDataDict,
|
||||
MultiModalFieldConfig,
|
||||
-MultiModalKwargs,
|
||||
MultiModalKwargsItems,
|
||||
VideoItem,
|
||||
)
|
||||
@@ -849,17 +848,18 @@ class NanoNemotronBaseVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
-out_mm_kwargs: MultiModalKwargs,
|
||||
+out_mm_kwargs: MultiModalKwargsItems,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
|
||||
|
||||
-if "image_num_patches" in out_mm_kwargs:
|
||||
-image_num_patches = out_mm_kwargs["image_num_patches"]
|
||||
+out_mm_data = out_mm_kwargs.get_data()
|
||||
+if "image_num_patches" in out_mm_data:
|
||||
+image_num_patches = out_mm_data["image_num_patches"]
|
||||
assert isinstance(image_num_patches, torch.Tensor)
|
||||
image_num_patches = image_num_patches.tolist()
|
||||
-elif "image_embeds" in out_mm_kwargs:
|
||||
+elif "image_embeds" in out_mm_data:
|
||||
# to compute num_patches (similar to Qwen2-VL)
|
||||
-image_num_patches = [None] * len(out_mm_kwargs["image_embeds"])
|
||||
+image_num_patches = [None] * len(out_mm_data["image_embeds"])
|
||||
else:
|
||||
image_num_patches = []
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ from vllm.config import VllmConfig
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import (
|
||||
MultiModalFieldConfig,
|
||||
-MultiModalKwargs,
|
||||
+MultiModalKwargsItems,
|
||||
)
|
||||
from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser
|
||||
from vllm.multimodal.processing import (
|
||||
@@ -153,7 +153,7 @@ class OpenCUAMultiModalProcessor(BaseMultiModalProcessor[OpenCUAProcessingInfo])
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, Any],
|
||||
-out_mm_kwargs: MultiModalKwargs,
|
||||
+out_mm_kwargs: MultiModalKwargsItems,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
|
||||
image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
|
||||
|
||||
@@ -62,7 +62,7 @@ from vllm.multimodal.inputs import (
|
||||
MultiModalDataDict,
|
||||
MultiModalFeatureSpec,
|
||||
MultiModalFieldConfig,
|
||||
-MultiModalKwargs,
|
||||
+MultiModalKwargsItems,
|
||||
)
|
||||
from vllm.multimodal.parse import (
|
||||
ImageProcessorItems,
|
||||
@@ -307,7 +307,7 @@ class PaddleOCRVLMultiModalProcessor(
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
-out_mm_kwargs: MultiModalKwargs,
|
||||
+out_mm_kwargs: MultiModalKwargsItems,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
|
||||
hf_config = self.info.get_hf_config()
|
||||
|
||||
@@ -40,7 +40,6 @@ from .siglip import SiglipVisionModel
|
||||
from .utils import (
|
||||
AutoWeightsLoader,
|
||||
WeightsMapper,
|
||||
-flatten_bn,
|
||||
init_vllm_registered_model,
|
||||
maybe_prefix,
|
||||
)
|
||||
@@ -252,6 +251,8 @@ class PaliGemmaMultiModalProcessor(BaseMultiModalProcessor[PaliGemmaProcessingIn
|
||||
dummy_inputs=PaliGemmaDummyInputsBuilder,
|
||||
)
|
||||
class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
+merge_by_field_config = True
|
||||
|
||||
packed_modules_mapping = {
|
||||
"qkv_proj": [
|
||||
"q_proj",
|
||||
@@ -327,9 +328,8 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
|
||||
return None
|
||||
|
||||
if pixel_values is not None:
|
||||
-pixel_values = flatten_bn(pixel_values, concat=True)
|
||||
|
||||
h = w = self.config.vision_config.image_size
|
||||
|
||||
return PaliGemmaImagePixelInputs(
|
||||
type="pixel_values",
|
||||
data=pixel_values,
|
||||
@@ -337,8 +337,6 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
|
||||
)
|
||||
|
||||
if image_embeds is not None:
|
||||
-image_embeds = flatten_bn(image_embeds, concat=True)
|
||||
|
||||
return PaliGemmaImageEmbeddingInputs(
|
||||
type="image_embeds",
|
||||
data=image_embeds,
|
||||
|
||||
@@ -77,7 +77,7 @@ from vllm.multimodal.evs import (
|
||||
from vllm.multimodal.inputs import (
|
||||
MultiModalFeatureSpec,
|
||||
MultiModalFieldConfig,
|
||||
-MultiModalKwargs,
|
||||
+MultiModalKwargsItems,
|
||||
)
|
||||
from vllm.multimodal.parse import MultiModalDataItems
|
||||
from vllm.multimodal.processing import PromptReplacement, PromptUpdate
|
||||
@@ -973,7 +973,7 @@ class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor):
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, Any],
|
||||
-out_mm_kwargs: MultiModalKwargs,
|
||||
+out_mm_kwargs: MultiModalKwargsItems,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
|
||||
image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
|
||||
|
||||
Reference in New Issue
Block a user