[Model] Use merge_by_field_config for MM models (Llava family) (#26280)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-10-06 17:45:26 +08:00
committed by GitHub
parent 391612e78b
commit 19a00eb210
9 changed files with 155 additions and 229 deletions

View File

@@ -64,7 +64,7 @@ from vllm.transformers_utils.tokenizer import (
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
-from .utils import flatten_bn, init_vllm_registered_model, maybe_prefix
+from .utils import init_vllm_registered_model, maybe_prefix
from .vision import (
VisionEncoderInfo,
VisionFeatureSelectStrategy,
@@ -365,6 +365,8 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo])
dummy_inputs=PixtralDummyInputsBuilder,
)
class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
+    merge_by_field_config = True
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
@@ -424,7 +426,7 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
return PixtralImagePixelInputs(
type="pixel_values",
-            images=flatten_bn(images),
+            images=images,
)
def _process_image_input(