[Model] Use merge_by_field_config for MM models (G) (#26117)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-03 13:38:29 +08:00
parent 711f485643
commit 39b643dc1a
5 changed files with 56 additions and 108 deletions
--- a/vllm/model_executor/models/glm4v.py
+++ b/vllm/model_executor/models/glm4v.py
@@ -43,7 +43,6 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape
 from .chatglm import ChatGLMBaseModel, ChatGLMModel
 from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
                         SupportsMultiModal, SupportsPP)
-from .utils import flatten_bn


 class GLMVImagePixelInputs(TensorSchema):
@@ -529,8 +528,9 @@ class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]):
@MULTIMODAL_REGISTRY.register_processor(GLM4VMultiModalProcessor,
                                        info=GLM4VProcessingInfo,
                                        dummy_inputs=GLM4VDummyInputsBuilder)
-class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
-                       SupportsMultiModal):
+class GLM4VForCausalLM(ChatGLMBaseModel, SupportsMultiModal, SupportsLoRA,
+                       SupportsPP):
+    merge_by_field_config = True

    packed_modules_mapping = {
        "query_key_value": ["query_key_value"],
@@ -574,14 +574,9 @@ class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
        pixel_values = kwargs.pop("pixel_values", None)

        if pixel_values is not None:
-            if not isinstance(pixel_values, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values)}")
-
            expected_h = expected_w = self.config.vision_config["image_size"]
            return GLMVImagePixelInputs(type="pixel_values",
-                                        data=flatten_bn(pixel_values,
-                                                        concat=True),
+                                        data=pixel_values,
                                        resolve_bindings={
                                            "h": expected_h,
                                            "w": expected_w
@@ -598,6 +593,8 @@ class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
    def get_language_model(self) -> torch.nn.Module:
        return self.transformer

+    get_input_embeddings = SupportsMultiModal.get_input_embeddings
+
    def get_multimodal_embeddings(self,
                                  **kwargs: object) -> MultiModalEmbeddings:
        image_input = self._parse_and_validate_image_input(**kwargs)