[Core] Automatically cast multi-modal input dtype (#18756)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-27 23:45:48 +08:00
parent 6b6d496114
commit 696259ca01
16 changed files with 91 additions and 44 deletions
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -210,9 +210,7 @@ class DeepseekVL2MultiModalProcessor(
                dict(prompt=prompt, **mm_data),
                mm_kwargs,
            )
-            target_dtype = self.info.ctx.model_config.dtype
-            pixel_values = processed_outputs.pop("pixel_values").to(
-                target_dtype)
+            pixel_values = processed_outputs["pixel_values"]
            # split pixel values into patches corresponding to each image
            images_spatial_crop = processed_outputs["images_spatial_crop"]
            patches_per_image = [
--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@@ -263,11 +263,6 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
            mm_data,
            mm_kwargs,
        )
-        if "pixel_values" in processed_outputs:
-            # Cast pixel values to model dtype already here,
-            # so we need to transfer less data to the GPU
-            processed_outputs["pixel_values"] = processed_outputs[
-                "pixel_values"].to(self.info.ctx.model_config.dtype)

        # HF processor pops the `num_crops` kwarg, which is needed by vLLM
        if (images := mm_data.get("images")) is not None: