[Core] Automatically cast multi-modal input dtype (#18756)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-05-27 23:45:48 +08:00
committed by GitHub
parent 6b6d496114
commit 696259ca01
16 changed files with 91 additions and 44 deletions

View File

@@ -263,11 +263,6 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
mm_data,
mm_kwargs,
)
if "pixel_values" in processed_outputs:
# Cast pixel values to model dtype already here,
# so we need to transfer less data to the GPU
processed_outputs["pixel_values"] = processed_outputs[
"pixel_values"].to(self.info.ctx.model_config.dtype)
# HF processor pops the `num_crops` kwarg, which is needed by vLLM
if (images := mm_data.get("images")) is not None: