[Core] Automatically cast multi-modal input dtype (#18756)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-05-27 23:45:48 +08:00
committed by GitHub
parent 6b6d496114
commit 696259ca01
16 changed files with 91 additions and 44 deletions

View File

@@ -210,9 +210,7 @@ class DeepseekVL2MultiModalProcessor(
dict(prompt=prompt, **mm_data),
mm_kwargs,
)
-target_dtype = self.info.ctx.model_config.dtype
-pixel_values = processed_outputs.pop("pixel_values").to(
-target_dtype)
+pixel_values = processed_outputs["pixel_values"]
# split pixel values into patches corresponding to each image
images_spatial_crop = processed_outputs["images_spatial_crop"]
patches_per_image = [