[Bugfix] Fix auto dtype casting for BatchFeature (#19316)

Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
Isotr0py
2025-06-14 23:13:08 +08:00
committed by GitHub
parent 6fa718a460
commit 2db9044ab6
7 changed files with 85 additions and 57 deletions

View File

@@ -965,9 +965,9 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
grid_thw_list = grid_thw.tolist()
if image_input["type"] == "image_embeds":
image_embeds = image_input["image_embeds"].type(self.visual.dtype)
image_embeds = image_input["image_embeds"]
else:
pixel_values = image_input["pixel_values"].type(self.visual.dtype)
pixel_values = image_input["pixel_values"]
image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list)
# Split concatenated embeddings for each image item.
@@ -985,10 +985,9 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
grid_thw_list = grid_thw.tolist()
if video_input["type"] == "video_embeds":
video_embeds = video_input["video_embeds"].type(self.visual.dtype)
video_embeds = video_input["video_embeds"]
else:
pixel_values_videos = video_input["pixel_values_videos"].type(
self.visual.dtype)
pixel_values_videos = video_input["pixel_values_videos"]
video_embeds = self.visual(pixel_values_videos,
grid_thw=grid_thw_list)