[Model] Move multimodal_cpu_fields definition to field config (#30181)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-12-06 21:40:02 +08:00
committed by GitHub
parent 21bb323542
commit 671427efbf
15 changed files with 141 additions and 95 deletions

View File

@@ -811,14 +811,14 @@ def _create_qwen2vl_field_factory(
image_embeds=MultiModalFieldConfig.flat_from_sizes(
"image", image_embed_grid_sizes
),
image_grid_thw=MultiModalFieldConfig.batched("image"),
image_grid_thw=MultiModalFieldConfig.batched("image", keep_on_cpu=True),
pixel_values_videos=MultiModalFieldConfig.flat_from_sizes(
"video", video_grid_sizes
),
video_embeds=MultiModalFieldConfig.flat_from_sizes(
"video", video_embed_grid_sizes
),
video_grid_thw=MultiModalFieldConfig.batched("video"),
video_grid_thw=MultiModalFieldConfig.batched("video", keep_on_cpu=True),
)
return _qwen2vl_field_config
@@ -1131,8 +1131,6 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo])
class Qwen2VLForConditionalGeneration(
nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
):
multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"}
# To ensure correct weight loading and mapping.
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={
@@ -1393,9 +1391,11 @@ class Qwen2VLForConditionalGeneration(
else:
pixel_values_videos = video_input["pixel_values_videos"]
if self.use_data_parallel:
grid_thw_list = grid_thw.tolist()
return run_dp_sharded_mrope_vision_model(
self.visual, pixel_values_videos, grid_thw_list, rope_type="rope_3d"
self.visual,
pixel_values_videos,
grid_thw.tolist(),
rope_type="rope_3d",
)
else:
video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw)