[Refactor] Use data parser for matching data items to multi-modal UUIDs (#32955)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -113,7 +113,11 @@ from .qwen2_5_vl import (
     Qwen2_5_VLVideoInputs,
     Qwen2_5_VLVideoPixelInputs,
 )
-from .qwen2_vl import Qwen2VLMultiModalDataParser, Qwen2VLProcessingInfo
+from .qwen2_vl import (
+    Qwen2VLMultiModalDataParser,
+    Qwen2VLProcessingInfo,
+    _create_qwen2vl_field_factory,
+)
 from .qwen3 import Qwen3ForCausalLM, Qwen3Model
 from .utils import (
     AutoWeightsLoader,
@@ -985,28 +989,9 @@ class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo])
         hf_inputs: BatchFeature,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
-        image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3)))
-        image_grid_sizes = image_grid_thw.prod(-1)
-
-        video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3)))
-        video_grid_sizes = video_grid_thw.prod(-1)
-
-        return dict(
-            pixel_values=MultiModalFieldConfig.flat_from_sizes(
-                "image", image_grid_sizes
-            ),
-            image_embeds=MultiModalFieldConfig.flat_from_sizes(
-                "image", image_grid_sizes
-            ),
-            image_grid_thw=MultiModalFieldConfig.batched("image", keep_on_cpu=True),
-            pixel_values_videos=MultiModalFieldConfig.flat_from_sizes(
-                "video", video_grid_sizes
-            ),
-            video_embeds=MultiModalFieldConfig.flat_from_sizes(
-                "video", video_grid_sizes
-            ),
-            video_grid_thw=MultiModalFieldConfig.batched("video", keep_on_cpu=True),
-        )
+        return _create_qwen2vl_field_factory(
+            self.info.get_hf_config().vision_config.spatial_merge_size
+        )(hf_inputs)
 
     def _get_prompt_updates(
         self,
|
||||
Reference in New Issue
Block a user