[Bugfix] Fix more multimodal tests for transformers V5 (#34334)

Signed-off-by: raushan <raushan@huggingface.co>
This commit is contained in:
Raushan Turganbay
2026-02-11 22:02:05 +01:00
committed by GitHub
parent 5458eb835d
commit 527ca32197
5 changed files with 18 additions and 11 deletions

View File

@@ -108,6 +108,7 @@ _ADD_SPECIAL_TOKENS_OVERRIDES = {
"paligemma": False,
"ultravox": False,
"whisper": False,
"lfm2_vl": False,
}
_IGNORE_MM_KEYS = {

View File

@@ -810,9 +810,9 @@ class GlmAsrMultiModalProcessor(BaseMultiModalProcessor["GlmAsrProcessingInfo"])
# Postprocess: rename mask and add chunk counts
# Handle different key names from different transformers versions
if "input_feature_mask" in outputs:
outputs["feature_attention_mask"] = outputs.pop("input_feature_mask")
elif "feature_attention_mask" not in outputs and "input_features" in outputs:
if "input_features_mask" in outputs:
outputs["feature_attention_mask"] = outputs.pop("input_features_mask")
elif "input_features_mask" not in outputs and "input_features" in outputs:
# If no mask is provided, create one from input_features
input_features = outputs["input_features"]
if isinstance(input_features, torch.Tensor):

View File

@@ -18,8 +18,8 @@ def _calculate_conv_output_length(
input_length: torch.Tensor, padding: int, kernel_size: int, stride: int
) -> torch.Tensor:
"""Calculate Conv1d output length using standard formula."""
# Standard formula: floor((input + 2*padding - kernel_size) / stride) + 1
return (input_length + 2 * padding - kernel_size) // stride + 1
# in sync with `hf_processor._get_audio_token_length`
return (input_length + 2 * padding - (kernel_size - 1) - 1) // stride + 1
def _as_list_chunk_counts(

View File

@@ -347,7 +347,9 @@ class Lfm2VLMultiModalProcessor(BaseMultiModalProcessor[Lfm2VLProcessingInfo]):
) -> BatchFeature:
# Text-only input not supported in composite processor
if not (images := mm_data.get("images", [])):
prompt_ids = self.info.get_tokenizer().encode(prompt)
prompt_ids = self.info.get_tokenizer().encode(
prompt, add_special_tokens=False
)
prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")

View File

@@ -1467,15 +1467,15 @@ class Tarsier2ImageProcessor(Qwen2VLImageProcessor):
class Tarsier2Processor(Qwen2VLProcessor):
def __init__(
    self,
    image_processor: Tarsier2ImageProcessor,
    tokenizer: TokenizerLike,
    video_processor: Qwen2VLVideoProcessor,
    **kwargs,
):
    """Initialize the Tarsier2 processor from pre-built sub-processors.

    The image and video processors are injected by the caller (see
    ``Tarsier2ProcessingInfo.get_hf_processor``) rather than constructed
    here from a raw ``vision_config`` dict, matching the transformers V5
    processor convention.

    Args:
        image_processor: Pre-constructed Tarsier2 image processor.
        tokenizer: Tokenizer used for text inputs.
        video_processor: Pre-constructed Qwen2-VL video processor.
        **kwargs: Forwarded to the ``Qwen2VLProcessor`` base initializer.
    """
    # chat_template is deliberately disabled; prompts are rendered upstream.
    super().__init__(
        image_processor=image_processor,
        tokenizer=tokenizer,
        video_processor=video_processor,
        chat_template=None,
        **kwargs,
    )
@@ -1489,8 +1489,12 @@ class Tarsier2ProcessingInfo(Qwen2VLProcessingInfo):
return correct_config
def get_hf_processor(self, **kwargs: object) -> Tarsier2Processor:
    """Build a ``Tarsier2Processor`` from the model's image-processor config.

    Constructs the image and video sub-processors once from the HF image
    processor config and injects them into the composite processor, as
    required by the transformers V5 ``Tarsier2Processor`` signature.

    Args:
        **kwargs: Extra keyword arguments forwarded to the processor.

    Returns:
        A fully initialized ``Tarsier2Processor``.
    """
    vision_config = self.ctx.get_hf_image_processor_config()
    image_processor = Tarsier2ImageProcessor(**vision_config)
    # Both sub-processors share the same vision config dict.
    video_processor = Qwen2VLVideoProcessor(**vision_config)
    return Tarsier2Processor(
        image_processor=image_processor,
        video_processor=video_processor,
        tokenizer=self.get_tokenizer(),
        **kwargs,
    )