[Bugfix] Fix more multimodal tests for transformers V5 (#34334)
Signed-off-by: raushan <raushan@huggingface.co>
This commit is contained in:
committed by GitHub
parent 5458eb835d
commit 527ca32197
@@ -108,6 +108,7 @@ _ADD_SPECIAL_TOKENS_OVERRIDES = {
|
||||
"paligemma": False,
|
||||
"ultravox": False,
|
||||
"whisper": False,
|
||||
"lfm2_vl": False,
|
||||
}
|
||||
|
||||
_IGNORE_MM_KEYS = {
|
||||
|
||||
@@ -810,9 +810,9 @@ class GlmAsrMultiModalProcessor(BaseMultiModalProcessor["GlmAsrProcessingInfo"])
|
||||
|
||||
# Postprocess: rename mask and add chunk counts
|
||||
# Handle different key names from different transformers versions
|
||||
if "input_feature_mask" in outputs:
|
||||
outputs["feature_attention_mask"] = outputs.pop("input_feature_mask")
|
||||
elif "feature_attention_mask" not in outputs and "input_features" in outputs:
|
||||
if "input_features_mask" in outputs:
|
||||
outputs["feature_attention_mask"] = outputs.pop("input_features_mask")
|
||||
elif "input_features_mask" not in outputs and "input_features" in outputs:
|
||||
# If no mask is provided, create one from input_features
|
||||
input_features = outputs["input_features"]
|
||||
if isinstance(input_features, torch.Tensor):
|
||||
|
||||
@@ -18,8 +18,8 @@ def _calculate_conv_output_length(
|
||||
input_length: torch.Tensor, padding: int, kernel_size: int, stride: int
|
||||
) -> torch.Tensor:
|
||||
"""Calculate Conv1d output length using standard formula."""
|
||||
# Standard formula: floor((input + 2*padding - kernel_size) / stride) + 1
|
||||
return (input_length + 2 * padding - kernel_size) // stride + 1
|
||||
# in sync with `hf_processor._get_audio_token_length`
|
||||
return (input_length + 2 * padding - (kernel_size - 1) - 1) // stride + 1
|
||||
|
||||
|
||||
def _as_list_chunk_counts(
|
||||
|
||||
@@ -347,7 +347,9 @@ class Lfm2VLMultiModalProcessor(BaseMultiModalProcessor[Lfm2VLProcessingInfo]):
|
||||
) -> BatchFeature:
|
||||
# Text-only input not supported in composite processor
|
||||
if not (images := mm_data.get("images", [])):
|
||||
prompt_ids = self.info.get_tokenizer().encode(prompt)
|
||||
prompt_ids = self.info.get_tokenizer().encode(
|
||||
prompt, add_special_tokens=False
|
||||
)
|
||||
prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
|
||||
return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
|
||||
|
||||
|
||||
@@ -1467,15 +1467,15 @@ class Tarsier2ImageProcessor(Qwen2VLImageProcessor):
|
||||
class Tarsier2Processor(Qwen2VLProcessor):
|
||||
def __init__(
|
||||
self,
|
||||
vision_config: dict,
|
||||
image_processor: Tarsier2ImageProcessor,
|
||||
tokenizer: TokenizerLike,
|
||||
video_processor: Qwen2VLVideoProcessor,
|
||||
**kwargs,
|
||||
):
|
||||
self.image_processor = Tarsier2ImageProcessor(**vision_config)
|
||||
super().__init__(
|
||||
image_processor=self.image_processor,
|
||||
image_processor=image_processor,
|
||||
tokenizer=tokenizer,
|
||||
video_processor=Qwen2VLVideoProcessor(**vision_config),
|
||||
video_processor=video_processor,
|
||||
chat_template=None,
|
||||
**kwargs,
|
||||
)
|
||||
@@ -1489,8 +1489,12 @@ class Tarsier2ProcessingInfo(Qwen2VLProcessingInfo):
|
||||
return correct_config
|
||||
|
||||
def get_hf_processor(self, **kwargs: object) -> Tarsier2Processor:
|
||||
vision_config = self.ctx.get_hf_image_processor_config()
|
||||
image_processor = Tarsier2ImageProcessor(**vision_config)
|
||||
video_processor = Qwen2VLVideoProcessor(**vision_config)
|
||||
return Tarsier2Processor(
|
||||
vision_config=self.ctx.get_hf_image_processor_config(),
|
||||
image_processor=image_processor,
|
||||
video_processor=video_processor,
|
||||
tokenizer=self.get_tokenizer(),
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user