diff --git a/vllm/model_executor/models/bagel.py b/vllm/model_executor/models/bagel.py index cf45fb9fe..08bb13e95 100644 --- a/vllm/model_executor/models/bagel.py +++ b/vllm/model_executor/models/bagel.py @@ -346,6 +346,13 @@ class BagelForConditionalGeneration( } ) + @classmethod + def get_placeholder_str(cls, modality: str, i: int) -> str | None: + if modality.startswith("image"): + return "<|image_pad|>" + + raise ValueError("Only image modality is supported") + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/transformers_utils/processors/bagel.py b/vllm/transformers_utils/processors/bagel.py index 850e64f2f..7f7a0fd9e 100644 --- a/vllm/transformers_utils/processors/bagel.py +++ b/vllm/transformers_utils/processors/bagel.py @@ -4,6 +4,7 @@ """BAGEL processor for image and text inputs.""" from transformers import AutoProcessor +from transformers.feature_extraction_utils import BatchFeature from transformers.image_utils import ImageInput from transformers.processing_utils import ProcessorMixin from transformers.tokenization_utils_base import PreTokenizedInput, TextInput @@ -44,12 +45,16 @@ class BagelProcessor(ProcessorMixin): text_inputs = self.tokenizer(text, **kwargs) if text is not None else None if pixel_values is not None and text_inputs is not None: - text_inputs["pixel_values"] = pixel_values["pixel_values"] - return text_inputs + # Combine text and image inputs into BatchFeature + combined = dict(text_inputs) + combined["pixel_values"] = pixel_values["pixel_values"] + return BatchFeature(combined) elif pixel_values is not None: return pixel_values + elif text_inputs is not None: + return BatchFeature(dict(text_inputs)) else: - return text_inputs + return BatchFeature({}) def batch_decode(self, *args, **kwargs): """