From d8da76f3b7e48d8b8a5274e78e8190c9d0671175 Mon Sep 17 00:00:00 2001 From: Fanjiang Ye <96292363+Dylan1229@users.noreply.github.com> Date: Wed, 31 Dec 2025 16:46:10 -0600 Subject: [PATCH] [Bugfix] Fix BAGEL online serving for text and image understanding (#31546) Signed-off-by: Dylan1229 Signed-off-by: UED Signed-off-by: mr-ye-cao Co-authored-by: UED Co-authored-by: mr-ye-cao Co-authored-by: Mr-Ye-Cao <60802056+Mr-Ye-Cao@users.noreply.github.com> Co-authored-by: Cyrus Leung --- vllm/model_executor/models/bagel.py | 7 +++++++ vllm/transformers_utils/processors/bagel.py | 11 ++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/bagel.py b/vllm/model_executor/models/bagel.py index cf45fb9fe..08bb13e95 100644 --- a/vllm/model_executor/models/bagel.py +++ b/vllm/model_executor/models/bagel.py @@ -346,6 +346,13 @@ class BagelForConditionalGeneration( } ) + @classmethod + def get_placeholder_str(cls, modality: str, i: int) -> str | None: + if modality.startswith("image"): + return "<|image_pad|>" + + raise ValueError("Only image modality is supported") + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/transformers_utils/processors/bagel.py b/vllm/transformers_utils/processors/bagel.py index 850e64f2f..7f7a0fd9e 100644 --- a/vllm/transformers_utils/processors/bagel.py +++ b/vllm/transformers_utils/processors/bagel.py @@ -4,6 +4,7 @@ """BAGEL processor for image and text inputs.""" from transformers import AutoProcessor +from transformers.feature_extraction_utils import BatchFeature from transformers.image_utils import ImageInput from transformers.processing_utils import ProcessorMixin from transformers.tokenization_utils_base import PreTokenizedInput, TextInput @@ -44,12 +45,16 @@ class BagelProcessor(ProcessorMixin): text_inputs = self.tokenizer(text, **kwargs) if text is not None else None if pixel_values is not None and text_inputs is not None: - text_inputs["pixel_values"] = pixel_values["pixel_values"] - return text_inputs + # Combine text and image inputs into BatchFeature + combined = dict(text_inputs) + combined["pixel_values"] = pixel_values["pixel_values"] + return BatchFeature(combined) elif pixel_values is not None: return pixel_values + elif text_inputs is not None: + return BatchFeature(dict(text_inputs)) else: - return text_inputs + return BatchFeature({}) def batch_decode(self, *args, **kwargs): """