[Core] Move multimodal placeholder from chat utils to model definition (#20355)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-07-03 16:18:30 +08:00
committed by GitHub
parent cb97f2bfc5
commit b024a42e93
54 changed files with 396 additions and 155 deletions

View File

@@ -835,6 +835,15 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
"model.": "language_model.model.",
})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<|vision_start|><|image_pad|><|vision_end|>"
if modality.startswith("video"):
return "<|vision_start|><|video_pad|><|vision_end|>"
raise ValueError("Only image or video modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config