diff --git a/tests/entrypoints/pooling/scoring/test_cross_encoder_online_vision.py b/tests/entrypoints/pooling/scoring/test_cross_encoder_online_vision.py index f62ffb2bf..d773802a9 100644 --- a/tests/entrypoints/pooling/scoring/test_cross_encoder_online_vision.py +++ b/tests/entrypoints/pooling/scoring/test_cross_encoder_online_vision.py @@ -234,7 +234,7 @@ async def test_score_api_queries_str_documents_image_url_plus_text_content( assert score.id is not None assert score.data is not None assert len(score.data) == 1 - assert score.usage.prompt_tokens == 108 + assert score.usage.prompt_tokens == 107 assert_score( score.data[0].score, TEXT_VS_TEXT_PLUS_IMAGE, backend, "text_vs_text_plus_image" ) @@ -264,7 +264,7 @@ async def test_score_api_queries_str_documents_list( assert score.id is not None assert score.data is not None assert len(score.data) == 4 - assert score.usage.prompt_tokens == 368 + assert score.usage.prompt_tokens == 367 assert_score(score.data[0].score, TEXT_VS_TEXT, backend, "list[0]_text_vs_text") assert_score(score.data[1].score, TEXT_VS_TEXT, backend, "list[1]_text_vs_text") assert_score(score.data[2].score, TEXT_VS_IMAGE, backend, "list[2]_text_vs_image") @@ -353,7 +353,7 @@ async def test_score_api_queries_list_documents_list( assert score.id is not None assert score.data is not None assert len(score.data) == 4 - assert score.usage.prompt_tokens == 368 + assert score.usage.prompt_tokens == 367 assert_score(score.data[0].score, TEXT_VS_TEXT, backend, "paired[0]_text_vs_text") assert_score(score.data[1].score, TEXT_VS_TEXT, backend, "paired[1]_text_vs_text") assert_score(score.data[2].score, TEXT_VS_IMAGE, backend, "paired[2]_text_vs_image") diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 51e62042f..c1324d2f0 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1187,6 +1187,7 @@ def _get_full_multimodal_text_prompt( placeholder_storage: dict[str, list], texts: list[str], interleave_strings: bool, + multimodal_content_part_separator: str = "\n", ) -> str: """Combine multimodal prompts for a multimodal language model.""" @@ -1232,9 +1233,11 @@ def _get_full_multimodal_text_prompt( # NOTE: Default behaviour: we always add missing placeholders # at the front of the prompt, if interleave_strings=False if text_prompt: - return "\n".join(missing_placeholders + [text_prompt]) + return multimodal_content_part_separator.join( + missing_placeholders + [text_prompt] + ) else: - return "\n".join(missing_placeholders) + return multimodal_content_part_separator.join(missing_placeholders) # No need to validate using Pydantic again @@ -1384,6 +1387,7 @@ def _parse_chat_message_content_parts( wrap_dicts: bool, interleave_strings: bool, mm_processor_kwargs: dict[str, Any] | None = None, + multimodal_content_part_separator="\n", ) -> list[ConversationMessage]: content = list[_ContentPart]() @@ -1406,7 +1410,10 @@ def _parse_chat_message_content_parts( mm_placeholder_storage = mm_parser.mm_placeholder_storage() if mm_placeholder_storage: text_prompt = _get_full_multimodal_text_prompt( - mm_placeholder_storage, texts, interleave_strings + mm_placeholder_storage, + texts, + interleave_strings, + multimodal_content_part_separator=multimodal_content_part_separator, ) else: text_prompt = "\n".join(texts) diff --git a/vllm/entrypoints/pooling/scoring/utils.py b/vllm/entrypoints/pooling/scoring/utils.py index 7330e620f..812a75ab8 100644 --- a/vllm/entrypoints/pooling/scoring/utils.py +++ b/vllm/entrypoints/pooling/scoring/utils.py @@ -150,6 +150,7 @@ def _parse_score_content( mm_tracker=mm_tracker, wrap_dicts=False, interleave_strings=False, + multimodal_content_part_separator="", ) if parse_res: