[bugfix] do not add extra linebreak for score/rerank with chat template (#38617)

Signed-off-by: augusto.yjh <augusto.yjh@antgroup.com>
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: wang.yuqi <yuqi.wang@daocloud.io>
Co-authored-by: wang.yuqi <noooop@126.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>

[bugfix] do not add extra linebreak for score/rerank with chat template (#38617)
Signed-off-by: augusto.yjh <augusto.yjh@antgroup.com>
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: wang.yuqi <yuqi.wang@daocloud.io>
Co-authored-by: wang.yuqi <noooop@126.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
ef53395e · Augusto Yao · GitHub · eb474549 · ef53395e · ef53395e
Unverified Commit ef53395e authored Apr 01, 2026 by Augusto Yao Committed by GitHub Apr 01, 2026
3 changed files
--- a/tests/entrypoints/pooling/scoring/test_cross_encoder_online_vision.py
+++ b/tests/entrypoints/pooling/scoring/test_cross_encoder_online_vision.py
@@ -234,7 +234,7 @@ async def test_score_api_queries_str_documents_image_url_plus_text_content(
    assert score.id is not None
    assert score.data is not None
    assert len(score.data) == 1
-    assert score.usage.prompt_tokens == 108
+    assert score.usage.prompt_tokens == 107
    assert_score(
        score.data[0].score, TEXT_VS_TEXT_PLUS_IMAGE, backend, "text_vs_text_plus_image"
    )
@@ -264,7 +264,7 @@ async def test_score_api_queries_str_documents_list(
    assert score.id is not None
    assert score.data is not None
    assert len(score.data) == 4
-    assert score.usage.prompt_tokens == 368
+    assert score.usage.prompt_tokens == 367
    assert_score(score.data[0].score, TEXT_VS_TEXT, backend, "list[0]_text_vs_text")
    assert_score(score.data[1].score, TEXT_VS_TEXT, backend, "list[1]_text_vs_text")
    assert_score(score.data[2].score, TEXT_VS_IMAGE, backend, "list[2]_text_vs_image")
@@ -353,7 +353,7 @@ async def test_score_api_queries_list_documents_list(
    assert score.id is not None
    assert score.data is not None
    assert len(score.data) == 4
-    assert score.usage.prompt_tokens == 368
+    assert score.usage.prompt_tokens == 367
    assert_score(score.data[0].score, TEXT_VS_TEXT, backend, "paired[0]_text_vs_text")
    assert_score(score.data[1].score, TEXT_VS_TEXT, backend, "paired[1]_text_vs_text")
    assert_score(score.data[2].score, TEXT_VS_IMAGE, backend, "paired[2]_text_vs_image")

--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -1187,6 +1187,7 @@ def _get_full_multimodal_text_prompt(
    placeholder_storage: dict[str, list],
    texts: list[str],
    interleave_strings: bool,
+    multimodal_content_part_separator: str = "\n",
 ) -> str:
    """Combine multimodal prompts for a multimodal language model."""
@@ -1232,9 +1233,11 @@ def _get_full_multimodal_text_prompt(
    # NOTE: Default behaviour: we always add missing placeholders
    # at the front of the prompt, if interleave_strings=False
    if text_prompt:
-        return "\n".join(missing_placeholders + [text_prompt])
+        return multimodal_content_part_separator.join(
+            missing_placeholders + [text_prompt]
+        )
    else:
-        return "\n".join(missing_placeholders)
+        return multimodal_content_part_separator.join(missing_placeholders)
 # No need to validate using Pydantic again
@@ -1384,6 +1387,7 @@ def _parse_chat_message_content_parts(
    wrap_dicts: bool,
    interleave_strings: bool,
    mm_processor_kwargs: dict[str, Any] | None = None,
+    multimodal_content_part_separator="\n",
 ) -> list[ConversationMessage]:
    content = list[_ContentPart]()
@@ -1406,7 +1410,10 @@ def _parse_chat_message_content_parts(
    mm_placeholder_storage = mm_parser.mm_placeholder_storage()
    if mm_placeholder_storage:
        text_prompt = _get_full_multimodal_text_prompt(
-            mm_placeholder_storage, texts, interleave_strings
+            mm_placeholder_storage,
+            texts,
+            interleave_strings,
+            multimodal_content_part_separator=multimodal_content_part_separator,
        )
    else:
        text_prompt = "\n".join(texts)

--- a/vllm/entrypoints/pooling/scoring/utils.py
+++ b/vllm/entrypoints/pooling/scoring/utils.py
@@ -150,6 +150,7 @@ def _parse_score_content(
        mm_tracker=mm_tracker,
        wrap_dicts=False,
        interleave_strings=False,
+        multimodal_content_part_separator="",
    )
    if parse_res: