[BugFix][Frontend] Fix LLM.chat() tokenization (#16081)

Signed-off-by: Nick Hill <nhill@redhat.com>
Author: Nick Hill
Date: 2025-04-25 15:20:05 -07:00 (committed by GitHub)
parent 65e262b93b
commit 70116459c3
2 changed files with 43 additions and 14 deletions


@@ -89,3 +89,31 @@ def test_chat_multi_image(image_urls: list[str]):
    }]
    outputs = llm.chat(messages)
    assert len(outputs) >= 0


def test_llm_chat_tokenization_no_double_bos():
    """
    LLM.chat() should not add special tokens when using chat templates.
    Check we get a single BOS token for llama chat.
    """
    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", enforce_eager=True)
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant"
        },
        {
            "role": "user",
            "content": "Hello!"
        },
    ]
    outputs = llm.chat(messages)
    assert len(outputs) == 1

    prompt_token_ids = getattr(outputs[0], "prompt_token_ids", None)
    assert prompt_token_ids is not None

    bos_token = llm.get_tokenizer().bos_token_id

    # Ensure we have a single BOS
    assert prompt_token_ids[0] == bos_token
    assert prompt_token_ids[1] != bos_token, "Double BOS"
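
For context, a minimal sketch of why a double BOS can appear in the first place (this is not the vLLM change itself, which lives in the other changed file not shown here): Llama-style chat templates already render the BOS token into the prompt string, so re-tokenizing that string with special tokens enabled prepends a second BOS. The sketch below uses the Hugging Face `transformers` tokenizer API directly and assumes access to the gated Llama model name; it only illustrates the failure mode the test above guards against.

```python
# Sketch only: shows the double-BOS failure mode with the HF tokenizer,
# not the actual vLLM patch in this commit.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Hello!"},
]

# The rendered chat prompt already contains the BOS token text
# ("<|begin_of_text|>" for Llama 3.x).
prompt = tok.apply_chat_template(messages,
                                 tokenize=False,
                                 add_generation_prompt=True)

buggy = tok.encode(prompt)                            # add_special_tokens defaults to True
fixed = tok.encode(prompt, add_special_tokens=False)  # what chat tokenization should do

# Expected behavior with the Llama 3.x tokenizer:
assert buggy[0] == buggy[1] == tok.bos_token_id       # double BOS
assert fixed[0] == tok.bos_token_id
assert fixed[1] != tok.bos_token_id                   # single BOS
```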