[Frontend][2/n] Make pooling entrypoints request schema consensus | ChatRequest (#32574)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
2026-01-22 18:32:44 +08:00
parent 64e3d67ac0
commit 328cbb2773
24 changed files with 456 additions and 205 deletions
--- a/tests/entrypoints/pooling/classify/test_online.py
+++ b/tests/entrypoints/pooling/classify/test_online.py
@@ -167,7 +167,8 @@ def test_truncate_prompt_tokens(server: RemoteOpenAIServer, model_name: str):

@pytest.mark.parametrize("model_name", [MODEL_NAME])
 def test_add_special_tokens(server: RemoteOpenAIServer, model_name: str):
-    # FIXME: The add_special_tokens parameter doesn't seem to be working.
+    # The add_special_tokens parameter doesn't seem to be working with this model.
+    # It does work with papluca/xlm-roberta-base-language-detection.
    response = requests.post(
        server.url_for("classify"),
        json={"model": model_name, "input": input_text, "add_special_tokens": False},
@@ -184,7 +185,110 @@ def test_add_special_tokens(server: RemoteOpenAIServer, model_name: str):


@pytest.mark.asyncio
-async def test_invocations(server: RemoteOpenAIServer):
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_chat_request(server: RemoteOpenAIServer, model_name: str):
+    messages = [
+        {
+            "role": "user",
+            "content": "The cat sat on the mat.",
+        },
+        {
+            "role": "assistant",
+            "content": "A feline was resting on a rug.",
+        },
+        {
+            "role": "user",
+            "content": "Stars twinkle brightly in the night sky.",
+        },
+    ]
+
+    # test chat request basic usage
+    response = requests.post(
+        server.url_for("classify"),
+        json={"model": model_name, "messages": messages},
+    )
+
+    response.raise_for_status()
+    output = ClassificationResponse.model_validate(response.json())
+
+    assert output.object == "list"
+    assert output.model == MODEL_NAME
+    assert len(output.data) == 1
+    assert hasattr(output.data[0], "label")
+    assert hasattr(output.data[0], "probs")
+    assert output.usage.prompt_tokens == 51
+
+    # test add_generation_prompt
+    response = requests.post(
+        server.url_for("classify"),
+        json={"model": model_name, "messages": messages, "add_generation_prompt": True},
+    )
+
+    response.raise_for_status()
+    output = ClassificationResponse.model_validate(response.json())
+
+    assert output.object == "list"
+    assert output.model == MODEL_NAME
+    assert len(output.data) == 1
+    assert hasattr(output.data[0], "label")
+    assert hasattr(output.data[0], "probs")
+    assert output.usage.prompt_tokens == 54
+
+    # test continue_final_message
+    response = requests.post(
+        server.url_for("classify"),
+        json={
+            "model": model_name,
+            "messages": messages,
+            "continue_final_message": True,
+        },
+    )
+
+    response.raise_for_status()
+    output = ClassificationResponse.model_validate(response.json())
+
+    assert output.object == "list"
+    assert output.model == MODEL_NAME
+    assert len(output.data) == 1
+    assert hasattr(output.data[0], "label")
+    assert hasattr(output.data[0], "probs")
+    assert output.usage.prompt_tokens == 49
+
+    # test add_special_tokens
+    # The add_special_tokens parameter doesn't seem to be working with this model.
+    response = requests.post(
+        server.url_for("classify"),
+        json={"model": model_name, "messages": messages, "add_special_tokens": True},
+    )
+
+    response.raise_for_status()
+    output = ClassificationResponse.model_validate(response.json())
+
+    assert output.object == "list"
+    assert output.model == MODEL_NAME
+    assert len(output.data) == 1
+    assert hasattr(output.data[0], "label")
+    assert hasattr(output.data[0], "probs")
+    assert output.usage.prompt_tokens == 51
+
+    # test continue_final_message with add_generation_prompt
+    response = requests.post(
+        server.url_for("classify"),
+        json={
+            "model": model_name,
+            "messages": messages,
+            "continue_final_message": True,
+            "add_generation_prompt": True,
+        },
+    )
+    assert (
+        "Cannot set both `continue_final_message` and `add_generation_prompt` to True."
+        in response.json()["error"]["message"]
+    )
+
+
+@pytest.mark.asyncio
+async def test_invocations_completion_request(server: RemoteOpenAIServer):
    request_args = {
        "model": MODEL_NAME,
        "input": input_text,
@@ -213,6 +317,48 @@ async def test_invocations(server: RemoteOpenAIServer):
        )


+@pytest.mark.asyncio
+async def test_invocations_chat_request(server: RemoteOpenAIServer):
+    messages = [
+        {
+            "role": "user",
+            "content": "The cat sat on the mat.",
+        },
+        {
+            "role": "assistant",
+            "content": "A feline was resting on a rug.",
+        },
+        {
+            "role": "user",
+            "content": "Stars twinkle brightly in the night sky.",
+        },
+    ]
+
+    request_args = {"model": MODEL_NAME, "messages": messages}
+
+    classification_response = requests.post(
+        server.url_for("classify"), json=request_args
+    )
+    classification_response.raise_for_status()
+
+    invocation_response = requests.post(
+        server.url_for("invocations"), json=request_args
+    )
+    invocation_response.raise_for_status()
+
+    classification_output = classification_response.json()
+    invocation_output = invocation_response.json()
+
+    assert classification_output.keys() == invocation_output.keys()
+    for classification_data, invocation_data in zip(
+        classification_output["data"], invocation_output["data"]
+    ):
+        assert classification_data.keys() == invocation_data.keys()
+        assert classification_data["probs"] == pytest.approx(
+            invocation_data["probs"], rel=0.01
+        )
+
+
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_use_activation(server: RemoteOpenAIServer, model_name: str):
--- a/tests/entrypoints/pooling/embed/test_online.py
+++ b/tests/entrypoints/pooling/embed/test_online.py
@@ -214,64 +214,6 @@ async def test_completion_request_batched(
    run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)


-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_conversation_embedding(
-    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
-):
-    messages = [
-        {
-            "role": "user",
-            "content": "The cat sat on the mat.",
-        },
-        {
-            "role": "assistant",
-            "content": "A feline was resting on a rug.",
-        },
-        {
-            "role": "user",
-            "content": "Stars twinkle brightly in the night sky.",
-        },
-    ]
-
-    chat_response = requests.post(
-        server.url_for("v1/embeddings"),
-        json={
-            "model": model_name,
-            "messages": messages,
-            "encoding_format": "float",
-        },
-    )
-    chat_response.raise_for_status()
-    chat_embeddings = EmbeddingResponse.model_validate(chat_response.json())
-
-    tokenizer = get_tokenizer(tokenizer_name=model_name)
-    prompt = tokenizer.apply_chat_template(
-        messages,
-        chat_template=DUMMY_CHAT_TEMPLATE,
-        add_generation_prompt=True,
-        continue_final_message=False,
-        tokenize=False,
-    )
-    completion_response = await client.embeddings.create(
-        model=model_name,
-        input=prompt,
-        encoding_format="float",
-        # To be consistent with chat
-        extra_body={"add_special_tokens": False},
-    )
-    completion_embeddings = EmbeddingResponse.model_validate(
-        completion_response.model_dump(mode="json")
-    )
-
-    assert chat_embeddings.id is not None
-    assert completion_embeddings.id is not None
-    assert chat_embeddings.created <= completion_embeddings.created
-    assert chat_embeddings.model_dump(exclude={"id", "created"}) == (
-        completion_embeddings.model_dump(exclude={"id", "created"})
-    )
-
-
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_truncate_prompt_tokens(client: openai.AsyncOpenAI, model_name: str):
@@ -350,7 +292,129 @@ async def test_truncate_prompt_tokens(client: openai.AsyncOpenAI, model_name: st


@pytest.mark.asyncio
-async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI):
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_chat_request(
+    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
+):
+    messages = [
+        {
+            "role": "user",
+            "content": "The cat sat on the mat.",
+        },
+        {
+            "role": "assistant",
+            "content": "A feline was resting on a rug.",
+        },
+        {
+            "role": "user",
+            "content": "Stars twinkle brightly in the night sky.",
+        },
+    ]
+
+    # test chat request basic usage
+    chat_response = requests.post(
+        server.url_for("v1/embeddings"),
+        json={
+            "model": model_name,
+            "messages": messages,
+            "encoding_format": "float",
+        },
+    )
+    chat_response.raise_for_status()
+    chat_embeddings = EmbeddingResponse.model_validate(chat_response.json())
+
+    tokenizer = get_tokenizer(tokenizer_name=model_name)
+    prompt = tokenizer.apply_chat_template(
+        messages,
+        chat_template=DUMMY_CHAT_TEMPLATE,
+        add_generation_prompt=True,
+        continue_final_message=False,
+        tokenize=False,
+    )
+    completion_response = await client.embeddings.create(
+        model=model_name,
+        input=prompt,
+        encoding_format="float",
+        # To be consistent with chat
+        extra_body={"add_special_tokens": False},
+    )
+    completion_embeddings = EmbeddingResponse.model_validate(
+        completion_response.model_dump(mode="json")
+    )
+
+    assert chat_embeddings.id is not None
+    assert completion_embeddings.id is not None
+    assert chat_embeddings.created <= completion_embeddings.created
+    assert chat_embeddings.model_dump(exclude={"id", "created"}) == (
+        completion_embeddings.model_dump(exclude={"id", "created"})
+    )
+
+    # test add_generation_prompt
+    response = requests.post(
+        server.url_for("v1/embeddings"),
+        json={"model": model_name, "messages": messages, "add_generation_prompt": True},
+    )
+
+    response.raise_for_status()
+    output = EmbeddingResponse.model_validate(response.json())
+
+    assert output.object == "list"
+    assert len(output.data) == 1
+    assert output.model == MODEL_NAME
+    assert output.usage.prompt_tokens == 34
+
+    # test continue_final_message
+    response = requests.post(
+        server.url_for("v1/embeddings"),
+        json={
+            "model": model_name,
+            "messages": messages,
+            "continue_final_message": True,
+        },
+    )
+
+    response.raise_for_status()
+    output = EmbeddingResponse.model_validate(response.json())
+
+    assert output.object == "list"
+    assert len(output.data) == 1
+    assert output.model == MODEL_NAME
+    assert output.usage.prompt_tokens == 33
+
+    # test add_special_tokens
+    response = requests.post(
+        server.url_for("v1/embeddings"),
+        json={"model": model_name, "messages": messages, "add_special_tokens": True},
+    )
+
+    response.raise_for_status()
+    output = EmbeddingResponse.model_validate(response.json())
+
+    assert output.object == "list"
+    assert len(output.data) == 1
+    assert output.model == MODEL_NAME
+    assert output.usage.prompt_tokens == 36
+
+    # test continue_final_message with add_generation_prompt
+    response = requests.post(
+        server.url_for("v1/embeddings"),
+        json={
+            "model": model_name,
+            "messages": messages,
+            "continue_final_message": True,
+            "add_generation_prompt": True,
+        },
+    )
+    assert (
+        "Cannot set both `continue_final_message` and `add_generation_prompt` to True."
+        in response.json()["error"]["message"]
+    )
+
+
+@pytest.mark.asyncio
+async def test_invocations_completion_request(
+    server: RemoteOpenAIServer, client: openai.AsyncOpenAI
+):
    request_args = {
        "model": MODEL_NAME,
        "input": input_text,
@@ -381,7 +445,7 @@ async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenA


@pytest.mark.asyncio
-async def test_invocations_conversation(server: RemoteOpenAIServer):
+async def test_invocations_chat_request(server: RemoteOpenAIServer):
    messages = [
        {
            "role": "user",
--- a/tests/entrypoints/pooling/pooling/test_online.py
+++ b/tests/entrypoints/pooling/pooling/test_online.py
@@ -138,7 +138,7 @@ def test_completion_request_batched(server: RemoteOpenAIServer, model_name: str)

@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_conversation_pooling(server: RemoteOpenAIServer, model_name: str):
+async def test_chat_request(server: RemoteOpenAIServer, model_name: str):
    messages = [
        {
            "role": "user",
@@ -154,6 +154,7 @@ async def test_conversation_pooling(server: RemoteOpenAIServer, model_name: str)
        },
    ]

+    # test chat request basic usage
    chat_response = requests.post(
        server.url_for("pooling"),
        json={
@@ -193,6 +194,68 @@ async def test_conversation_pooling(server: RemoteOpenAIServer, model_name: str)
        completion_poolings.model_dump(exclude={"id", "created"})
    )

+    # test add_generation_prompt
+    response = requests.post(
+        server.url_for("pooling"),
+        json={"model": model_name, "messages": messages, "add_generation_prompt": True},
+    )
+
+    response.raise_for_status()
+    output = PoolingResponse.model_validate(response.json())
+
+    assert output.object == "list"
+    assert len(output.data) == 1
+    assert output.model == MODEL_NAME
+    assert output.usage.prompt_tokens == 33
+
+    # test continue_final_message
+    # The continue_final_message parameter doesn't seem to be working with this model.
+    response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "messages": messages,
+            "continue_final_message": True,
+        },
+    )
+
+    response.raise_for_status()
+    output = PoolingResponse.model_validate(response.json())
+
+    assert output.object == "list"
+    assert len(output.data) == 1
+    assert output.model == MODEL_NAME
+    assert output.usage.prompt_tokens == 33
+
+    # test add_special_tokens
+    response = requests.post(
+        server.url_for("pooling"),
+        json={"model": model_name, "messages": messages, "add_special_tokens": True},
+    )
+
+    response.raise_for_status()
+    output = PoolingResponse.model_validate(response.json())
+
+    assert output.object == "list"
+    assert len(output.data) == 1
+    assert output.model == MODEL_NAME
+    assert output.usage.prompt_tokens == 34
+
+    # test continue_final_message with add_generation_prompt
+    response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "messages": messages,
+            "continue_final_message": True,
+            "add_generation_prompt": True,
+        },
+    )
+    assert (
+        "Cannot set both `continue_final_message` and `add_generation_prompt` to True."
+        in response.json()["error"]["message"]
+    )
+

@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@@ -430,7 +493,7 @@ async def test_params_not_supported(


@pytest.mark.asyncio
-async def test_invocations(server: RemoteOpenAIServer):
+async def test_invocations_chat_request(server: RemoteOpenAIServer):
    request_args = {
        "model": MODEL_NAME,
        "input": input_text,
@@ -462,7 +525,7 @@ async def test_invocations(server: RemoteOpenAIServer):


@pytest.mark.asyncio
-async def test_invocations_conversation(server: RemoteOpenAIServer):
+async def test_invocations_conversation_chat_request(server: RemoteOpenAIServer):
    messages = [
        {
            "role": "user",