[Frontend][2/n] Make pooling entrypoints request schema consensus | ChatRequest (#32574)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
This commit is contained in:
wang.yuqi
2026-01-22 18:32:44 +08:00
committed by GitHub
parent 64e3d67ac0
commit 328cbb2773
24 changed files with 456 additions and 205 deletions

View File

@@ -167,7 +167,8 @@ def test_truncate_prompt_tokens(server: RemoteOpenAIServer, model_name: str):
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_add_special_tokens(server: RemoteOpenAIServer, model_name: str):
# FIXME: The add_special_tokens parameter doesn't seem to be working.
# The add_special_tokens parameter doesn't seem to be working with this model
# (papluca/xlm-roberta-base-language-detection).
response = requests.post(
server.url_for("classify"),
json={"model": model_name, "input": input_text, "add_special_tokens": False},
@@ -184,7 +185,110 @@ def test_add_special_tokens(server: RemoteOpenAIServer, model_name: str):
@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer):
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat_request(server: RemoteOpenAIServer, model_name: str):
# Exercises the "classify" endpoint with a chat-style ("messages") payload.
# Covered cases, in order: basic usage, add_generation_prompt,
# continue_final_message, add_special_tokens, and the rejected combination of
# continue_final_message with add_generation_prompt.
# Every successful case validates the body as a ClassificationResponse and
# pins usage.prompt_tokens to a hard-coded value.
messages = [
{
"role": "user",
"content": "The cat sat on the mat.",
},
{
"role": "assistant",
"content": "A feline was resting on a rug.",
},
{
"role": "user",
"content": "Stars twinkle brightly in the night sky.",
},
]
# test chat request basic usage
response = requests.post(
server.url_for("classify"),
json={"model": model_name, "messages": messages},
)
response.raise_for_status()
output = ClassificationResponse.model_validate(response.json())
assert output.object == "list"
assert output.model == MODEL_NAME
# The whole conversation yields a single classification item.
assert len(output.data) == 1
assert hasattr(output.data[0], "label")
assert hasattr(output.data[0], "probs")
# Baseline prompt length that the cases below are compared against.
assert output.usage.prompt_tokens == 51
# test add_generation_prompt
response = requests.post(
server.url_for("classify"),
json={"model": model_name, "messages": messages, "add_generation_prompt": True},
)
response.raise_for_status()
output = ClassificationResponse.model_validate(response.json())
assert output.object == "list"
assert output.model == MODEL_NAME
assert len(output.data) == 1
assert hasattr(output.data[0], "label")
assert hasattr(output.data[0], "probs")
# Three tokens more than the 51-token baseline.
assert output.usage.prompt_tokens == 54
# test continue_final_message
response = requests.post(
server.url_for("classify"),
json={
"model": model_name,
"messages": messages,
"continue_final_message": True,
},
)
response.raise_for_status()
output = ClassificationResponse.model_validate(response.json())
assert output.object == "list"
assert output.model == MODEL_NAME
assert len(output.data) == 1
assert hasattr(output.data[0], "label")
assert hasattr(output.data[0], "probs")
# Two tokens fewer than the 51-token baseline.
assert output.usage.prompt_tokens == 49
# test add_special_tokens
# The add_special_tokens parameter doesn't seem to be working with this model.
response = requests.post(
server.url_for("classify"),
json={"model": model_name, "messages": messages, "add_special_tokens": True},
)
response.raise_for_status()
output = ClassificationResponse.model_validate(response.json())
assert output.object == "list"
assert output.model == MODEL_NAME
assert len(output.data) == 1
assert hasattr(output.data[0], "label")
assert hasattr(output.data[0], "probs")
# Same count as the basic request (51), consistent with the note above.
assert output.usage.prompt_tokens == 51
# test continue_final_message with add_generation_prompt
response = requests.post(
server.url_for("classify"),
json={
"model": model_name,
"messages": messages,
"continue_final_message": True,
"add_generation_prompt": True,
},
)
# NOTE(review): no raise_for_status() or status-code assertion here; only the
# error message text in the JSON body is checked.
assert (
"Cannot set both `continue_final_message` and `add_generation_prompt` to True."
in response.json()["error"]["message"]
)
@pytest.mark.asyncio
async def test_invocations_completion_request(server: RemoteOpenAIServer):
request_args = {
"model": MODEL_NAME,
"input": input_text,
@@ -213,6 +317,48 @@ async def test_invocations(server: RemoteOpenAIServer):
)
@pytest.mark.asyncio
async def test_invocations_chat_request(server: RemoteOpenAIServer):
    """Check that ``invocations`` mirrors ``classify`` for a chat-style request.

    The same ``messages`` payload is posted to both endpoints. The two JSON
    bodies must expose the same top-level keys and the same per-item keys, and
    each item's ``probs`` must agree within a 1% relative tolerance.
    """
    conversation = [
        {"role": "user", "content": "The cat sat on the mat."},
        {"role": "assistant", "content": "A feline was resting on a rug."},
        {"role": "user", "content": "Stars twinkle brightly in the night sky."},
    ]
    payload = {"model": MODEL_NAME, "messages": conversation}

    # Fail fast on an HTTP error so the comparisons below never run against
    # an error body.
    classify_resp = requests.post(server.url_for("classify"), json=payload)
    classify_resp.raise_for_status()

    invoke_resp = requests.post(server.url_for("invocations"), json=payload)
    invoke_resp.raise_for_status()

    classify_json = classify_resp.json()
    invoke_json = invoke_resp.json()
    assert classify_json.keys() == invoke_json.keys()

    # Items are compared pairwise, in response order.
    paired_items = zip(classify_json["data"], invoke_json["data"])
    for classify_item, invoke_item in paired_items:
        assert classify_item.keys() == invoke_item.keys()
        assert classify_item["probs"] == pytest.approx(
            invoke_item["probs"], rel=0.01
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_use_activation(server: RemoteOpenAIServer, model_name: str):

View File

@@ -214,64 +214,6 @@ async def test_completion_request_batched(
run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_conversation_embedding(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """Compare a chat-style embedding request with its rendered-prompt twin.

    The conversation is sent as ``messages`` to ``v1/embeddings``. The same
    conversation is then rendered to text with ``DUMMY_CHAT_TEMPLATE`` and
    embedded through the OpenAI client with ``add_special_tokens`` disabled.
    Apart from ``id`` and ``created``, both responses must be equal.
    """
    conversation = [
        {"role": "user", "content": "The cat sat on the mat."},
        {"role": "assistant", "content": "A feline was resting on a rug."},
        {"role": "user", "content": "Stars twinkle brightly in the night sky."},
    ]

    # Chat path: the server is handed the raw messages.
    chat_payload = {
        "model": model_name,
        "messages": conversation,
        "encoding_format": "float",
    }
    raw_chat = requests.post(server.url_for("v1/embeddings"), json=chat_payload)
    raw_chat.raise_for_status()
    chat_embeddings = EmbeddingResponse.model_validate(raw_chat.json())

    # Completion path: render the prompt locally, then embed the plain text.
    tok = get_tokenizer(tokenizer_name=model_name)
    rendered_prompt = tok.apply_chat_template(
        conversation,
        chat_template=DUMMY_CHAT_TEMPLATE,
        add_generation_prompt=True,
        continue_final_message=False,
        tokenize=False,
    )
    raw_completion = await client.embeddings.create(
        model=model_name,
        input=rendered_prompt,
        encoding_format="float",
        # To be consistent with chat
        extra_body={"add_special_tokens": False},
    )
    completion_embeddings = EmbeddingResponse.model_validate(
        raw_completion.model_dump(mode="json")
    )

    assert chat_embeddings.id is not None
    assert completion_embeddings.id is not None
    # The chat request was issued first, so its timestamp cannot be later.
    assert chat_embeddings.created <= completion_embeddings.created

    chat_dump = chat_embeddings.model_dump(exclude={"id", "created"})
    completion_dump = completion_embeddings.model_dump(exclude={"id", "created"})
    assert chat_dump == completion_dump
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_truncate_prompt_tokens(client: openai.AsyncOpenAI, model_name: str):
@@ -350,7 +292,129 @@ async def test_truncate_prompt_tokens(client: openai.AsyncOpenAI, model_name: st
@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI):
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat_request(
server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
# Exercises the "v1/embeddings" endpoint with a chat-style ("messages")
# payload. Covered cases, in order: basic usage (compared against an
# equivalent rendered-prompt request), add_generation_prompt,
# continue_final_message, add_special_tokens, and the rejected combination of
# continue_final_message with add_generation_prompt.
messages = [
{
"role": "user",
"content": "The cat sat on the mat.",
},
{
"role": "assistant",
"content": "A feline was resting on a rug.",
},
{
"role": "user",
"content": "Stars twinkle brightly in the night sky.",
},
]
# test chat request basic usage
chat_response = requests.post(
server.url_for("v1/embeddings"),
json={
"model": model_name,
"messages": messages,
"encoding_format": "float",
},
)
chat_response.raise_for_status()
chat_embeddings = EmbeddingResponse.model_validate(chat_response.json())
# Render the same conversation to plain text locally so it can be sent as an
# ordinary "input" embedding request for comparison.
tokenizer = get_tokenizer(tokenizer_name=model_name)
prompt = tokenizer.apply_chat_template(
messages,
chat_template=DUMMY_CHAT_TEMPLATE,
add_generation_prompt=True,
continue_final_message=False,
tokenize=False,
)
completion_response = await client.embeddings.create(
model=model_name,
input=prompt,
encoding_format="float",
# To be consistent with chat
extra_body={"add_special_tokens": False},
)
completion_embeddings = EmbeddingResponse.model_validate(
completion_response.model_dump(mode="json")
)
assert chat_embeddings.id is not None
assert completion_embeddings.id is not None
# The chat request was issued first, so its timestamp cannot be later.
assert chat_embeddings.created <= completion_embeddings.created
# Apart from the per-request id and created fields, both responses must match.
assert chat_embeddings.model_dump(exclude={"id", "created"}) == (
completion_embeddings.model_dump(exclude={"id", "created"})
)
# test add_generation_prompt
response = requests.post(
server.url_for("v1/embeddings"),
json={"model": model_name, "messages": messages, "add_generation_prompt": True},
)
response.raise_for_status()
output = EmbeddingResponse.model_validate(response.json())
assert output.object == "list"
# The whole conversation yields a single embedding item.
assert len(output.data) == 1
assert output.model == MODEL_NAME
assert output.usage.prompt_tokens == 34
# test continue_final_message
response = requests.post(
server.url_for("v1/embeddings"),
json={
"model": model_name,
"messages": messages,
"continue_final_message": True,
},
)
response.raise_for_status()
output = EmbeddingResponse.model_validate(response.json())
assert output.object == "list"
assert len(output.data) == 1
assert output.model == MODEL_NAME
# One token fewer than the add_generation_prompt case above (34).
assert output.usage.prompt_tokens == 33
# test add_special_tokens
response = requests.post(
server.url_for("v1/embeddings"),
json={"model": model_name, "messages": messages, "add_special_tokens": True},
)
response.raise_for_status()
output = EmbeddingResponse.model_validate(response.json())
assert output.object == "list"
assert len(output.data) == 1
assert output.model == MODEL_NAME
# Two tokens more than the add_generation_prompt case above (34).
assert output.usage.prompt_tokens == 36
# test continue_final_message with add_generation_prompt
response = requests.post(
server.url_for("v1/embeddings"),
json={
"model": model_name,
"messages": messages,
"continue_final_message": True,
"add_generation_prompt": True,
},
)
# NOTE(review): no raise_for_status() or status-code assertion here; only the
# error message text in the JSON body is checked.
assert (
"Cannot set both `continue_final_message` and `add_generation_prompt` to True."
in response.json()["error"]["message"]
)
@pytest.mark.asyncio
async def test_invocations_completion_request(
server: RemoteOpenAIServer, client: openai.AsyncOpenAI
):
request_args = {
"model": MODEL_NAME,
"input": input_text,
@@ -381,7 +445,7 @@ async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenA
@pytest.mark.asyncio
async def test_invocations_conversation(server: RemoteOpenAIServer):
async def test_invocations_chat_request(server: RemoteOpenAIServer):
messages = [
{
"role": "user",

View File

@@ -138,7 +138,7 @@ def test_completion_request_batched(server: RemoteOpenAIServer, model_name: str)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_conversation_pooling(server: RemoteOpenAIServer, model_name: str):
async def test_chat_request(server: RemoteOpenAIServer, model_name: str):
messages = [
{
"role": "user",
@@ -154,6 +154,7 @@ async def test_conversation_pooling(server: RemoteOpenAIServer, model_name: str)
},
]
# test chat request basic usage
chat_response = requests.post(
server.url_for("pooling"),
json={
@@ -193,6 +194,68 @@ async def test_conversation_pooling(server: RemoteOpenAIServer, model_name: str)
completion_poolings.model_dump(exclude={"id", "created"})
)
# test add_generation_prompt
response = requests.post(
server.url_for("pooling"),
json={"model": model_name, "messages": messages, "add_generation_prompt": True},
)
response.raise_for_status()
output = PoolingResponse.model_validate(response.json())
assert output.object == "list"
assert len(output.data) == 1
assert output.model == MODEL_NAME
assert output.usage.prompt_tokens == 33
# test continue_final_message
# The continue_final_message parameter doesn't seem to be working with this model.
response = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"messages": messages,
"continue_final_message": True,
},
)
response.raise_for_status()
output = PoolingResponse.model_validate(response.json())
assert output.object == "list"
assert len(output.data) == 1
assert output.model == MODEL_NAME
assert output.usage.prompt_tokens == 33
# test add_special_tokens
response = requests.post(
server.url_for("pooling"),
json={"model": model_name, "messages": messages, "add_special_tokens": True},
)
response.raise_for_status()
output = PoolingResponse.model_validate(response.json())
assert output.object == "list"
assert len(output.data) == 1
assert output.model == MODEL_NAME
assert output.usage.prompt_tokens == 34
# test continue_final_message with add_generation_prompt
response = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"messages": messages,
"continue_final_message": True,
"add_generation_prompt": True,
},
)
assert (
"Cannot set both `continue_final_message` and `add_generation_prompt` to True."
in response.json()["error"]["message"]
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@@ -430,7 +493,7 @@ async def test_params_not_supported(
@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer):
async def test_invocations_chat_request(server: RemoteOpenAIServer):
request_args = {
"model": MODEL_NAME,
"input": input_text,
@@ -462,7 +525,7 @@ async def test_invocations(server: RemoteOpenAIServer):
@pytest.mark.asyncio
async def test_invocations_conversation(server: RemoteOpenAIServer):
async def test_invocations_conversation_chat_request(server: RemoteOpenAIServer):
messages = [
{
"role": "user",