diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 9b7018e48..6585643b5 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -694,7 +694,7 @@ Example template file: [examples/pooling/score/template/nemotron-rerank.jinja](. #### Single inference -You can pass a string to both `text_1` and `text_2`, forming a single sentence pair. +You can pass a string to both `queries` and `documents`, forming a single sentence pair. ```bash curl -X 'POST' \ @@ -704,8 +704,8 @@ curl -X 'POST' \ -d '{ "model": "BAAI/bge-reranker-v2-m3", "encoding_format": "float", - "text_1": "What is the capital of France?", - "text_2": "The capital of France is Paris." + "queries": "What is the capital of France?", + "documents": "The capital of France is Paris." }' ``` @@ -730,9 +730,9 @@ curl -X 'POST' \ #### Batch inference -You can pass a string to `text_1` and a list to `text_2`, forming multiple sentence pairs -where each pair is built from `text_1` and a string in `text_2`. -The total number of pairs is `len(text_2)`. +You can pass a string to `queries` and a list to `documents`, forming multiple sentence pairs +where each pair is built from `queries` and a string in `documents`. +The total number of pairs is `len(documents)`. ??? console "Request" @@ -743,8 +743,8 @@ The total number of pairs is `len(text_2)`. -H 'Content-Type: application/json' \ -d '{ "model": "BAAI/bge-reranker-v2-m3", - "text_1": "What is the capital of France?", - "text_2": [ + "queries": "What is the capital of France?", + "documents": [ "The capital of Brazil is Brasilia.", "The capital of France is Paris." ] @@ -775,9 +775,9 @@ The total number of pairs is `len(text_2)`. } ``` -You can pass a list to both `text_1` and `text_2`, forming multiple sentence pairs -where each pair is built from a string in `text_1` and the corresponding string in `text_2` (similar to `zip()`). 
-The total number of pairs is `len(text_2)`. +You can pass a list to both `queries` and `documents`, forming multiple sentence pairs +where each pair is built from a string in `queries` and the corresponding string in `documents` (similar to `zip()`). +The total number of pairs is `len(documents)`. ??? console "Request" @@ -789,11 +789,11 @@ The total number of pairs is `len(text_2)`. -d '{ "model": "BAAI/bge-reranker-v2-m3", "encoding_format": "float", - "text_1": [ + "queries": [ "What is the capital of Brazil?", "What is the capital of France?" ], - "text_2": [ + "documents": [ "The capital of Brazil is Brasilia.", "The capital of France is Paris." ] @@ -847,8 +847,8 @@ You can pass multi-modal inputs to scoring models by passing `content` including "http://localhost:8000/v1/score", json={ "model": "jinaai/jina-reranker-m0", - "text_1": "slm markdown", - "text_2": { + "queries": "slm markdown", + "documents": { "content": [ { "type": "image_url", diff --git a/examples/offline_inference/basic/score.py b/examples/offline_inference/basic/score.py index cbca50eb5..36f028515 100644 --- a/examples/offline_inference/basic/score.py +++ b/examples/offline_inference/basic/score.py @@ -21,8 +21,8 @@ def parse_args(): def main(args: Namespace): # Sample prompts. - text_1 = "What is the capital of France?" - texts_2 = [ + query = "What is the capital of France?" + documents = [ "The capital of Brazil is Brasilia.", "The capital of France is Paris.", ] @@ -32,13 +32,13 @@ def main(args: Namespace): llm = LLM(**vars(args)) # Generate scores. The output is a list of ScoringRequestOutputs. - outputs = llm.score(text_1, texts_2) + outputs = llm.score(query, documents) # Print the outputs. 
print("\nGenerated Outputs:\n" + "-" * 60) - for text_2, output in zip(texts_2, outputs): + for document, output in zip(documents, outputs): score = output.outputs.score - print(f"Pair: {[text_1, text_2]!r} \nScore: {score}") + print(f"Pair: {[query, document]!r} \nScore: {score}") print("-" * 60) diff --git a/examples/offline_inference/openai_batch/README.md b/examples/offline_inference/openai_batch/README.md index 7d5a1af8f..a969925ac 100644 --- a/examples/offline_inference/openai_batch/README.md +++ b/examples/offline_inference/openai_batch/README.md @@ -255,8 +255,8 @@ cat results.jsonl Add score requests to your batch file. The following is an example: ```text -{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} -{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} +{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "queries": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "queries": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} ``` You can mix chat completion, embedding, and score requests in the batch file, as long as the model you are using supports them all (note that all requests must use the same model). 
diff --git a/examples/pooling/score/cohere_rerank_online.py b/examples/pooling/score/cohere_rerank_client.py similarity index 100% rename from examples/pooling/score/cohere_rerank_online.py rename to examples/pooling/score/cohere_rerank_client.py diff --git a/examples/pooling/score/qwen3_reranker_online.py b/examples/pooling/score/qwen3_reranker_online.py index 441c1709d..f117b3b5b 100644 --- a/examples/pooling/score/qwen3_reranker_online.py +++ b/examples/pooling/score/qwen3_reranker_online.py @@ -50,8 +50,8 @@ documents = [ # Request payload for the score API data = { "model": "Qwen/Qwen3-Reranker-0.6B", - "text_1": queries, - "text_2": documents, + "queries": queries, + "documents": documents, } diff --git a/examples/pooling/score/score_api_online.py b/examples/pooling/score/score_api_online.py index f63c2bb84..af2886f13 100644 --- a/examples/pooling/score/score_api_online.py +++ b/examples/pooling/score/score_api_online.py @@ -30,29 +30,35 @@ def main(args): api_url = f"http://{args.host}:{args.port}/score" model_name = args.model - text_1 = "What is the capital of Brazil?" - text_2 = "The capital of Brazil is Brasilia." - prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} + queries = "What is the capital of Brazil?" + documents = "The capital of Brazil is Brasilia." + prompt = {"model": model_name, "queries": queries, "documents": documents} score_response = post_http_request(prompt=prompt, api_url=api_url) - print("\nPrompt when text_1 and text_2 are both strings:") + print("\nPrompt when queries and documents are both strings:") pprint.pprint(prompt) print("\nScore Response:") pprint.pprint(score_response.json()) - text_1 = "What is the capital of France?" - text_2 = ["The capital of Brazil is Brasilia.", "The capital of France is Paris."] - prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} + queries = "What is the capital of France?" 
+ documents = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", + ] + prompt = {"model": model_name, "queries": queries, "documents": documents} score_response = post_http_request(prompt=prompt, api_url=api_url) - print("\nPrompt when text_1 is string and text_2 is a list:") + print("\nPrompt when queries is string and documents is a list:") pprint.pprint(prompt) print("\nScore Response:") pprint.pprint(score_response.json()) - text_1 = ["What is the capital of Brazil?", "What is the capital of France?"] - text_2 = ["The capital of Brazil is Brasilia.", "The capital of France is Paris."] - prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} + queries = ["What is the capital of Brazil?", "What is the capital of France?"] + documents = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", + ] + prompt = {"model": model_name, "queries": queries, "documents": documents} score_response = post_http_request(prompt=prompt, api_url=api_url) - print("\nPrompt when text_1 and text_2 are both lists:") + print("\nPrompt when queries and documents are both lists:") pprint.pprint(prompt) print("\nScore Response:") pprint.pprint(score_response.json()) diff --git a/examples/pooling/score/vision_rerank_api_online.py b/examples/pooling/score/vision_rerank_api_online.py index 37a7decf3..d63ef2781 100644 --- a/examples/pooling/score/vision_rerank_api_online.py +++ b/examples/pooling/score/vision_rerank_api_online.py @@ -18,10 +18,22 @@ e.g. 
""" import argparse +import base64 import json import requests + +def encode_base64_content_from_url(content_url: str) -> dict[str, str]: + """Fetch content from a remote URL and wrap it in a base64 JPEG data URL.""" + + with requests.get(content_url, headers=headers) as response: + response.raise_for_status() + result = base64.b64encode(response.content).decode("utf-8") + + return {"url": f"data:image/jpeg;base64,{result}"} + + headers = {"accept": "application/json", "Content-Type": "application/json"} query = "A woman playing with her dog on a beach at sunset." @@ -30,8 +42,8 @@ documents = { { "type": "text", "text": ( - "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, " # noqa: E501 - "as the dog offers its paw in a heartwarming display of companionship and trust." # noqa: E501 + "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, " + "as the dog offers its paw in a heartwarming display of companionship and trust." ), }, { @@ -40,6 +52,12 @@ documents = { "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" }, }, + { + "type": "image_url", + "image_url": encode_base64_content_from_url( + "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" + ), + }, ] } diff --git a/examples/pooling/score/vision_score_api_online.py b/examples/pooling/score/vision_score_api_online.py index b4b4825ee..01041b846 100644 --- a/examples/pooling/score/vision_score_api_online.py +++ b/examples/pooling/score/vision_score_api_online.py @@ -17,15 +17,27 @@ e.g. 
""" import argparse +import base64 import json import pprint import requests + +def encode_base64_content_from_url(content_url: str) -> dict[str, str]: + """Fetch content from a remote URL and wrap it in a base64 JPEG data URL.""" + + with requests.get(content_url, headers=headers) as response: + response.raise_for_status() + result = base64.b64encode(response.content).decode("utf-8") + + return {"url": f"data:image/jpeg;base64,{result}"} + + headers = {"accept": "application/json", "Content-Type": "application/json"} -text_1 = "slm markdown" -text_2 = { +queries = "slm markdown" +documents = { "content": [ { "type": "image_url", @@ -39,6 +51,12 @@ text_2 = { "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png" }, }, + { + "type": "image_url", + "image_url": encode_base64_content_from_url( + "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png" + ), + }, ] } @@ -58,9 +76,9 @@ def main(args): response = requests.get(models_url, headers=headers) model = response.json()["data"][0]["id"] - prompt = {"model": model, "text_1": text_1, "text_2": text_2} + prompt = {"model": model, "queries": queries, "documents": documents} response = requests.post(score_url, headers=headers, json=prompt) - print("\nPrompt when text_1 is string and text_2 is a image list:") + print("\nPrompt when queries is string and documents is a image list:") pprint.pprint(prompt) print("\nScore Response:") print(json.dumps(response.json(), indent=2)) diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py index f6f109990..d80357a75 100644 --- a/tests/entrypoints/openai/test_run_batch.py +++ b/tests/entrypoints/openai/test_run_batch.py @@ -32,8 +32,8 @@ INPUT_EMBEDDING_BATCH = ( '{"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}' ) -INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": 
"/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} -{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}""" +INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "queries": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "queries": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}""" INPUT_RERANK_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} {"custom_id": "request-2", "method": "POST", "url": "/v1/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} diff --git a/tests/entrypoints/pooling/classify/test_online.py b/tests/entrypoints/pooling/classify/test_online.py index 4703af0a4..0bd62b9f4 100644 --- a/tests/entrypoints/pooling/classify/test_online.py +++ b/tests/entrypoints/pooling/classify/test_online.py @@ -251,8 +251,8 @@ async def test_score(server: RemoteOpenAIServer, model_name: str): server.url_for("score"), json={ "model": model_name, - "text_1": "ping", - "text_2": "pong", + "queries": "ping", + "documents": "pong", }, ) assert response.json()["error"]["type"] == "BadRequestError" 
diff --git a/tests/entrypoints/pooling/score/test_offline.py b/tests/entrypoints/pooling/score/test_offline.py index c02c02cf2..4964d94e6 100644 --- a/tests/entrypoints/pooling/score/test_offline.py +++ b/tests/entrypoints/pooling/score/test_offline.py @@ -43,12 +43,12 @@ def llm(): def test_pooling_params(llm: LLM): def get_outputs(use_activation): - text_1 = "What is the capital of France?" - text_2 = "The capital of France is Paris." + queries = "What is the capital of France?" + documents = "The capital of France is Paris." outputs = llm.score( - text_1, - text_2, + queries, + documents, pooling_params=PoolingParams(use_activation=use_activation), use_tqdm=False, ) diff --git a/tests/entrypoints/pooling/score/test_online_score.py b/tests/entrypoints/pooling/score/test_online_score.py index 053a836f6..1c74cf297 100644 --- a/tests/entrypoints/pooling/score/test_online_score.py +++ b/tests/entrypoints/pooling/score/test_online_score.py @@ -61,21 +61,18 @@ def runner(model: dict[str, Any], hf_runner): class TestModel: - def test_text_1_str_text_2_list( + def test_queries_str_documents_str( self, server: RemoteOpenAIServer, model: dict[str, Any], runner ): - text_1 = "What is the capital of France?" - text_2 = [ - "The capital of Brazil is Brasilia.", - "The capital of France is Paris.", - ] + queries = "What is the capital of France?" + documents = "The capital of France is Paris." 
score_response = requests.post( server.url_for("score"), json={ "model": model["name"], - "text_1": text_1, - "text_2": text_2, + "queries": queries, + "documents": documents, }, ) score_response.raise_for_status() @@ -83,46 +80,11 @@ class TestModel: assert score.id is not None assert score.data is not None - assert len(score.data) == 2 + assert len(score.data) == 1 vllm_outputs = [d.score for d in score.data] - text_pairs = [[text_1, text_2[0]], [text_1, text_2[1]]] - hf_outputs = run_transformers(runner, model, text_pairs) - - for i in range(len(vllm_outputs)): - assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) - - def test_text_1_list_text_2_list( - self, server: RemoteOpenAIServer, model: dict[str, Any], runner - ): - text_1 = [ - "What is the capital of the United States?", - "What is the capital of France?", - ] - text_2 = [ - "The capital of Brazil is Brasilia.", - "The capital of France is Paris.", - ] - - score_response = requests.post( - server.url_for("score"), - json={ - "model": model["name"], - "text_1": text_1, - "text_2": text_2, - }, - ) - score_response.raise_for_status() - score = ScoreResponse.model_validate(score_response.json()) - - assert score.id is not None - assert score.data is not None - assert len(score.data) == 2 - - vllm_outputs = [d.score for d in score.data] - - text_pairs = [[text_1[0], text_2[0]], [text_1[1], text_2[1]]] + text_pairs = [[queries, documents]] hf_outputs = run_transformers(runner, model, text_pairs) for i in range(len(vllm_outputs)): @@ -157,11 +119,40 @@ class TestModel: for i in range(len(vllm_outputs)): assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) - def test_score_max_model_len( - self, server: RemoteOpenAIServer, model: dict[str, Any] + def test_data_1_str_data_2_str( + self, server: RemoteOpenAIServer, model: dict[str, Any], runner ): - text_1 = "What is the capital of France?" * 20 - text_2 = [ + data_1 = "What is the capital of France?" 
+ data_2 = "The capital of France is Paris." + + score_response = requests.post( + server.url_for("score"), + json={ + "model": model["name"], + "data_1": data_1, + "data_2": data_2, + }, + ) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 1 + + vllm_outputs = [d.score for d in score.data] + + text_pairs = [[data_1, data_2]] + hf_outputs = run_transformers(runner, model, text_pairs) + + for i in range(len(vllm_outputs)): + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) + + def test_queries_str_documents_list( + self, server: RemoteOpenAIServer, model: dict[str, Any], runner + ): + queries = "What is the capital of France?" + documents = [ "The capital of Brazil is Brasilia.", "The capital of France is Paris.", ] @@ -170,8 +161,75 @@ class TestModel: server.url_for("score"), json={ "model": model["name"], - "text_1": text_1, - "text_2": text_2, + "queries": queries, + "documents": documents, + }, + ) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 2 + + vllm_outputs = [d.score for d in score.data] + + text_pairs = [[queries, documents[0]], [queries, documents[1]]] + hf_outputs = run_transformers(runner, model, text_pairs) + + for i in range(len(vllm_outputs)): + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) + + def test_queries_list_documents_list( + self, server: RemoteOpenAIServer, model: dict[str, Any], runner + ): + queries = [ + "What is the capital of the United States?", + "What is the capital of France?", + ] + documents = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", + ] + + score_response = requests.post( + server.url_for("score"), + json={ + "model": model["name"], + "queries": queries, + "documents": 
documents, + }, + ) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 2 + + vllm_outputs = [d.score for d in score.data] + + text_pairs = [[queries[0], documents[0]], [queries[1], documents[1]]] + hf_outputs = run_transformers(runner, model, text_pairs) + + for i in range(len(vllm_outputs)): + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) + + def test_score_max_model_len( + self, server: RemoteOpenAIServer, model: dict[str, Any] + ): + queries = "What is the capital of France?" * 20 + documents = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", + ] + + score_response = requests.post( + server.url_for("score"), + json={ + "model": model["name"], + "queries": queries, + "documents": documents, }, ) assert score_response.status_code == 400 @@ -183,8 +241,8 @@ class TestModel: server.url_for("score"), json={ "model": model["name"], - "text_1": text_1, - "text_2": text_2, + "queries": queries, + "documents": documents, "truncate_prompt_tokens": 101, }, ) @@ -192,13 +250,13 @@ class TestModel: assert "Please, select a smaller truncation size." in score_response.text def test_invocations(self, server: RemoteOpenAIServer, model: dict[str, Any]): - text_1 = "What is the capital of France?" - text_2 = "The capital of France is Paris." + queries = "What is the capital of France?" + documents = "The capital of France is Paris." request_args = { "model": model["name"], - "text_1": text_1, - "text_2": text_2, + "queries": queries, + "documents": documents, } score_response = requests.post(server.url_for("score"), json=request_args) @@ -225,14 +283,14 @@ class TestModel: def test_use_activation(self, server: RemoteOpenAIServer, model: dict[str, Any]): def get_outputs(use_activation): - text_1 = "What is the capital of France?" - text_2 = "The capital of France is Paris." 
+ queries = "What is the capital of France?" + documents = "The capital of France is Paris." response = requests.post( server.url_for("score"), json={ "model": model["name"], - "text_1": text_1, - "text_2": text_2, + "queries": queries, + "documents": documents, "use_activation": use_activation, }, ) diff --git a/tests/models/language/pooling_mteb_test/mteb_score_utils.py b/tests/models/language/pooling_mteb_test/mteb_score_utils.py index adc2cf3e4..d9e3521d9 100644 --- a/tests/models/language/pooling_mteb_test/mteb_score_utils.py +++ b/tests/models/language/pooling_mteb_test/mteb_score_utils.py @@ -117,8 +117,8 @@ class ScoreClientMtebEncoder(MtebCrossEncoderMixin): self.url, json={ "model": self.model_name, - "text_1": query, - "text_2": corpus, + "queries": query, + "documents": corpus, "truncate_prompt_tokens": -1, }, ).json() diff --git a/vllm/entrypoints/openai/engine/serving.py b/vllm/entrypoints/openai/engine/serving.py index 3d535f72d..0f4ee51e7 100644 --- a/vllm/entrypoints/openai/engine/serving.py +++ b/vllm/entrypoints/openai/engine/serving.py @@ -84,8 +84,11 @@ from vllm.entrypoints.pooling.pooling.protocol import ( ) from vllm.entrypoints.pooling.score.protocol import ( RerankRequest, + ScoreDataRequest, + ScoreQueriesDocumentsRequest, ScoreRequest, ScoreResponse, + ScoreTextRequest, ) from vllm.entrypoints.renderer import BaseRenderer, CompletionRenderer, RenderConfig from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse @@ -1032,7 +1035,9 @@ class OpenAIServing: ( EmbeddingChatRequest, EmbeddingCompletionRequest, - ScoreRequest, + ScoreDataRequest, + ScoreTextRequest, + ScoreQueriesDocumentsRequest, RerankRequest, ClassificationCompletionRequest, ClassificationChatRequest, @@ -1042,7 +1047,9 @@ class OpenAIServing: # since these requests don't generate tokens. 
if token_num > self.max_model_len: operations: dict[type[AnyRequest], str] = { - ScoreRequest: "score", + ScoreDataRequest: "score", + ScoreTextRequest: "score", + ScoreQueriesDocumentsRequest: "score", ClassificationCompletionRequest: "classification", ClassificationChatRequest: "classification", } diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 6f7da404a..0673bf37a 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -85,7 +85,7 @@ class BatchRequestInput(OpenAIBaseModel): if url == "/v1/embeddings": return TypeAdapter(EmbeddingRequest).validate_python(value) if url.endswith("/score"): - return ScoreRequest.model_validate(value) + return TypeAdapter(ScoreRequest).validate_python(value) if url.endswith("/rerank"): return RerankRequest.model_validate(value) return TypeAdapter(BatchRequestInputBody).validate_python(value) diff --git a/vllm/entrypoints/pooling/score/protocol.py b/vllm/entrypoints/pooling/score/protocol.py index ddc3c3f49..67f1a34dd 100644 --- a/vllm/entrypoints/pooling/score/protocol.py +++ b/vllm/entrypoints/pooling/score/protocol.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time -from typing import Any +from typing import Any, TypeAlias from pydantic import ( BaseModel, @@ -19,10 +19,7 @@ from vllm.entrypoints.pooling.score.utils import ( from vllm.utils import random_uuid -class ScoreRequest(PoolingBasicRequestMixin): - text_1: list[str] | str | ScoreMultiModalParam - text_2: list[str] | str | ScoreMultiModalParam - +class ScoreRequestMixin(PoolingBasicRequestMixin): # --8<-- [start:score-extra-params] mm_processor_kwargs: dict[str, Any] | None = Field( default=None, @@ -53,6 +50,42 @@ class ScoreRequest(PoolingBasicRequestMixin): ) +class ScoreDataRequest(ScoreRequestMixin): + data_1: list[str] | str | ScoreMultiModalParam + data_2: list[str] | str | ScoreMultiModalParam 
+ + +class ScoreQueriesDocumentsRequest(ScoreRequestMixin): + queries: list[str] | str | ScoreMultiModalParam + documents: list[str] | str | ScoreMultiModalParam + + @property + def data_1(self): + return self.queries + + @property + def data_2(self): + return self.documents + + +class ScoreTextRequest(ScoreRequestMixin): + text_1: list[str] | str | ScoreMultiModalParam + text_2: list[str] | str | ScoreMultiModalParam + + @property + def data_1(self): + return self.text_1 + + @property + def data_2(self): + return self.text_2 + + +ScoreRequest: TypeAlias = ( + ScoreQueriesDocumentsRequest | ScoreDataRequest | ScoreTextRequest +) + + class RerankRequest(PoolingBasicRequestMixin): query: str | ScoreMultiModalParam documents: list[str] | ScoreMultiModalParam diff --git a/vllm/entrypoints/pooling/score/serving.py b/vllm/entrypoints/pooling/score/serving.py index c0cb23faa..1040d2be1 100644 --- a/vllm/entrypoints/pooling/score/serving.py +++ b/vllm/entrypoints/pooling/score/serving.py @@ -66,15 +66,15 @@ class ServingScores(OpenAIServing): async def _embedding_score( self, tokenizer: TokenizerLike, - texts_1: list[str], - texts_2: list[str], + data_1: list[str], + data_2: list[str], request: RerankRequest | ScoreRequest, request_id: str, tokenization_kwargs: dict[str, Any] | None = None, lora_request: LoRARequest | None | None = None, trace_headers: Mapping[str, str] | None = None, ) -> list[PoolingRequestOutput] | ErrorResponse: - input_texts = texts_1 + texts_2 + input_texts = data_1 + data_2 engine_prompts: list[TokensPrompt] = [] tokenize_async = make_async( @@ -135,22 +135,22 @@ class ServingScores(OpenAIServing): async for i, res in result_generator: embeddings[i] = res - emb_texts_1: list[PoolingRequestOutput] = [] - emb_texts_2: list[PoolingRequestOutput] = [] + emb_data_1: list[PoolingRequestOutput] = [] + emb_data_2: list[PoolingRequestOutput] = [] - for i in range(0, len(texts_1)): + for i in range(0, len(data_1)): assert (emb := embeddings[i]) is not None - 
emb_texts_1.append(emb) + emb_data_1.append(emb) - for i in range(len(texts_1), len(embeddings)): + for i in range(len(data_1), len(embeddings)): assert (emb := embeddings[i]) is not None - emb_texts_2.append(emb) + emb_data_2.append(emb) - if len(emb_texts_1) == 1: - emb_texts_1 = emb_texts_1 * len(emb_texts_2) + if len(emb_data_1) == 1: + emb_data_1 = emb_data_1 * len(emb_data_2) final_res_batch = _cosine_similarity( - tokenizer=tokenizer, embed_1=emb_texts_1, embed_2=emb_texts_2 + tokenizer=tokenizer, embed_1=emb_data_1, embed_2=emb_data_2 ) return final_res_batch @@ -333,8 +333,8 @@ class ServingScores(OpenAIServing): else: return await self._embedding_score( tokenizer=tokenizer, - texts_1=data_1, # type: ignore[arg-type] - texts_2=data_2, # type: ignore[arg-type] + data_1=data_1, # type: ignore[arg-type] + data_2=data_2, # type: ignore[arg-type] request=request, request_id=request_id, tokenization_kwargs=tokenization_kwargs, @@ -361,8 +361,8 @@ class ServingScores(OpenAIServing): try: final_res_batch = await self._run_scoring( - request.text_1, - request.text_2, + request.data_1, + request.data_2, request, request_id, raw_request,