[Frontend][4/n] Make pooling entrypoints request schema consensus | ScoreRequest (#33060)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
wang.yuqi
2026-02-04 09:48:40 +08:00
committed by GitHub
parent 52ee21021a
commit 1b8fe6f7c4
8 changed files with 432 additions and 205 deletions

View File

@@ -89,6 +89,29 @@ def main(args):
    response = requests.post(rerank_url, json=prompt)
    pprint.pprint(response.json())

    print("Query: string & Document: text + image url")
    prompt = {
        "model": model,
        "query": query,
        "documents": {"content": [documents[0], documents[1]]},
    }
    response = requests.post(rerank_url, json=prompt)
    pprint.pprint(response.json())

    print("Query: string & Document: list")
    prompt = {
        "model": model,
        "query": query,
        "documents": [
            document,
            {"content": [documents[0]]},
            {"content": [documents[1]]},
            {"content": [documents[0], documents[1]]},
        ],
    }
    response = requests.post(rerank_url, json=prompt)
    pprint.pprint(response.json())


if __name__ == "__main__":
    args = parse_args()

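Note: the hunk above extends an example client script, so `model`, `query`, `document`, `documents`, and `rerank_url` are defined earlier in that file. A self-contained sketch of the same request shapes (the server address is an assumption; the model name is taken from the tests added in this commit) could look like:

# Hypothetical standalone version of the new rerank example; the server
# address is an assumption, the model name comes from the tests added here.
import pprint

import requests

rerank_url = "http://localhost:8000/rerank"
model = "Qwen/Qwen3-VL-Reranker-2B"

query = "A cat standing in the snow."
document = "This product was excellent and exceeded my expectations."
documents = [
    {"type": "text", "text": document},
    {
        "type": "image_url",
        "image_url": {
            "url": "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
        },
    },
]

# One query against a single document built from a text part plus an image part.
prompt = {
    "model": model,
    "query": query,
    "documents": {"content": [documents[0], documents[1]]},
}
pprint.pprint(requests.post(rerank_url, json=prompt).json())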
View File

@@ -92,6 +92,44 @@ def main(args):
    response = requests.post(score_url, json=prompt)
    pprint.pprint(response.json())

    print("Query: string & Document: text + image url")
    prompt = {
        "model": model,
        "queries": query,
        "documents": {"content": [documents[0], documents[1]]},
    }
    response = requests.post(score_url, json=prompt)
    pprint.pprint(response.json())

    print("Query: string & Document: list")
    prompt = {
        "model": model,
        "queries": query,
        "documents": [
            document,
            {"content": [documents[0]]},
            {"content": [documents[1]]},
            {"content": [documents[0], documents[1]]},
        ],
    }
    response = requests.post(score_url, json=prompt)
    pprint.pprint(response.json())

    print("Query: list & Document: list")
    data = [
        document,
        {"content": [documents[0]]},
        {"content": [documents[1]]},
        {"content": [documents[0], documents[1]]},
    ]
    prompt = {
        "model": model,
        "queries": data,
        "documents": data,
    }
    response = requests.post(score_url, json=prompt)
    pprint.pprint(response.json())


if __name__ == "__main__":
    args = parse_args()

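Note: in the score example, `queries` may also be a list that is paired element-wise with `documents`. A self-contained sketch of that case (the server address is an assumption; the model name comes from the tests in this commit):

# Hypothetical standalone version of the "Query: list & Document: list" case.
import pprint

import requests

score_url = "http://localhost:8000/score"
model = "Qwen/Qwen3-VL-Reranker-2B"

query = "A cat standing in the snow."
document = "This product was excellent and exceeded my expectations."
text_part = {"type": "text", "text": document}

# queries[i] is scored against documents[i], so both lists must match in length.
prompt = {
    "model": model,
    "queries": [query, query],
    "documents": [document, {"content": [text_part]}],
}
pprint.pprint(requests.post(score_url, json=prompt).json())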
View File

@@ -90,6 +90,35 @@ class TestModel:
        for i in range(len(vllm_outputs)):
            assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)

    def test_queries_str_items_str(
        self, server: RemoteOpenAIServer, model: dict[str, Any], runner
    ):
        queries = "What is the capital of France?"
        items = "The capital of France is Paris."

        score_response = requests.post(
            server.url_for("score"),
            json={
                "model": model["name"],
                "queries": queries,
                "items": items,
            },
        )
        score_response.raise_for_status()
        score = ScoreResponse.model_validate(score_response.json())

        assert score.id is not None
        assert score.data is not None
        assert len(score.data) == 1

        vllm_outputs = [d.score for d in score.data]

        text_pairs = [[queries, items]]
        hf_outputs = run_transformers(runner, model, text_pairs)

        for i in range(len(vllm_outputs)):
            assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)

    def test_text_1_str_text_2_str(
        self, server: RemoteOpenAIServer, model: dict[str, Any], runner
    ):

View File

@@ -5,7 +5,7 @@ import pytest
import requests
from tests.utils import VLLM_PATH, RemoteOpenAIServer
from vllm.entrypoints.pooling.score.protocol import ScoreResponse
from vllm.entrypoints.pooling.score.protocol import RerankResponse, ScoreResponse
from vllm.multimodal.utils import encode_image_url, fetch_image
MODEL_NAME = "Qwen/Qwen3-VL-Reranker-2B"
@@ -16,11 +16,12 @@ HF_OVERRIDES = {
}
query = "A cat standing in the snow."
document = "This product was excellent and exceeded my expectations."
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
documents = [
    {
        "type": "text",
        "text": query,
        "text": document,
    },
    {
        "type": "image_url",
@@ -32,6 +33,11 @@ documents = [
    },
]
TEXT_VS_TEXT = 0.10040374100208282
TEXT_VS_IMAGE = 0.7423753142356873
TEXT_VS_TEXT_PLUS_IMAGE = 0.5298863053321838
TOL = 0.05
@pytest.fixture(scope="module")
def server():
@@ -50,15 +56,12 @@ def server():
def test_score_api_queries_str_documents_str(server: RemoteOpenAIServer):
    queries = "What is the capital of France?"
    documents = "The capital of France is Paris."

    score_response = requests.post(
        server.url_for("score"),
        json={
            "model": MODEL_NAME,
            "queries": queries,
            "documents": documents,
            "queries": query,
            "documents": document,
        },
    )
    score_response.raise_for_status()
@@ -67,6 +70,8 @@ def test_score_api_queries_str_documents_str(server: RemoteOpenAIServer):
    assert score.id is not None
    assert score.data is not None
    assert len(score.data) == 1

    assert score.usage.prompt_tokens == 81
    assert score.data[0].score == pytest.approx(TEXT_VS_TEXT, rel=TOL)


def test_score_api_queries_str_documents_text_content(server: RemoteOpenAIServer):
@@ -84,6 +89,8 @@ def test_score_api_queries_str_documents_text_content(server: RemoteOpenAIServer
    assert score.id is not None
    assert score.data is not None
    assert len(score.data) == 1

    assert score.usage.prompt_tokens == 81
    assert score.data[0].score == pytest.approx(TEXT_VS_TEXT, rel=TOL)


def test_score_api_queries_str_documents_image_url_content(server: RemoteOpenAIServer):
@@ -101,6 +108,8 @@ def test_score_api_queries_str_documents_image_url_content(server: RemoteOpenAIS
    assert score.id is not None
    assert score.data is not None
    assert len(score.data) == 1

    assert score.usage.prompt_tokens == 98
    assert score.data[0].score == pytest.approx(TEXT_VS_IMAGE, rel=TOL)


def test_score_api_queries_str_documents_image_base64_content(
@@ -120,3 +129,111 @@ def test_score_api_queries_str_documents_image_base64_content(
    assert score.id is not None
    assert score.data is not None
    assert len(score.data) == 1

    assert score.usage.prompt_tokens == 98
    assert score.data[0].score == pytest.approx(TEXT_VS_IMAGE, rel=TOL)


def test_score_api_queries_str_documents_image_url_plus_text_content(
    server: RemoteOpenAIServer,
):
    score_response = requests.post(
        server.url_for("score"),
        json={
            "model": MODEL_NAME,
            "queries": query,
            "documents": {"content": [documents[0], documents[1]]},
        },
    )
    score_response.raise_for_status()
    score = ScoreResponse.model_validate(score_response.json())

    assert score.id is not None
    assert score.data is not None
    assert len(score.data) == 1

    assert score.usage.prompt_tokens == 108
    assert score.data[0].score == pytest.approx(TEXT_VS_TEXT_PLUS_IMAGE, rel=TOL)


def test_score_api_queries_str_documents_list(server: RemoteOpenAIServer):
    score_response = requests.post(
        server.url_for("score"),
        json={
            "model": MODEL_NAME,
            "queries": query,
            "documents": [
                document,
                {"content": [documents[0]]},
                {"content": [documents[1]]},
                {"content": [documents[0], documents[1]]},
            ],
        },
    )
    score_response.raise_for_status()
    score = ScoreResponse.model_validate(score_response.json())

    assert score.id is not None
    assert score.data is not None
    assert len(score.data) == 4

    assert score.usage.prompt_tokens == 368
    assert score.data[0].score == pytest.approx(TEXT_VS_TEXT, rel=TOL)
    assert score.data[1].score == pytest.approx(TEXT_VS_TEXT, rel=TOL)
    assert score.data[2].score == pytest.approx(TEXT_VS_IMAGE, rel=TOL)
    assert score.data[3].score == pytest.approx(TEXT_VS_TEXT_PLUS_IMAGE, rel=TOL)


def test_rerank_api_queries_str_documents_list(server: RemoteOpenAIServer):
    rerank_response = requests.post(
        server.url_for("rerank"),
        json={
            "model": MODEL_NAME,
            "query": query,
            "documents": [
                document,
                {"content": [documents[0]]},
                {"content": [documents[1]]},
                {"content": [documents[0], documents[1]]},
            ],
        },
    )
    rerank_response.raise_for_status()
    rerank = RerankResponse.model_validate(rerank_response.json())

    assert rerank.id is not None
    assert rerank.model is not None
    assert rerank.usage is not None
    assert len(rerank.results) == 4

    rerank.results.sort(key=lambda x: x.index)
    assert rerank.results[0].relevance_score == pytest.approx(TEXT_VS_TEXT, rel=TOL)
    assert rerank.results[1].relevance_score == pytest.approx(TEXT_VS_TEXT, rel=TOL)
    assert rerank.results[2].relevance_score == pytest.approx(TEXT_VS_IMAGE, rel=TOL)
    assert rerank.results[3].relevance_score == pytest.approx(
        TEXT_VS_TEXT_PLUS_IMAGE, rel=TOL
    )


def test_score_api_queries_list_documents_list(server: RemoteOpenAIServer):
    score_response = requests.post(
        server.url_for("score"),
        json={
            "model": MODEL_NAME,
            "queries": [query] * 4,
            "documents": [
                document,
                {"content": [documents[0]]},
                {"content": [documents[1]]},
                {"content": [documents[0], documents[1]]},
            ],
        },
    )
    score_response.raise_for_status()
    score = ScoreResponse.model_validate(score_response.json())

    assert score.id is not None
    assert score.data is not None
    assert len(score.data) == 4

    assert score.usage.prompt_tokens == 368
    assert score.data[0].score == pytest.approx(TEXT_VS_TEXT, rel=TOL)
    assert score.data[1].score == pytest.approx(TEXT_VS_TEXT, rel=TOL)
    assert score.data[2].score == pytest.approx(TEXT_VS_IMAGE, rel=TOL)
    assert score.data[3].score == pytest.approx(TEXT_VS_TEXT_PLUS_IMAGE, rel=TOL)

View File

@@ -40,12 +40,12 @@ from vllm.entrypoints.chat_utils import (
    ChatTemplateContentFormatOption,
)
from vllm.entrypoints.pooling.score.utils import (
    ScoreContentPartParam,
    ScoreData,
    ScoreMultiModalParam,
    _cosine_similarity,
    _validate_score_input_lens,
    compress_token_type_ids,
    get_score_prompt,
    validate_score_input,
)
from vllm.entrypoints.utils import log_non_default_args
from vllm.inputs import (
@@ -1326,8 +1326,8 @@ class LLM:
    def _embedding_score(
        self,
        text_1: list[SingletonPrompt],
        text_2: list[SingletonPrompt],
        data_1: list[ScoreData],
        data_2: list[ScoreData],
        *,
        use_tqdm: bool | Callable[..., tqdm],
        pooling_params: PoolingParams | None,
@@ -1336,8 +1336,16 @@ class LLM:
    ) -> list[ScoringRequestOutput]:
        tokenizer = self.get_tokenizer()

        input_texts: list[str] = []
        for text in data_1 + data_2:
            if not isinstance(text, str):
                raise NotImplementedError(
                    "Embedding scores currently do not support multimodal input."
                )
            input_texts.append(text)

        encoded_output = self.encode(
            text_1 + text_2,
            input_texts,
            use_tqdm=use_tqdm,
            lora_request=lora_request,
            pooling_params=pooling_params,
@@ -1345,8 +1353,8 @@ class LLM:
            tokenization_kwargs=tokenization_kwargs,
        )

        encoded_output_1 = encoded_output[0 : len(text_1)]
        encoded_output_2 = encoded_output[len(text_1) :]
        encoded_output_1 = encoded_output[0 : len(data_1)]
        encoded_output_2 = encoded_output[len(data_1) :]

        if len(encoded_output_1) == 1:
            encoded_output_1 = encoded_output_1 * len(encoded_output_2)
@@ -1362,8 +1370,8 @@ class LLM:
    def _cross_encoding_score(
        self,
        data_1: list[str] | list[ScoreContentPartParam],
        data_2: list[str] | list[ScoreContentPartParam],
        data_1: list[ScoreData],
        data_2: list[ScoreData],
        *,
        use_tqdm: bool | Callable[..., tqdm],
        pooling_params: PoolingParams | None,
@@ -1424,8 +1432,14 @@ class LLM:
    def score(
        self,
        data_1: SingletonPrompt | Sequence[SingletonPrompt] | ScoreMultiModalParam,
        data_2: SingletonPrompt | Sequence[SingletonPrompt] | ScoreMultiModalParam,
        data_1: SingletonPrompt
        | Sequence[SingletonPrompt]
        | ScoreMultiModalParam
        | list[ScoreMultiModalParam],
        data_2: SingletonPrompt
        | Sequence[SingletonPrompt]
        | ScoreMultiModalParam
        | list[ScoreMultiModalParam],
        /,
        *,
        use_tqdm: bool | Callable[..., tqdm] = True,
@@ -1501,73 +1515,23 @@ class LLM:
"chat_template is only supported for cross-encoder models."
)
# the tokenizer for models such as
# "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing
# lists of tokens to the `text` and `text_pair` kwargs
tokenizer = self.get_tokenizer()
is_multimodal_model = model_config.is_multimodal_model
architecture = model_config.architecture
if not model_config.is_multimodal_model:
def check_data_type(
data: SingletonPrompt
| Sequence[SingletonPrompt]
| ScoreMultiModalParam,
):
if isinstance(data, dict) and "content" in data:
raise ValueError(
"ScoreMultiModalParam is not supported "
f"for {model_config.architecture}"
)
check_data_type(data_1)
check_data_type(data_2)
def ensure_str(prompt: SingletonPrompt):
if isinstance(prompt, dict):
if "multi_modal_data" in prompt:
raise ValueError(
"Multi-modal prompt is not supported for scoring"
)
elif "prompt_token_ids" in prompt:
prompt = tokenizer.decode(
cast(TokensPrompt, prompt)["prompt_token_ids"]
)
elif "prompt" in prompt:
prompt = cast(TextPrompt, prompt)["prompt"]
assert type(prompt) is str
return prompt
if isinstance(data_1, (str, dict)):
# Convert a single prompt to a list.
data_1 = [data_1] # type: ignore[list-item]
data_1 = [ensure_str(t) for t in data_1]
if isinstance(data_2, (str, dict)):
# Convert a single prompt to a list.
data_2 = [data_2] # type: ignore[list-item]
data_2 = [ensure_str(t) for t in data_2]
if isinstance(data_1, dict) and "content" in data_1:
data_1 = data_1.get("content") # type: ignore[assignment]
elif isinstance(data_1, str):
data_1 = [data_1]
if isinstance(data_2, dict) and "content" in data_2:
data_2 = data_2.get("content") # type: ignore[assignment]
elif isinstance(data_2, str):
data_2 = [data_2]
_validate_score_input_lens(data_1, data_2) # type: ignore[arg-type]
score_data_1, score_data_2 = validate_score_input(
data_1, # type: ignore[arg-type]
data_2, # type: ignore[arg-type]
is_multimodal_model=is_multimodal_model,
architecture=architecture,
)
tok_params = self._get_cmpl_tok_params(tokenization_kwargs)
encode_kwargs = tok_params.get_encode_kwargs()
if model_config.is_cross_encoder:
return self._cross_encoding_score(
data_1, # type: ignore[arg-type]
data_2, # type: ignore[arg-type]
score_data_1,
score_data_2,
use_tqdm=use_tqdm,
pooling_params=pooling_params,
lora_request=lora_request,
@@ -1576,8 +1540,8 @@ class LLM:
            )
        else:
            return self._embedding_score(
                data_1,  # type: ignore[arg-type]
                data_2,  # type: ignore[arg-type]
                score_data_1,
                score_data_2,
                use_tqdm=use_tqdm,
                pooling_params=pooling_params,
                lora_request=lora_request,

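Note: with this change `LLM.score()` accepts a `ScoreMultiModalParam` (or a list of them) in addition to plain prompts, and input normalization now goes through `validate_score_input`. A minimal offline sketch of the text-only path (the model name is illustrative and engine configuration is omitted):

# Minimal sketch of offline scoring with LLM.score() after this change;
# "BAAI/bge-reranker-v2-m3" is an illustrative cross-encoder, not part of this commit.
from vllm import LLM

llm = LLM(model="BAAI/bge-reranker-v2-m3")

# data_1 / data_2 may each be a string, a list of strings, a
# ScoreMultiModalParam ({"content": [...]}) or a list of such params;
# a single data_1 entry is broadcast across all data_2 entries.
outputs = llm.score(
    "What is the capital of France?",
    [
        "The capital of France is Paris.",
        "The capital of Brazil is Brasilia.",
    ],
)
for output in outputs:
    print(output.outputs.score)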
View File

@@ -14,7 +14,8 @@ from vllm.entrypoints.pooling.base.protocol import (
)
from vllm.entrypoints.pooling.score.utils import (
    ScoreContentPartParam,
    ScoreMultiModalParam,
    ScoreInput,
    ScoreInputs,
)
from vllm.renderers import TokenizeParams
from vllm.utils import random_uuid
@@ -47,13 +48,13 @@ class ScoreRequestMixin(PoolingBasicRequestMixin, ClassifyRequestMixin):
class ScoreDataRequest(ScoreRequestMixin):
    data_1: list[str] | str | ScoreMultiModalParam
    data_2: list[str] | str | ScoreMultiModalParam
    data_1: ScoreInputs
    data_2: ScoreInputs


class ScoreQueriesDocumentsRequest(ScoreRequestMixin):
    queries: list[str] | str | ScoreMultiModalParam
    documents: list[str] | str | ScoreMultiModalParam
    queries: ScoreInputs
    documents: ScoreInputs

    @property
    def data_1(self):
@@ -64,9 +65,22 @@ class ScoreQueriesDocumentsRequest(ScoreRequestMixin):
        return self.documents


class ScoreQueriesItemsRequest(ScoreRequestMixin):
    queries: ScoreInputs
    items: ScoreInputs

    @property
    def data_1(self):
        return self.queries

    @property
    def data_2(self):
        return self.items


class ScoreTextRequest(ScoreRequestMixin):
    text_1: list[str] | str | ScoreMultiModalParam
    text_2: list[str] | str | ScoreMultiModalParam
    text_1: ScoreInputs
    text_2: ScoreInputs

    @property
    def data_1(self):
@@ -78,13 +92,16 @@ class ScoreTextRequest(ScoreRequestMixin):
ScoreRequest: TypeAlias = (
    ScoreQueriesDocumentsRequest | ScoreDataRequest | ScoreTextRequest
    ScoreQueriesDocumentsRequest
    | ScoreQueriesItemsRequest
    | ScoreDataRequest
    | ScoreTextRequest
)


class RerankRequest(PoolingBasicRequestMixin, ClassifyRequestMixin):
    query: str | ScoreMultiModalParam
    documents: list[str] | ScoreMultiModalParam
    query: ScoreInput
    documents: ScoreInputs
    top_n: int = Field(default_factory=lambda: 0)

    # --8<-- [start:rerank-extra-params]
@@ -108,7 +125,7 @@ class RerankRequest(PoolingBasicRequestMixin, ClassifyRequestMixin):
class RerankDocument(BaseModel):
    text: str | None = None
    multi_modal: ScoreContentPartParam | None = None
    multi_modal: list[ScoreContentPartParam] | None = None


class RerankResult(BaseModel):

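Note: after this change the /score endpoint accepts four equivalent field spellings, all normalized to `data_1`/`data_2` by the request classes above. Illustrative payloads (the model name is a placeholder):

# Equivalent /score payloads under the consolidated request schema; every
# field pair normalizes to (data_1, data_2). "my-scorer" is a placeholder.
payloads = [
    {"model": "my-scorer", "data_1": "a query", "data_2": "a document"},
    {"model": "my-scorer", "queries": "a query", "documents": "a document"},
    {"model": "my-scorer", "queries": "a query", "items": "a document"},
    {"model": "my-scorer", "text_1": "a query", "text_2": "a document"},
]

# For multimodal models, either side may also be a ScoreMultiModalParam or a
# list mixing plain strings and multimodal params:
multimodal_document = {"content": [{"type": "text", "text": "a document"}]}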
View File

@@ -27,12 +27,12 @@ from vllm.entrypoints.pooling.score.protocol import (
    ScoreResponseData,
)
from vllm.entrypoints.pooling.score.utils import (
    ScoreContentPartParam,
    ScoreMultiModalParam,
    ScoreData,
    ScoreInputs,
    _cosine_similarity,
    _validate_score_input_lens,
    compress_token_type_ids,
    get_score_prompt,
    validate_score_input,
)
from vllm.inputs.data import TokensPrompt
from vllm.logger import init_logger
@@ -65,15 +65,32 @@ class ServingScores(OpenAIServing):
        self._tokenizer_executor = ThreadPoolExecutor(max_workers=1)

        self.is_cross_encoder = self.model_config.is_cross_encoder
        self.is_multimodal_model = self.model_config.is_multimodal_model
        self.architecture = self.model_config.architecture

        if self.is_cross_encoder:
            self._score_func = self._cross_encoding_score
        else:
            self._score_func = self._embedding_score

    async def _embedding_score(
        self,
        data_1: list[str],
        data_2: list[str],
        data_1: list[ScoreData],
        data_2: list[ScoreData],
        request: RerankRequest | ScoreRequest,
        request_id: str,
        lora_request: LoRARequest | None | None = None,
        trace_headers: Mapping[str, str] | None = None,
    ) -> list[PoolingRequestOutput] | ErrorResponse:
        input_texts: list[str] = []
        for text in data_1 + data_2:
            if not isinstance(text, str):
                raise NotImplementedError(
                    "Embedding scores currently do not support multimodal input."
                )
            input_texts.append(text)

        model_config = self.model_config
        tokenizer = self.renderer.get_tokenizer()
@@ -82,8 +99,6 @@ class ServingScores(OpenAIServing):
            executor=self._tokenizer_executor,
        )

        input_texts = data_1 + data_2
        tokenization_kwargs = request.build_tok_params(model_config).get_encode_kwargs()

        tokenized_prompts = await asyncio.gather(
            *(encode_async(t, **tokenization_kwargs) for t in input_texts)
@@ -157,60 +172,30 @@ class ServingScores(OpenAIServing):
        return final_res_batch

    def _preprocess_score(
        self,
        request: RerankRequest | ScoreRequest,
        tokenizer: TokenizerLike,
        tokenization_kwargs: dict[str, Any],
        data_1: str | ScoreContentPartParam,
        data_2: str | ScoreContentPartParam,
    ) -> tuple[str, TokensPrompt]:
        model_config = self.model_config
        full_prompt, engine_prompt = get_score_prompt(
            model_config=model_config,
            data_1=data_1,
            data_2=data_2,
            tokenizer=tokenizer,
            tokenization_kwargs=tokenization_kwargs,
            score_template=self.score_template,
        )

        self._validate_input(request, engine_prompt["prompt_token_ids"], full_prompt)
        if request.mm_processor_kwargs is not None:
            engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs

        return full_prompt, engine_prompt

    async def _cross_encoding_score(
        self,
        data_1: list[str] | list[ScoreContentPartParam],
        data_2: list[str] | list[ScoreContentPartParam],
        data_1: list[ScoreData],
        data_2: list[ScoreData],
        request: RerankRequest | ScoreRequest,
        request_id: str,
        lora_request: LoRARequest | None | None = None,
        trace_headers: Mapping[str, str] | None = None,
    ) -> list[PoolingRequestOutput] | ErrorResponse:
        model_config = self.model_config
        tokenizer = self.renderer.get_tokenizer()

        if isinstance(tokenizer, MistralTokenizer):
            raise ValueError("MistralTokenizer not supported for cross-encoding")

        request_prompts: list[str] = []
        engine_prompts: list[TokensPrompt] = []

        model_config = self.model_config

        if len(data_1) == 1:
            data_1 = data_1 * len(data_2)

        if isinstance(tokenizer, MistralTokenizer):
            raise ValueError("MistralTokenizer not supported for cross-encoding")

        tok_kwargs = request.build_tok_params(model_config).get_encode_kwargs()

        input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]

        preprocess_async = make_async(
            self._preprocess_score,
            executor=self._tokenizer_executor,
        )

        preprocessed_prompts = await asyncio.gather(
            *(
                preprocess_async(
@@ -224,6 +209,8 @@ class ServingScores(OpenAIServing):
            )
        )

        request_prompts: list[str] = []
        engine_prompts: list[TokensPrompt] = []

        for full_prompt, engine_prompt in preprocessed_prompts:
            request_prompts.append(full_prompt)
            engine_prompts.append(engine_prompt)
@@ -278,10 +265,33 @@ class ServingScores(OpenAIServing):
        return [out for out in final_res_batch if out is not None]

    def _preprocess_score(
        self,
        request: RerankRequest | ScoreRequest,
        tokenizer: TokenizerLike,
        tokenization_kwargs: dict[str, Any],
        data_1: ScoreData,
        data_2: ScoreData,
    ) -> tuple[str, TokensPrompt]:
        model_config = self.model_config
        full_prompt, engine_prompt = get_score_prompt(
            model_config=model_config,
            data_1=data_1,
            data_2=data_2,
            tokenizer=tokenizer,
            tokenization_kwargs=tokenization_kwargs,
            score_template=self.score_template,
        )

        self._validate_input(request, engine_prompt["prompt_token_ids"], full_prompt)
        if request.mm_processor_kwargs is not None:
            engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs

        return full_prompt, engine_prompt

    async def _run_scoring(
        self,
        data_1: list[str] | str | ScoreMultiModalParam,
        data_2: list[str] | str | ScoreMultiModalParam,
        data_1: ScoreInputs,
        data_2: ScoreInputs,
        request: ScoreRequest | RerankRequest,
        request_id: str,
        raw_request: Request | None = None,
@@ -294,44 +304,21 @@ class ServingScores(OpenAIServing):
            else await self._get_trace_headers(raw_request.headers)
        )

        if not self.model_config.is_multimodal_model and (
            isinstance(data_1, dict) or isinstance(data_2, dict)
        ):
            raise ValueError(
                f"MultiModalParam is not supported for {self.model_config.architecture}"  # noqa: E501
            )
        score_data_1, score_data_2 = validate_score_input(
            data_1,
            data_2,
            is_multimodal_model=self.is_multimodal_model,
            architecture=self.architecture,
        )

        if isinstance(data_1, str):
            data_1 = [data_1]
        elif isinstance(data_1, dict):
            data_1 = data_1.get("content")  # type: ignore[assignment]

        if isinstance(data_2, str):
            data_2 = [data_2]
        elif isinstance(data_2, dict):
            data_2 = data_2.get("content")  # type: ignore[assignment]

        _validate_score_input_lens(data_1, data_2)  # type: ignore[arg-type]

        if self.model_config.is_cross_encoder:
            return await self._cross_encoding_score(
                data_1=data_1,  # type: ignore[arg-type]
                data_2=data_2,  # type: ignore[arg-type]
                request=request,
                request_id=request_id,
                lora_request=lora_request,
                trace_headers=trace_headers,
            )
        else:
            return await self._embedding_score(
                data_1=data_1,  # type: ignore[arg-type]
                data_2=data_2,  # type: ignore[arg-type]
                request=request,
                request_id=request_id,
                lora_request=lora_request,
                trace_headers=trace_headers,
            )
        return await self._score_func(
            data_1=score_data_1,
            data_2=score_data_2,
            request=request,
            request_id=request_id,
            lora_request=lora_request,
            trace_headers=trace_headers,
        )

    async def create_score(
        self,
@@ -391,15 +378,6 @@ class ServingScores(OpenAIServing):
request_id = f"rerank-{self._base_request_id(raw_request)}"
documents = request.documents
top_n = (
request.top_n
if request.top_n > 0
else (
len(documents)
if isinstance(documents, list)
else len(documents["content"])
)
)
try:
final_res_batch = await self._run_scoring(
@@ -412,6 +390,8 @@ class ServingScores(OpenAIServing):
        if isinstance(final_res_batch, ErrorResponse):
            return final_res_batch

        top_n = request.top_n if request.top_n > 0 else len(final_res_batch)

        return self.request_output_to_rerank_response(
            final_res_batch,
            request_id,
@@ -465,22 +445,32 @@ class ServingScores(OpenAIServing):
        final_res_batch: list[PoolingRequestOutput],
        request_id: str,
        model_name: str,
        documents: list[str] | ScoreMultiModalParam,
        documents: ScoreInputs,
        top_n: int,
    ) -> RerankResponse:
        """
        Convert the output of do_rank to a RerankResponse
        """
        if not isinstance(documents, list):
            documents = [documents]

        results: list[RerankResult] = []
        num_prompt_tokens = 0
        for idx, final_res in enumerate(final_res_batch):
            classify_res = ScoringRequestOutput.from_base(final_res)

            document = documents[idx]
            if isinstance(document, str):
                rerank_document = RerankDocument(text=document)
            else:
                rerank_document = RerankDocument(
                    multi_modal=document.get("content", [])
                )

            result = RerankResult(
                index=idx,
                document=RerankDocument(text=documents[idx])
                if isinstance(documents, list)
                else RerankDocument(multi_modal=documents["content"][idx]),
                document=rerank_document,
                relevance_score=classify_res.outputs.score,
            )
            results.append(result)

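Note: `ServingScores` now picks the scoring callable once in `__init__`, and `_run_scoring` only validates inputs and delegates. A stripped-down sketch of that dispatch pattern (the class and method bodies below are illustrative, not the real implementation):

# Illustrative sketch: choose the scoring callable at construction instead of
# branching on is_cross_encoder inside every request.
class MiniServingScores:
    def __init__(self, is_cross_encoder: bool) -> None:
        self._score_func = (
            self._cross_encoding_score if is_cross_encoder else self._embedding_score
        )

    async def _cross_encoding_score(self, data_1, data_2):
        return ["cross-encoder scoring of normalized inputs"]

    async def _embedding_score(self, data_1, data_2):
        return ["embedding scoring of normalized inputs"]

    async def run_scoring(self, data_1, data_2):
        # validate_score_input-style normalization would run here first.
        return await self._score_func(data_1, data_2)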
View File

@@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from typing import Any, TypeAlias, cast
from torch.nn import CosineSimilarity
@@ -10,12 +11,13 @@ from vllm.entrypoints.chat_utils import (
    BaseMultiModalItemTracker,
    ChatCompletionContentPartImageEmbedsParam,
    ChatCompletionContentPartImageParam,
    ChatCompletionContentPartParam,
    ChatCompletionContentPartTextParam,
    ChatCompletionContentPartVideoParam,
    ChatTemplateResolutionError,
    ConversationMessage,
    MultiModalItemTracker,
    _ContentPart,
    _parse_chat_message_content_part,
    _parse_chat_message_content_parts,
)
from vllm.inputs import TokensPrompt
from vllm.model_executor.models.interfaces import supports_score_template
@@ -46,6 +48,13 @@ class ScoreMultiModalParam(TypedDict, total=False):
"""The multimodal contents"""
# Raw input data with content key in ScoreMultiModalParam.
ScoreInput = str | ScoreMultiModalParam
ScoreInputs = ScoreInput | list[ScoreInput]
# Score data without content key.
ScoreData = str | list[ScoreContentPartParam]
def _cosine_similarity(
tokenizer: TokenizerLike,
embed_1: list[PoolingRequestOutput],
@@ -77,8 +86,8 @@ def _cosine_similarity(
def _validate_score_input_lens(
    data_1: list[str] | list[ScoreContentPartParam],
    data_2: list[str] | list[ScoreContentPartParam],
    data_1: list[ScoreData],
    data_2: list[ScoreData],
):
    len_1 = len(data_1)
    len_2 = len(data_2)
@@ -91,19 +100,56 @@ def _validate_score_input_lens(
raise ValueError("At least one text_pair element must be given")
def _validate_mm_score_input(
data: list[ScoreInput],
is_multimodal_model: bool,
architecture: str,
) -> list[ScoreData]:
out: list[ScoreData] = []
for d in data:
if isinstance(d, str):
out.append(d)
else:
if not is_multimodal_model:
raise ValueError(f"MultiModalParam is not supported for {architecture}")
content = cast(list[ScoreContentPartParam], d.get("content", []))
out.append(content)
return out
def validate_score_input(
data_1: ScoreInputs,
data_2: ScoreInputs,
is_multimodal_model: bool,
architecture: str,
) -> tuple[list[ScoreData], list[ScoreData]]:
if not isinstance(data_1, list):
data_1 = [data_1]
if not isinstance(data_2, list):
data_2 = [data_2]
score_input_1 = _validate_mm_score_input(data_1, is_multimodal_model, architecture)
score_input_2 = _validate_mm_score_input(data_2, is_multimodal_model, architecture)
_validate_score_input_lens(score_input_1, score_input_2)
return score_input_1, score_input_2
def parse_score_data(
data_1: str | ScoreContentPartParam,
data_2: str | ScoreContentPartParam,
data_1: ScoreData,
data_2: ScoreData,
model_config: ModelConfig,
) -> tuple[str, str, MultiModalDataDict | None, MultiModalUUIDDict | None]:
mm_tracker = MultiModalItemTracker(model_config)
content_1 = _parse_score_content(data_1, mm_tracker)
content_2 = _parse_score_content(data_2, mm_tracker)
content_1 = _parse_score_content("query", data_1, mm_tracker)
content_2 = _parse_score_content("document", data_2, mm_tracker)
def ensure_str(content: _ContentPart | None) -> str:
if content is not None and isinstance(content, str):
return cast(str, content)
def ensure_str(content: list[ConversationMessage]) -> str:
assert len(content) == 1
prompt = content[0]["content"]
if prompt is not None and isinstance(prompt, str):
return cast(str, prompt)
else:
raise ValueError(f"Only string content is supported, but got {content}.")
@@ -115,19 +161,22 @@ def parse_score_data(
def _parse_score_content(
    data: str | ScoreContentPartParam,
    role: str,
    data: ScoreData,
    mm_tracker: BaseMultiModalItemTracker,
) -> _ContentPart | None:
) -> list[ConversationMessage]:
    parts: Iterable[ChatCompletionContentPartParam]

    if isinstance(data, str):
        part = ChatCompletionContentPartTextParam(type="text", text=data)
        parts = [ChatCompletionContentPartTextParam(type="text", text=data)]
    else:
        part = data
        parts = cast(Iterable[ChatCompletionContentPartParam], data)

    mm_parser = mm_tracker.create_parser()

    parse_res = _parse_chat_message_content_part(
        part,
        mm_parser,
    parse_res = _parse_chat_message_content_parts(
        role=role,
        parts=parts,
        mm_tracker=mm_tracker,
        wrap_dicts=False,
        interleave_strings=False,
    )
@@ -184,8 +233,8 @@ def get_score_prompt(
    model_config: ModelConfig,
    tokenizer: TokenizerLike,
    tokenization_kwargs: dict[str, Any],
    data_1: str | ScoreContentPartParam,
    data_2: str | ScoreContentPartParam,
    data_1: ScoreData,
    data_2: ScoreData,
    score_template: str | None = None,
) -> tuple[str, TokensPrompt]:
    prompt_1, prompt_2, mm_data, mm_uuids = parse_score_data(
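
Note: `validate_score_input` normalizes `ScoreInputs` into `list[ScoreData]`, unwrapping the `content` key of multimodal params and rejecting them for text-only models. A hypothetical call based on the signature above (the architecture string is illustrative):

# Hypothetical usage of the new helper; the architecture string is illustrative
# and only feeds the error message raised for text-only models.
from vllm.entrypoints.pooling.score.utils import validate_score_input

data_1, data_2 = validate_score_input(
    "A cat standing in the snow.",
    [
        "a plain text document",
        {"content": [{"type": "text", "text": "a wrapped text part"}]},
    ],
    is_multimodal_model=True,
    architecture="SomeMultiModalForScoring",
)
# data_1 == ["A cat standing in the snow."]
# data_2 == ["a plain text document",
#            [{"type": "text", "text": "a wrapped text part"}]]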