Generative Scoring (#34539)

Signed-off-by: Vedant Jhaveri <vjhaveri@linkedin.com>
Co-authored-by: Vedant Jhaveri <vjhaveri@linkedin.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Authored by Vedant V Jhaveri on 2026-03-31 16:02:11 -07:00; committed by GitHub
parent 36f1dc19ae
commit 2e56975657
13 changed files with 1265 additions and 3 deletions
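
For orientation, a minimal client-side sketch of the new Generative Scoring API. The route and payload shape are inferred from the tests in this diff, so the exact path, field names, and token ids should be read as assumptions rather than documented behavior:

# Illustrative sketch (not part of the commit): scoring two candidate items
# against a yes/no question. Route and payload shape inferred from the tests below.
import requests

resp = requests.post(
    "http://localhost:8000/generative_scoring",   # assumed route
    json={
        "model": "Qwen/Qwen3-0.6B",
        "query": "Is Paris the capital of France? Answer Yes or No: ",
        "items": ["Paris is beautiful.", "London is rainy."],
        "label_token_ids": [9454, 2753],           # presumably the "Yes"/"No" token ids
    },
    timeout=30,
)
resp.raise_for_status()
for item in resp.json()["data"]:
    print(item["index"], item["score"])            # each score lies in [0.0, 1.0]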


@@ -0,0 +1,2 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project


@@ -0,0 +1,325 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the Generative Scoring API.
Tests cover:
1. Protocol models (request/response construction)
2. Probability computation (softmax normalization)
3. Input validation
4. Score formula: P(token[0]) / (P(token[0]) + P(token[1]))
5. Prompt building and item ordering
"""
import math
from dataclasses import dataclass, field
from typing import Any
from unittest.mock import MagicMock
import pytest
from vllm.config.multimodal import MultiModalConfig
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.generative_scoring.serving import (
GenerativeScoringItemResult,
GenerativeScoringRequest,
GenerativeScoringResponse,
OpenAIServingGenerativeScoring,
)
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.logprobs import Logprob
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.tokenizers import get_tokenizer
from vllm.v1.engine.async_llm import AsyncLLM
MODEL_NAME = "Qwen/Qwen3-0.6B"
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
@dataclass
class MockHFConfig:
model_type: str = "any"
@dataclass
class MockModelConfig:
task = "generate"
runner_type = "generate"
tokenizer = MODEL_NAME
trust_remote_code = False
tokenizer_mode = "auto"
max_model_len = 100
tokenizer_revision = None
multimodal_config = MultiModalConfig()
hf_config = MockHFConfig()
logits_processor_pattern = None
logits_processors: list[str] | None = None
diff_sampling_param: dict | None = None
allowed_local_media_path: str = ""
allowed_media_domains: list[str] | None = None
encoder_config = None
generation_config: str = "auto"
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
skip_tokenizer_init = False
vocab_size = 151936
def get_diff_sampling_param(self):
return self.diff_sampling_param or {}
def get_vocab_size(self):
return self.vocab_size
def _create_mock_engine():
"""Create a mock AsyncLLM engine."""
mock_engine = MagicMock(spec=AsyncLLM)
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
# renderer is accessed by OpenAIServing.__init__ and serving.py
mock_renderer = MagicMock()
mock_renderer.tokenizer = get_tokenizer(MODEL_NAME)
mock_engine.renderer = mock_renderer
return mock_engine
def _create_serving(mock_engine) -> OpenAIServingGenerativeScoring:
"""Create an OpenAIServingGenerativeScoring instance with mocks."""
models = OpenAIServingModels(
engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS,
)
return OpenAIServingGenerativeScoring(mock_engine, models, request_logger=None)
def _create_mock_request_output(logprobs_dict: dict[int, float]) -> RequestOutput:
"""Create a mock RequestOutput with specified logprobs."""
logprobs_with_objs = {
tid: Logprob(logprob=lp, rank=i + 1)
for i, (tid, lp) in enumerate(logprobs_dict.items())
}
completion_output = CompletionOutput(
index=0,
text="",
token_ids=[100],
cumulative_logprob=-1.0,
logprobs=[logprobs_with_objs],
finish_reason="length",
)
return RequestOutput(
request_id="test-request",
prompt="test prompt",
prompt_token_ids=[1, 2, 3],
prompt_logprobs=None,
outputs=[completion_output],
finished=True,
)
class TestProtocolModels:
"""Tests for GenerativeScoringRequest and GenerativeScoringResponse."""
def test_request_and_response_all_fields(self):
"""Test request construction with all field types and response structure."""
# Test request with string inputs
req_str = GenerativeScoringRequest(
query="Is this the capital?",
items=["Paris", "London"],
label_token_ids=[9454, 2753],
)
assert req_str.query == "Is this the capital?"
assert req_str.items == ["Paris", "London"]
assert req_str.label_token_ids == [9454, 2753]
assert req_str.apply_softmax is True # default
assert req_str.item_first is False # default
assert req_str.add_special_tokens is True # default
# Test request with pre-tokenized inputs and custom options
req_tok = GenerativeScoringRequest(
query=[100, 200, 300],
items=[[400, 500], [600, 700]],
label_token_ids=[1234, 5678],
apply_softmax=False,
item_first=True,
add_special_tokens=False,
)
assert req_tok.query == [100, 200, 300]
assert req_tok.items == [[400, 500], [600, 700]]
assert req_tok.apply_softmax is False
assert req_tok.item_first is True
assert req_tok.add_special_tokens is False
# Test response structure
response = GenerativeScoringResponse(
model="test-model",
data=[
GenerativeScoringItemResult(index=0, score=0.7),
GenerativeScoringItemResult(index=1, score=0.4),
],
usage={"prompt_tokens": 10, "total_tokens": 12, "completion_tokens": 2},
)
assert response.object == "list"
assert response.model == "test-model"
assert len(response.data) == 2
assert response.data[0].score == 0.7
assert response.data[0].object == "score"
assert response.data[1].score == 0.4
assert response.usage.prompt_tokens == 10
class TestProbabilityComputation:
"""Tests for _compute_probabilities with both softmax modes."""
@pytest.mark.parametrize(
"label_logprobs,apply_softmax,should_sum_to_one",
[
({100: -1.0, 200: -2.0}, True, True),
({100: -100.0, 200: -100.5}, True, True), # numerical stability
({100: -1.0, 200: -2.0}, False, False),
],
ids=["softmax_basic", "softmax_extreme_values", "true_probs"],
)
def test_compute_probabilities(
self, label_logprobs, apply_softmax, should_sum_to_one
):
"""Test probability computation for softmax and true probability modes."""
serving = OpenAIServingGenerativeScoring.__new__(OpenAIServingGenerativeScoring)
probs = serving._compute_probabilities(
label_logprobs, apply_softmax=apply_softmax
)
# Verify sum behavior
total = sum(probs.values())
if should_sum_to_one:
assert abs(total - 1.0) < 1e-6
else:
assert total < 1.0
# Verify math
if apply_softmax:
max_lp = max(label_logprobs.values())
exp_vals = {k: math.exp(v - max_lp) for k, v in label_logprobs.items()}
sum_exp = sum(exp_vals.values())
for tid, lp in label_logprobs.items():
assert abs(probs[tid] - exp_vals[tid] / sum_exp) < 1e-9
else:
for tid, lp in label_logprobs.items():
assert abs(probs[tid] - math.exp(lp)) < 1e-9
def test_score_formula(self):
"""Test the score formula: P(token[0]) / (P(token[0]) + P(token[1]))."""
serving = OpenAIServingGenerativeScoring.__new__(OpenAIServingGenerativeScoring)
# With logprobs -0.5 and -2.0, softmax gives higher prob to first token
logprobs = {9454: -0.5, 2753: -2.0}
probs = serving._compute_probabilities(logprobs, apply_softmax=True)
# Score = P(9454) / (P(9454) + P(2753)) = P(9454) since they sum to 1
score = probs[9454]
# Manual calculation
exp_0 = math.exp(-0.5)
exp_1 = math.exp(-2.0)
expected_score = exp_0 / (exp_0 + exp_1)
assert abs(score - expected_score) < 1e-9
assert score > 0.5 # First token has higher logprob, so higher probability
class TestValidation:
"""Tests for input validation errors."""
@pytest.mark.asyncio
@pytest.mark.parametrize(
"request_kwargs,expected_error",
[
(
{"query": "q", "items": ["i"], "label_token_ids": [999999, 999998]},
"out of vocabulary",
),
(
{"query": "q", "items": [], "label_token_ids": [100, 200]},
"at least one item",
),
],
ids=["invalid_token_id", "empty_items"],
)
async def test_validation_errors(self, request_kwargs, expected_error):
"""Test that invalid inputs return appropriate errors."""
mock_engine = _create_mock_engine()
serving = _create_serving(mock_engine)
request = GenerativeScoringRequest(model=MODEL_NAME, **request_kwargs)
result = await serving.create_generative_scoring(request, None)
assert isinstance(result, ErrorResponse)
assert expected_error in result.error.message.lower()
class TestPromptBuilding:
"""Tests for prompt construction and item ordering."""
@pytest.mark.asyncio
@pytest.mark.parametrize(
"item_first,expected",
[
(False, [[100, 101, 200, 201], [100, 101, 300, 301]]), # query + item
(True, [[200, 201, 100, 101], [300, 301, 100, 101]]), # item + query
],
ids=["query_first", "item_first"],
)
async def test_item_ordering(self, item_first, expected):
"""Test that item_first flag controls prompt concatenation order."""
mock_engine = _create_mock_engine()
serving = _create_serving(mock_engine)
request = GenerativeScoringRequest(
query=[100, 101],
items=[[200, 201], [300, 301]],
label_token_ids=[500, 501],
item_first=item_first,
)
engine_inputs, _ = await serving._build_prompts(
request, MagicMock(), max_model_len=4096
)
for i, exp in enumerate(expected):
assert engine_inputs[i]["prompt_token_ids"] == exp
class TestGeneration:
"""Tests for the full generation flow with mocked engine."""
@pytest.mark.asyncio
async def test_successful_generation(self):
"""Test successful score generation returns valid response."""
mock_engine = _create_mock_engine()
serving = _create_serving(mock_engine)
mock_logprobs = {1234: -0.5, 5678: -2.0, 100: -3.0}
mock_output = _create_mock_request_output(mock_logprobs)
async def mock_generate(*args, **kwargs):
yield mock_output
mock_engine.generate = mock_generate
request = GenerativeScoringRequest(
model=MODEL_NAME,
query="Is Paris the capital?",
items=["Yes", "No"],
label_token_ids=[1234, 5678],
)
result = await serving.create_generative_scoring(request, None)
assert isinstance(result, GenerativeScoringResponse)
assert len(result.data) == 2
for item_result in result.data:
assert 0.0 <= item_result.score <= 1.0
if __name__ == "__main__":
pytest.main([__file__, "-v"])
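
For reference, the score formula exercised by TestProbabilityComputation and test_score_formula above can be worked out by hand. A minimal standalone sketch (not part of the diff), using the same example logprobs as the test:

# Illustrative sketch only: the score formula the unit tests above verify,
# worked out by hand for two label-token logprobs.
import math

logprobs = {9454: -0.5, 2753: -2.0}          # log P of the two label tokens
max_lp = max(logprobs.values())              # subtract the max for numerical stability
exp_vals = {t: math.exp(lp - max_lp) for t, lp in logprobs.items()}
total = sum(exp_vals.values())
probs = {t: v / total for t, v in exp_vals.items()}   # softmax over the two labels
score = probs[9454]                          # == P(token[0]) / (P(token[0]) + P(token[1]))
print(round(score, 4))                       # ~0.8176: the first label is the more likely one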


@@ -0,0 +1,157 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""End-to-end tests for the Generative Scoring API.
Tests verify the full HTTP request/response flow using RemoteOpenAIServer.
"""
import pytest
import requests
from ....utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-0.6B"
@pytest.fixture(scope="module")
def server():
args = [
"--dtype",
"bfloat16",
"--max-model-len",
"512",
"--enforce-eager",
"--max-num-seqs",
"32",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
class TestGenerativeScoringAPI:
"""End-to-end tests for the Generative Scoring API."""
@pytest.mark.asyncio
async def test_basic_score_and_response_structure(self, server: RemoteOpenAIServer):
"""Test basic generative scoring request and verify response structure."""
response = requests.post(
server.url_for("generative_scoring"),
json={
"model": MODEL_NAME,
"query": "Is Paris the capital of France? Answer Yes or No: ",
"items": ["Paris is beautiful.", "London is rainy."],
"label_token_ids": [9454, 2753],
},
)
assert response.status_code == 200, f"Response: {response.text}"
data = response.json()
# Verify response structure
assert data["id"].startswith("generative-scoring-")
assert data["object"] == "list"
assert "model" in data
assert "usage" in data
assert len(data["data"]) == 2
# Verify each result
for i, result in enumerate(data["data"]):
assert result["index"] == i
assert result["object"] == "score"
assert 0.0 <= result["score"] <= 1.0
# Verify usage tracking
usage = data["usage"]
assert usage["prompt_tokens"] > 0
assert usage["completion_tokens"] > 0
assert (
usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"]
)
@pytest.mark.asyncio
async def test_multiple_items(self, server: RemoteOpenAIServer):
"""Test generative scoring request with multiple items."""
response = requests.post(
server.url_for("generative_scoring"),
json={
"model": MODEL_NAME,
"query": "Is this city a capital? ",
"items": ["Paris", "London", "Berlin", "New York", "Tokyo"],
"label_token_ids": [9454, 2753],
},
)
assert response.status_code == 200
data = response.json()
assert len(data["data"]) == 5
@pytest.mark.asyncio
async def test_validation_missing_label_token_ids(self, server: RemoteOpenAIServer):
"""Test that missing label_token_ids returns a validation error."""
response = requests.post(
server.url_for("generative_scoring"),
json={
"model": MODEL_NAME,
"query": "Test query",
"items": ["item1", "item2"],
},
)
# Missing required field returns 400 (manual JSON parsing)
assert response.status_code == 400
@pytest.mark.asyncio
async def test_validation_empty_items(self, server: RemoteOpenAIServer):
"""Test that empty items returns an error."""
response = requests.post(
server.url_for("generative_scoring"),
json={
"model": MODEL_NAME,
"query": "Test query",
"items": [],
"label_token_ids": [100, 200],
},
)
assert response.status_code == 400
@pytest.mark.asyncio
@pytest.mark.parametrize(
"label_token_ids,expected_status",
[
([9999999999, 9999999998], 400), # Out of vocab range
],
ids=["invalid_token_ids"],
)
async def test_validation_errors(
self, server: RemoteOpenAIServer, label_token_ids, expected_status
):
"""Test validation errors for various invalid inputs."""
response = requests.post(
server.url_for("generative_scoring"),
json={
"model": MODEL_NAME,
"query": "Test query",
"items": ["item1"],
"label_token_ids": label_token_ids,
},
)
assert response.status_code == expected_status
@pytest.mark.asyncio
async def test_score_consistency(self, server: RemoteOpenAIServer):
"""Test that scores are deterministic across identical requests."""
request_body = {
"model": MODEL_NAME,
"query": "Is this consistent? ",
"items": ["Yes it is."],
"label_token_ids": [100, 200],
}
r1 = requests.post(server.url_for("generative_scoring"), json=request_body)
r2 = requests.post(server.url_for("generative_scoring"), json=request_body)
assert r1.status_code == 200 and r2.status_code == 200
r1_score = r1.json()["data"][0]["score"]
r2_score = r2.json()["data"][0]["score"]
assert abs(r1_score - r2_score) < 1e-6
if __name__ == "__main__":
pytest.main([__file__, "-v"])
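
The E2E tests above hard-code label_token_ids for the Qwen tokenizer. As a usage note, a minimal sketch of how a client might derive those ids instead of hard-coding them (not part of this commit; assumes each label tokenizes to a single token, and that 9454/2753 correspond to "Yes"/"No" for this tokenizer):

# Illustrative sketch: one plausible way to obtain label_token_ids for a model.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
yes_ids = tok.encode("Yes", add_special_tokens=False)
no_ids = tok.encode("No", add_special_tokens=False)
assert len(yes_ids) == 1 and len(no_ids) == 1, "labels must be single tokens"
label_token_ids = [yes_ids[0], no_ids[0]]
print(label_token_ids)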