[V0 Deprecation] Remove pooling model support in V0 (#23434)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Author: Maximilien de Bayser
Date: 2025-08-29 04:04:02 -03:00
Committed by: GitHub
Parent: 934bebf192
Commit: 2554b27baa
38 changed files with 99 additions and 808 deletions

vllm/engine/async_llm_engine.py

@@ -72,8 +72,8 @@ STOP_ITERATION = Exception()  # Sentinel
 class AsyncStream:
-    """A stream of RequestOutputs or PoolingRequestOutputs for a request
-    that can be iterated over asynchronously via an async generator."""
+    """A stream of RequestOutputs for a request that can be iterated over
+    asynchronously via an async generator."""
 
     def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None:
         self.request_id = request_id
@@ -81,8 +81,7 @@ class AsyncStream:
         self._queue: asyncio.Queue = asyncio.Queue()
         self._finished = False
 
-    def put(self, item: Union[RequestOutput, PoolingRequestOutput,
-                              Exception]) -> None:
+    def put(self, item: Union[RequestOutput, Exception]) -> None:
         if not self._finished:
             self._queue.put_nowait(item)
@@ -99,9 +98,7 @@ class AsyncStream:
     def finished(self) -> bool:
         return self._finished
 
-    async def generator(
-        self
-    ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]:
+    async def generator(self) -> AsyncGenerator[RequestOutput, None]:
         try:
             while True:
                 result = await self._queue.get()
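With pooling outputs gone, `AsyncStream` carries only `RequestOutput` items (or an exception to re-raise). A minimal sketch of the narrowed contract, assuming the module-level `STOP_ITERATION` sentinel shown in the first hunk ends iteration, and using a no-op stand-in for the tracker's cancel hook:

```python
import asyncio

from vllm.engine.async_llm_engine import STOP_ITERATION, AsyncStream


async def main() -> None:
    # The cancel callback is invoked with the request id if the consumer
    # stops iterating early; a no-op stands in for the tracker's abort hook.
    stream = AsyncStream(request_id="req-0", cancel=lambda rid: None)

    # Producers call put() with RequestOutputs; pushing the STOP_ITERATION
    # sentinel (an Exception instance) is assumed to end the stream cleanly.
    stream.put(STOP_ITERATION)

    async for request_output in stream.generator():
        # With this commit, items are plain RequestOutputs, never
        # PoolingRequestOutputs.
        print(request_output)


asyncio.run(main())
```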
@@ -151,8 +148,7 @@ class RequestTracker:
             self.abort_request(rid, exception=exc)
 
     def process_request_output(self,
-                               request_output: Union[RequestOutput,
-                                                     PoolingRequestOutput],
+                               request_output: RequestOutput,
                                *,
                                verbose: bool = False) -> None:
         """Process a request output from the engine."""
@@ -261,9 +257,7 @@ class _AsyncLLMEngine(LLMEngine):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
-    async def step_async(
-        self, virtual_engine: int
-    ) -> List[Union[RequestOutput, PoolingRequestOutput]]:
+    async def step_async(self, virtual_engine: int) -> List[RequestOutput]:
         """Performs one decoding iteration and returns newly generated results.
 
         The workers are ran asynchronously if possible.
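The return type of `step_async` narrows accordingly. A hedged sketch of driving one iteration directly (in practice the background loop does this); `drive_once` is illustrative and not part of the API:

```python
from typing import List

from vllm.engine.async_llm_engine import _AsyncLLMEngine
from vllm.outputs import RequestOutput


async def drive_once(engine: _AsyncLLMEngine) -> List[RequestOutput]:
    # One scheduling + execution pass. virtual_engine selects the
    # pipeline-parallel context; 0 is correct without pipeline parallelism.
    outputs = await engine.step_async(0)
    # Every element is now a RequestOutput; PoolingRequestOutput is gone.
    return [out for out in outputs if out.finished]
```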
@@ -405,7 +399,7 @@ class _AsyncLLMEngine(LLMEngine):
         self,
         request_id: str,
         prompt: PromptType,
-        params: Union[SamplingParams, PoolingParams],
+        params: SamplingParams,
         arrival_time: Optional[float] = None,
         lora_request: Optional[LoRARequest] = None,
         trace_headers: Optional[Mapping[str, str]] = None,
@@ -779,14 +773,14 @@ class AsyncLLMEngine(EngineClient):
         self,
         request_id: str,
         prompt: PromptType,
-        params: Union[SamplingParams, PoolingParams],
+        params: SamplingParams,
         arrival_time: Optional[float] = None,
         lora_request: Optional[LoRARequest] = None,
         trace_headers: Optional[Mapping[str, str]] = None,
         priority: int = 0,
         data_parallel_rank: Optional[int] = None,
         tokenization_kwargs: Optional[dict[str, Any]] = None,
-    ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]:
+    ) -> AsyncGenerator[RequestOutput, None]:
         if not self.is_running:
             if self.start_engine_loop:
                 self.start_background_loop()
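At the public surface, `add_request` and the `generate` wrapper around it now yield `RequestOutput` only. A minimal usage sketch, assuming the stock `generate` API; the model name is just an example:

```python
import asyncio

from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine


async def run() -> None:
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="facebook/opt-125m"))

    # params is SamplingParams only now; PoolingParams is no longer accepted.
    async for output in engine.generate("Hello, my name is",
                                        SamplingParams(max_tokens=16),
                                        request_id="req-0"):
        if output.finished:
            print(output.outputs[0].text)


asyncio.run(run())
```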
@@ -908,7 +902,7 @@ class AsyncLLMEngine(EngineClient):
             await self.abort(request_id)
             raise
 
-    async def encode(
+    def encode(
         self,
         prompt: PromptType,
         pooling_params: PoolingParams,
@@ -918,85 +912,8 @@ class AsyncLLMEngine(EngineClient):
         priority: int = 0,
         tokenization_kwargs: Optional[dict[str, Any]] = None,
     ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        """Generate outputs for a request from a pooling model.
-
-        Generate outputs for a request. This method is a coroutine. It adds the
-        request into the waiting queue of the LLMEngine and streams the outputs
-        from the LLMEngine to the caller.
-
-        Args:
-            prompt: The prompt to the LLM. See
-                [`PromptType`][vllm.inputs.PromptType] for more details about
-                the format of each input.
-            pooling_params: The pooling parameters of the request.
-            request_id: The unique id of the request.
-            lora_request: LoRA request to use for generation, if any.
-            trace_headers: OpenTelemetry trace headers.
-            priority: The priority of the request.
-                Only applicable with priority scheduling.
-
-        Yields:
-            The output `PoolingRequestOutput` objects from the LLMEngine
-            for the request.
-
-        Details:
-            - If the engine is not running, start the background loop,
-              which iteratively invokes
-              [`vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`][]
-              to process the waiting requests.
-            - Add the request to the engine's `RequestTracker`.
-              On the next background loop, this request will be sent to
-              the underlying engine.
-              Also, a corresponding `AsyncStream` will be created.
-            - Wait for the request outputs from `AsyncStream` and yield them.
-
-        Example:
-            ```
-            # Please refer to entrypoints/api_server.py for
-            # the complete example.
-
-            # initialize the engine and the example input
-            # note that engine_args here is AsyncEngineArgs instance
-            engine = AsyncLLMEngine.from_engine_args(engine_args)
-            example_input = {
-                "input": "What is LLM?",
-                "request_id": 0,
-            }
-
-            # start the generation
-            results_generator = engine.encode(
-                example_input["input"],
-                PoolingParams(),
-                example_input["request_id"])
-
-            # get the results
-            final_output = None
-            async for request_output in results_generator:
-                if await request.is_disconnected():
-                    # Abort the request if the client disconnects.
-                    await engine.abort(request_id)
-                    # Return or raise an error
-                    ...
-                final_output = request_output
-
-            # Process and return the final output
-            ...
-            ```
-        """
-        try:
-            async for output in await self.add_request(
-                    request_id,
-                    prompt,
-                    pooling_params,
-                    lora_request=lora_request,
-                    trace_headers=trace_headers,
-                    priority=priority,
-                    tokenization_kwargs=tokenization_kwargs,
-            ):
-                yield LLMEngine.validate_output(output, PoolingRequestOutput)
-        except asyncio.CancelledError:
-            await self.abort(request_id)
-            raise
+        raise NotImplementedError(
+            "Pooling models are not supported in vLLM V0")
 
     async def abort(self, request_id: Union[str, Iterable[str]]) -> None:
         """Abort a request.
@@ -1104,8 +1021,8 @@ class AsyncLLMEngine(EngineClient):
     async def is_sleeping(self) -> bool:
         return self.engine.is_sleeping()
 
-    async def add_lora(self, lora_request: LoRARequest) -> None:
-        self.engine.add_lora(lora_request)
+    async def add_lora(self, lora_request: LoRARequest) -> bool:
+        return self.engine.add_lora(lora_request)
 
     async def collective_rpc(self,
                              method: str,
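`add_lora` now propagates the engine's boolean result instead of dropping it, so callers can tell whether the adapter was actually registered. A hedged sketch with a hypothetical adapter name and path:

```python
from vllm.lora.request import LoRARequest


async def load_adapter(engine) -> bool:
    # The wrapper now returns the underlying engine's result.
    return await engine.add_lora(
        LoRARequest(lora_name="my-adapter",      # hypothetical adapter
                    lora_int_id=1,
                    lora_path="/path/to/adapter"))
```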