[V0 Deprecation] Remove pooling model support in V0 (#23434)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
parent 934bebf192
commit 2554b27baa
committed by GitHub
@@ -72,8 +72,8 @@ STOP_ITERATION = Exception() # Sentinel


 class AsyncStream:
-    """A stream of RequestOutputs or PoolingRequestOutputs for a request
-    that can be iterated over asynchronously via an async generator."""
+    """A stream of RequestOutputs for a request that can be iterated over
+    asynchronously via an async generator."""

     def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None:
         self.request_id = request_id
@@ -81,8 +81,7 @@ class AsyncStream:
         self._queue: asyncio.Queue = asyncio.Queue()
         self._finished = False

-    def put(self, item: Union[RequestOutput, PoolingRequestOutput,
-                              Exception]) -> None:
+    def put(self, item: Union[RequestOutput, Exception]) -> None:
         if not self._finished:
             self._queue.put_nowait(item)

@@ -99,9 +98,7 @@ class AsyncStream:
     def finished(self) -> bool:
         return self._finished

-    async def generator(
-        self
-    ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]:
+    async def generator(self) -> AsyncGenerator[RequestOutput, None]:
        try:
            while True:
                result = await self._queue.get()
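To make the simplified contract concrete, here is a self-contained sketch of the producer/consumer pattern that `AsyncStream` implements after this change. `FakeRequestOutput` and `finish()` are illustrative stand-ins (the real `RequestOutput` lives in `vllm.outputs`, and the real class's termination and cancellation logic is more involved); only `put()` and `generator()` mirror the signatures in the diff.

```
import asyncio
from typing import AsyncGenerator, Callable, Union

STOP_ITERATION = Exception()  # Sentinel, mirroring the module above


class FakeRequestOutput:
    """Stand-in for vllm.outputs.RequestOutput, for illustration only."""

    def __init__(self, request_id: str, text: str) -> None:
        self.request_id = request_id
        self.text = text


class AsyncStream:
    def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None:
        self.request_id = request_id
        self._cancel = cancel
        self._queue: asyncio.Queue = asyncio.Queue()
        self._finished = False

    def put(self, item: Union[FakeRequestOutput, Exception]) -> None:
        if not self._finished:
            self._queue.put_nowait(item)

    def finish(self) -> None:
        # Simplified termination: enqueue the sentinel and mark finished.
        self._queue.put_nowait(STOP_ITERATION)
        self._finished = True

    async def generator(self) -> AsyncGenerator[FakeRequestOutput, None]:
        try:
            while True:
                result = await self._queue.get()
                if result is STOP_ITERATION:
                    return
                if isinstance(result, Exception):
                    raise result
                yield result
        except GeneratorExit:
            # Consumer went away; tell the tracker to cancel the request.
            self._cancel(self.request_id)
            raise


async def main() -> None:
    stream = AsyncStream("req-0", cancel=lambda rid: None)
    stream.put(FakeRequestOutput("req-0", "partial text"))
    stream.finish()
    async for output in stream.generator():
        print(output.request_id, output.text)


asyncio.run(main())
```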
@@ -151,8 +148,7 @@ class RequestTracker:
                self.abort_request(rid, exception=exc)

    def process_request_output(self,
-                               request_output: Union[RequestOutput,
-                                                     PoolingRequestOutput],
+                               request_output: RequestOutput,
                               *,
                               verbose: bool = False) -> None:
        """Process a request output from the engine."""
@@ -261,9 +257,7 @@ class _AsyncLLMEngine(LLMEngine):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

-    async def step_async(
-        self, virtual_engine: int
-    ) -> List[Union[RequestOutput, PoolingRequestOutput]]:
+    async def step_async(self, virtual_engine: int) -> List[RequestOutput]:
        """Performs one decoding iteration and returns newly generated results.
        The workers are ran asynchronously if possible.

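`step_async` is what the background loop awaits once per engine step; with pooling gone, each iteration yields only `RequestOutput`s, which are then routed to the per-request streams. A hedged sketch of such a loop, assuming `engine` and `tracker` objects shaped like the classes in this diff (the real vLLM background loop is more elaborate and also handles new requests, aborts, and engine errors):

```
import asyncio


async def run_engine_loop(engine, tracker, virtual_engine: int = 0) -> None:
    # Illustrative only, not the actual AsyncLLMEngine loop.
    while True:
        # One decoding iteration; now typed List[RequestOutput].
        request_outputs = await engine.step_async(virtual_engine)
        for request_output in request_outputs:
            # Route each output to the stream of its originating request.
            tracker.process_request_output(request_output, verbose=False)
        if not request_outputs:
            # Nothing in flight; yield control instead of spinning.
            await asyncio.sleep(0)
```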
@@ -405,7 +399,7 @@ class _AsyncLLMEngine(LLMEngine):
        self,
        request_id: str,
        prompt: PromptType,
-        params: Union[SamplingParams, PoolingParams],
+        params: SamplingParams,
        arrival_time: Optional[float] = None,
        lora_request: Optional[LoRARequest] = None,
        trace_headers: Optional[Mapping[str, str]] = None,
@@ -779,14 +773,14 @@ class AsyncLLMEngine(EngineClient):
        self,
        request_id: str,
        prompt: PromptType,
-        params: Union[SamplingParams, PoolingParams],
+        params: SamplingParams,
        arrival_time: Optional[float] = None,
        lora_request: Optional[LoRARequest] = None,
        trace_headers: Optional[Mapping[str, str]] = None,
        priority: int = 0,
        data_parallel_rank: Optional[int] = None,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
-    ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]:
+    ) -> AsyncGenerator[RequestOutput, None]:
        if not self.is_running:
            if self.start_engine_loop:
                self.start_background_loop()
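The narrowed signature means callers now hand `add_request` a `SamplingParams` only, and iterate an async generator of plain `RequestOutput`s. A hedged usage sketch, assuming `engine` is a running `AsyncLLMEngine` (the awaited call returning an async generator matches the pattern the old `encode` body used):

```
from vllm import SamplingParams


async def consume(engine) -> str:
    stream = await engine.add_request(
        request_id="req-0",
        prompt="What is LLM?",
        params=SamplingParams(max_tokens=32),  # PoolingParams no longer accepted
    )
    final = None
    async for request_output in stream:
        final = request_output  # each item is a RequestOutput
    return final.outputs[0].text
```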
@@ -908,7 +902,7 @@ class AsyncLLMEngine(EngineClient):
            await self.abort(request_id)
            raise

-    async def encode(
+    def encode(
        self,
        prompt: PromptType,
        pooling_params: PoolingParams,
@@ -918,85 +912,8 @@ class AsyncLLMEngine(EngineClient):
        priority: int = 0,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
    ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        """Generate outputs for a request from a pooling model.
-
-        Generate outputs for a request. This method is a coroutine. It adds the
-        request into the waiting queue of the LLMEngine and streams the outputs
-        from the LLMEngine to the caller.
-
-        Args:
-            prompt: The prompt to the LLM. See
-                [`PromptType`][vllm.inputs.PromptType] for more details about
-                the format of each input.
-            pooling_params: The pooling parameters of the request.
-            request_id: The unique id of the request.
-            lora_request: LoRA request to use for generation, if any.
-            trace_headers: OpenTelemetry trace headers.
-            priority: The priority of the request.
-                Only applicable with priority scheduling.
-
-        Yields:
-            The output `PoolingRequestOutput` objects from the LLMEngine
-            for the request.
-
-        Details:
-            - If the engine is not running, start the background loop,
-              which iteratively invokes
-              [`vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`][]
-              to process the waiting requests.
-            - Add the request to the engine's `RequestTracker`.
-              On the next background loop, this request will be sent to
-              the underlying engine.
-              Also, a corresponding `AsyncStream` will be created.
-            - Wait for the request outputs from `AsyncStream` and yield them.
-
-        Example:
-            ```
-            # Please refer to entrypoints/api_server.py for
-            # the complete example.
-
-            # initialize the engine and the example input
-            # note that engine_args here is AsyncEngineArgs instance
-            engine = AsyncLLMEngine.from_engine_args(engine_args)
-            example_input = {
-                "input": "What is LLM?",
-                "request_id": 0,
-            }
-
-            # start the generation
-            results_generator = engine.encode(
-                example_input["input"],
-                PoolingParams(),
-                example_input["request_id"])
-
-            # get the results
-            final_output = None
-            async for request_output in results_generator:
-                if await request.is_disconnected():
-                    # Abort the request if the client disconnects.
-                    await engine.abort(request_id)
-                    # Return or raise an error
-                    ...
-                final_output = request_output
-
-            # Process and return the final output
-            ...
-            ```
-        """
-        try:
-            async for output in await self.add_request(
-                    request_id,
-                    prompt,
-                    pooling_params,
-                    lora_request=lora_request,
-                    trace_headers=trace_headers,
-                    priority=priority,
-                    tokenization_kwargs=tokenization_kwargs,
-            ):
-                yield LLMEngine.validate_output(output, PoolingRequestOutput)
-        except asyncio.CancelledError:
-            await self.abort(request_id)
-            raise
+        raise NotImplementedError(
+            "Pooling models are not supported in vLLM V0")

    async def abort(self, request_id: Union[str, Iterable[str]]) -> None:
        """Abort a request.
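With the body replaced, `encode` is no longer an async generator at all: it is a plain method that raises as soon as it is called, before any stream is created. A sketch of the new failure mode, assuming `engine` is a V0 `AsyncLLMEngine` instance (pooling workloads need the V1 engine instead):

```
from vllm import PoolingParams

try:
    # Raises immediately on the V0 engine; no request is enqueued.
    engine.encode("What is LLM?", PoolingParams(), "req-0")
except NotImplementedError as exc:
    print(exc)  # Pooling models are not supported in vLLM V0
```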
@@ -1104,8 +1021,8 @@ class AsyncLLMEngine(EngineClient):
    async def is_sleeping(self) -> bool:
        return self.engine.is_sleeping()

-    async def add_lora(self, lora_request: LoRARequest) -> None:
-        self.engine.add_lora(lora_request)
+    async def add_lora(self, lora_request: LoRARequest) -> bool:
+        return self.engine.add_lora(lora_request)

    async def collective_rpc(self,
                             method: str,
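The `add_lora` change is a small behavioral fix folded into the same commit: the async wrapper previously dropped the underlying engine's return value, and now propagates it, so callers can check whether the adapter actually loaded. A hedged sketch (the `LoRARequest` argument values are illustrative):

```
from vllm.lora.request import LoRARequest


async def load_adapter(engine) -> None:
    ok = await engine.add_lora(
        LoRARequest(lora_name="my-adapter", lora_int_id=1,
                    lora_path="/path/to/adapter"))
    if not ok:
        raise RuntimeError("LoRA adapter failed to load")
```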