[V0 Deprecation] Remove pooling model support in V0 (#23434)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Author: Maximilien de Bayser
Date: 2025-08-29 04:04:02 -03:00
Committed by: GitHub
Parent: 934bebf192
Commit: 2554b27baa
38 changed files with 99 additions and 808 deletions

vllm/engine/async_llm_engine.py

@@ -72,8 +72,8 @@ STOP_ITERATION = Exception()  # Sentinel
 class AsyncStream:
-    """A stream of RequestOutputs or PoolingRequestOutputs for a request
-    that can be iterated over asynchronously via an async generator."""
+    """A stream of RequestOutputs for a request that can be iterated over
+    asynchronously via an async generator."""
 
     def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None:
         self.request_id = request_id
@@ -81,8 +81,7 @@ class AsyncStream:
         self._queue: asyncio.Queue = asyncio.Queue()
         self._finished = False
 
-    def put(self, item: Union[RequestOutput, PoolingRequestOutput,
-                              Exception]) -> None:
+    def put(self, item: Union[RequestOutput, Exception]) -> None:
         if not self._finished:
             self._queue.put_nowait(item)
@@ -99,9 +98,7 @@ class AsyncStream:
     def finished(self) -> bool:
         return self._finished
 
-    async def generator(
-        self
-    ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]:
+    async def generator(self) -> AsyncGenerator[RequestOutput, None]:
         try:
             while True:
                 result = await self._queue.get()
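With pooling outputs gone, `AsyncStream` carries only `RequestOutput` items (or an exception to re-raise). A minimal sketch of the narrowed contract, assuming the module-level `STOP_ITERATION` sentinel shown in the first hunk ends iteration, and using a no-op stand-in for the tracker's cancel hook:

```python
import asyncio

from vllm.engine.async_llm_engine import STOP_ITERATION, AsyncStream


async def main() -> None:
    # The cancel callback is invoked with the request id if the consumer
    # stops iterating early; a no-op stands in for the tracker's abort hook.
    stream = AsyncStream(request_id="req-0", cancel=lambda rid: None)

    # Producers call put() with RequestOutputs; pushing the STOP_ITERATION
    # sentinel (an Exception instance) is assumed to end the stream cleanly.
    stream.put(STOP_ITERATION)

    async for request_output in stream.generator():
        # With this commit, items are plain RequestOutputs, never
        # PoolingRequestOutputs.
        print(request_output)


asyncio.run(main())
```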
@@ -151,8 +148,7 @@ class RequestTracker:
             self.abort_request(rid, exception=exc)
 
     def process_request_output(self,
-                               request_output: Union[RequestOutput,
-                                                     PoolingRequestOutput],
+                               request_output: RequestOutput,
                                *,
                                verbose: bool = False) -> None:
         """Process a request output from the engine."""
@@ -261,9 +257,7 @@ class _AsyncLLMEngine(LLMEngine):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
-    async def step_async(
-        self, virtual_engine: int
-    ) -> List[Union[RequestOutput, PoolingRequestOutput]]:
+    async def step_async(self, virtual_engine: int) -> List[RequestOutput]:
         """Performs one decoding iteration and returns newly generated results.
 
         The workers are ran asynchronously if possible.
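The return type of `step_async` narrows accordingly. A hedged sketch of driving one iteration directly (in practice the background loop does this); `drive_once` is illustrative and not part of the API:

```python
from typing import List

from vllm.engine.async_llm_engine import _AsyncLLMEngine
from vllm.outputs import RequestOutput


async def drive_once(engine: _AsyncLLMEngine) -> List[RequestOutput]:
    # One scheduling + execution pass. virtual_engine selects the
    # pipeline-parallel context; 0 is correct without pipeline parallelism.
    outputs = await engine.step_async(0)
    # Every element is now a RequestOutput; PoolingRequestOutput is gone.
    return [out for out in outputs if out.finished]
```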
@@ -405,7 +399,7 @@ class _AsyncLLMEngine(LLMEngine):
         self,
         request_id: str,
         prompt: PromptType,
-        params: Union[SamplingParams, PoolingParams],
+        params: SamplingParams,
         arrival_time: Optional[float] = None,
         lora_request: Optional[LoRARequest] = None,
         trace_headers: Optional[Mapping[str, str]] = None,
@@ -779,14 +773,14 @@ class AsyncLLMEngine(EngineClient):
         self,
         request_id: str,
         prompt: PromptType,
-        params: Union[SamplingParams, PoolingParams],
+        params: SamplingParams,
         arrival_time: Optional[float] = None,
         lora_request: Optional[LoRARequest] = None,
         trace_headers: Optional[Mapping[str, str]] = None,
         priority: int = 0,
         data_parallel_rank: Optional[int] = None,
         tokenization_kwargs: Optional[dict[str, Any]] = None,
-    ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]:
+    ) -> AsyncGenerator[RequestOutput, None]:
         if not self.is_running:
             if self.start_engine_loop:
                 self.start_background_loop()
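At the public surface, `add_request` and the `generate` wrapper around it now yield `RequestOutput` only. A minimal usage sketch, assuming the stock `generate` API; the model name is just an example:

```python
import asyncio

from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine


async def run() -> None:
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="facebook/opt-125m"))

    # params is SamplingParams only now; PoolingParams is no longer accepted.
    async for output in engine.generate("Hello, my name is",
                                        SamplingParams(max_tokens=16),
                                        request_id="req-0"):
        if output.finished:
            print(output.outputs[0].text)


asyncio.run(run())
```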
@@ -908,7 +902,7 @@ class AsyncLLMEngine(EngineClient):
             await self.abort(request_id)
             raise
 
-    async def encode(
+    def encode(
         self,
         prompt: PromptType,
         pooling_params: PoolingParams,
@@ -918,85 +912,8 @@ class AsyncLLMEngine(EngineClient):
         priority: int = 0,
         tokenization_kwargs: Optional[dict[str, Any]] = None,
     ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        """Generate outputs for a request from a pooling model.
-
-        Generate outputs for a request. This method is a coroutine. It adds the
-        request into the waiting queue of the LLMEngine and streams the outputs
-        from the LLMEngine to the caller.
-
-        Args:
-            prompt: The prompt to the LLM. See
-                [`PromptType`][vllm.inputs.PromptType] for more details about
-                the format of each input.
-            pooling_params: The pooling parameters of the request.
-            request_id: The unique id of the request.
-            lora_request: LoRA request to use for generation, if any.
-            trace_headers: OpenTelemetry trace headers.
-            priority: The priority of the request.
-                Only applicable with priority scheduling.
-
-        Yields:
-            The output `PoolingRequestOutput` objects from the LLMEngine
-            for the request.
-
-        Details:
-            - If the engine is not running, start the background loop,
-              which iteratively invokes
-              [`vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`][]
-              to process the waiting requests.
-            - Add the request to the engine's `RequestTracker`.
-              On the next background loop, this request will be sent to
-              the underlying engine.
-              Also, a corresponding `AsyncStream` will be created.
-            - Wait for the request outputs from `AsyncStream` and yield them.
-
-        Example:
-            ```
-            # Please refer to entrypoints/api_server.py for
-            # the complete example.
-
-            # initialize the engine and the example input
-            # note that engine_args here is AsyncEngineArgs instance
-            engine = AsyncLLMEngine.from_engine_args(engine_args)
-            example_input = {
-                "input": "What is LLM?",
-                "request_id": 0,
-            }
-
-            # start the generation
-            results_generator = engine.encode(
-                example_input["input"],
-                PoolingParams(),
-                example_input["request_id"])
-
-            # get the results
-            final_output = None
-            async for request_output in results_generator:
-                if await request.is_disconnected():
-                    # Abort the request if the client disconnects.
-                    await engine.abort(request_id)
-                    # Return or raise an error
-                    ...
-                final_output = request_output
-
-            # Process and return the final output
-            ...
-            ```
-        """
-        try:
-            async for output in await self.add_request(
-                    request_id,
-                    prompt,
-                    pooling_params,
-                    lora_request=lora_request,
-                    trace_headers=trace_headers,
-                    priority=priority,
-                    tokenization_kwargs=tokenization_kwargs,
-            ):
-                yield LLMEngine.validate_output(output, PoolingRequestOutput)
-        except asyncio.CancelledError:
-            await self.abort(request_id)
-            raise
+        raise NotImplementedError(
+            "Pooling models are not supported in vLLM V0")
 
     async def abort(self, request_id: Union[str, Iterable[str]]) -> None:
         """Abort a request.
@@ -1104,8 +1021,8 @@ class AsyncLLMEngine(EngineClient):
     async def is_sleeping(self) -> bool:
         return self.engine.is_sleeping()
 
-    async def add_lora(self, lora_request: LoRARequest) -> None:
-        self.engine.add_lora(lora_request)
+    async def add_lora(self, lora_request: LoRARequest) -> bool:
+        return self.engine.add_lora(lora_request)
 
     async def collective_rpc(self,
                              method: str,
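`add_lora` now propagates the engine's boolean result instead of dropping it, so callers can tell whether the adapter was actually registered. A hedged sketch with a hypothetical adapter name and path:

```python
from vllm.lora.request import LoRARequest


async def load_adapter(engine) -> bool:
    # The wrapper now returns the underlying engine's result.
    return await engine.add_lora(
        LoRARequest(lora_name="my-adapter",      # hypothetical adapter
                    lora_int_id=1,
                    lora_path="/path/to/adapter"))
```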