[V0 Deprecation] Remove pooling model support in V0 (#23434)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-29 04:04:02 -03:00
parent 934bebf192
commit 2554b27baa
38 changed files with 99 additions and 808 deletions
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -3,7 +3,7 @@

 import asyncio
 from abc import ABC, abstractmethod
-from typing import AsyncGenerator, Iterable, Mapping, Optional, Union
+from typing import Any, AsyncGenerator, Iterable, Mapping, Optional, Union

 from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
 from vllm.config import DecodingConfig, ModelConfig, VllmConfig
@@ -224,6 +224,7 @@ class EngineClient(ABC):
        lora_request: Optional[LoRARequest] = None,
        trace_headers: Optional[Mapping[str, str]] = None,
        priority: int = 0,
+        tokenization_kwargs: Optional[dict[str, Any]] = None,
    ) -> AsyncGenerator[PoolingRequestOutput, None]:
        """Generate outputs for a request from a pooling model."""
        ...
@@ -320,7 +321,7 @@ class EngineClient(ABC):
        ...

    @abstractmethod
-    async def add_lora(self, lora_request: LoRARequest) -> None:
+    async def add_lora(self, lora_request: LoRARequest) -> bool:
        """Load a new LoRA adapter into the engine for future requests."""
        ...