[V0 Deprecation] Remove pooling model support in V0 (#23434)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
Maximilien de Bayser
2025-08-29 04:04:02 -03:00
committed by GitHub
parent 934bebf192
commit 2554b27baa
38 changed files with 99 additions and 808 deletions

View File

@@ -3,7 +3,7 @@
import asyncio
from abc import ABC, abstractmethod
from typing import AsyncGenerator, Iterable, Mapping, Optional, Union
from typing import Any, AsyncGenerator, Iterable, Mapping, Optional, Union
from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
from vllm.config import DecodingConfig, ModelConfig, VllmConfig
@@ -224,6 +224,7 @@ class EngineClient(ABC):
lora_request: Optional[LoRARequest] = None,
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
tokenization_kwargs: Optional[dict[str, Any]] = None,
) -> AsyncGenerator[PoolingRequestOutput, None]:
"""Generate outputs for a request from a pooling model."""
...
@@ -320,7 +321,7 @@ class EngineClient(ABC):
...
@abstractmethod
async def add_lora(self, lora_request: LoRARequest) -> None:
async def add_lora(self, lora_request: LoRARequest) -> bool:
"""Load a new LoRA adapter into the engine for future requests."""
...