[V0 Deprecation] Remove pooling model support in V0 (#23434)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Signed-off-by: Max de Bayser <mbayser@br.ibm.com> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
committed by
GitHub
parent
934bebf192
commit
2554b27baa
@@ -3,7 +3,7 @@
|
||||
|
||||
import asyncio
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import AsyncGenerator, Iterable, Mapping, Optional, Union
|
||||
from typing import Any, AsyncGenerator, Iterable, Mapping, Optional, Union
|
||||
|
||||
from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
|
||||
from vllm.config import DecodingConfig, ModelConfig, VllmConfig
|
||||
@@ -224,6 +224,7 @@ class EngineClient(ABC):
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
trace_headers: Optional[Mapping[str, str]] = None,
|
||||
priority: int = 0,
|
||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||
) -> AsyncGenerator[PoolingRequestOutput, None]:
|
||||
"""Generate outputs for a request from a pooling model."""
|
||||
...
|
||||
@@ -320,7 +321,7 @@ class EngineClient(ABC):
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
async def add_lora(self, lora_request: LoRARequest) -> None:
|
||||
async def add_lora(self, lora_request: LoRARequest) -> bool:
|
||||
"""Load a new LoRA adapter into the engine for future requests."""
|
||||
...
|
||||
|
||||
|
||||
Reference in New Issue
Block a user