[Bugfix] Read truncate_prompt_tokens from pooling_params in AsyncLLM.encode() (#31013)

Signed-off-by: Jeffrey Wang <jeffreywang@anyscale.com>
2025-12-20 02:29:31 -08:00
parent ff2168bca3
commit 1501a4070e
2 changed files with 20 additions and 2 deletions
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -71,7 +71,11 @@ class EngineClient(ABC):
        truncate_prompt_tokens: int | None = None,
        tokenization_kwargs: dict[str, Any] | None = None,
    ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        """Generate outputs for a request from a pooling model."""
+        """Generate outputs for a request from a pooling model.
+
+        NOTE: truncate_prompt_tokens is deprecated in v0.14.
+        TODO: Remove this argument in v0.15.
+        """
        ...

    @abstractmethod