From e54ee3ea33bb8554eb6fa2eedb7007ae101197cf Mon Sep 17 00:00:00 2001
From: Nick Hill <nhill@redhat.com>
Date: Mon, 29 Dec 2025 18:42:45 -0800
Subject: [PATCH] [Core] Deduplicate generate/encode logic in `AsyncLLM`
 (#31510)

Signed-off-by: njhill <nickhill123@gmail.com>
---
 vllm/v1/engine/async_llm.py | 80 +++++++++++++++----------------------
 1 file changed, 32 insertions(+), 48 deletions(-)

diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index dbaebc190..1c93d63fe 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -281,6 +281,25 @@ class AsyncLLM(EngineClient):
 
         is_pooling = isinstance(params, PoolingParams)
 
+        if (
+            self.vllm_config.cache_config.kv_sharing_fast_prefill
+            and not is_pooling
+            and params.prompt_logprobs
+        ):
+            raise ValueError(
+                "--kv-sharing-fast-prefill produces incorrect logprobs for "
+                "prompt tokens, please disable it when the requests need "
+                "prompt logprobs"
+            )
+
+        if tokenization_kwargs is None:
+            tokenization_kwargs = {}
+        _validate_truncation_size(
+            self.model_config.max_model_len,
+            params.truncate_prompt_tokens,
+            tokenization_kwargs,
+        )
+
         # Convert Input --> Request.
         if isinstance(prompt, EngineCoreRequest):
             request = prompt
@@ -291,7 +310,10 @@ class AsyncLLM(EngineClient):
                     "latter will be used, and the former will be ignored."
                 )
         else:
-            assert prompt_text is None
+            if prompt_text is not None:
+                raise ValueError(
+                    "should only provide prompt_text with EngineCoreRequest"
+                )
             request = self.input_processor.process_inputs(
                 request_id,
                 prompt,
@@ -310,6 +332,15 @@ class AsyncLLM(EngineClient):
 
         self.input_processor.assign_request_id(request)
 
+        # We start the output_handler on the first call to add_request() so
+        # we can call __init__ before the event loop, which enables us
+        # to handle startup failure gracefully in the OpenAI server.
+        self._run_output_handler()
+
+        # Respect pause state before accepting new requests.
+        async with self._pause_cond:
+            await self._pause_cond.wait_for(lambda: not self._paused)
+
         # Create a new output collector for the request.
         queue = RequestOutputCollector(params.output_kind, request.request_id)
 
@@ -385,37 +416,8 @@ class AsyncLLM(EngineClient):
         returning the RequestOutput back to the caller.
         """
 
-        if (
-            self.vllm_config.cache_config.kv_sharing_fast_prefill
-            and sampling_params.prompt_logprobs
-        ):
-            raise ValueError(
-                "--kv-sharing-fast-prefill produces incorrect logprobs for "
-                "prompt tokens, please disable it when the requests need "
-                "prompt logprobs"
-            )
-
         q: RequestOutputCollector | None = None
         try:
-            # We start the output_handler on the first call to generate() so
-            # we can call __init__ before the event loop, which enables us
-            # to handle startup failure gracefully in the OpenAI server.
-            self._run_output_handler()
-
-            # Wait until generation is resumed if the engine is paused.
-            async with self._pause_cond:
-                await self._pause_cond.wait_for(lambda: not self._paused)
-
-            if tokenization_kwargs is None:
-                tokenization_kwargs = {}
-                truncate_prompt_tokens = sampling_params.truncate_prompt_tokens
-
-                _validate_truncation_size(
-                    self.model_config.max_model_len,
-                    truncate_prompt_tokens,
-                    tokenization_kwargs,
-                )
-
             q = await self.add_request(
                 request_id,
                 prompt,
@@ -639,18 +641,6 @@ class AsyncLLM(EngineClient):
 
         q: RequestOutputCollector | None = None
         try:
-            # We start the output_handler on the first call to generate() so
-            # we can call __init__ before the event loop, which enables us
-            # to handle startup failure gracefully in the OpenAI server.
-            self._run_output_handler()
-
-            # Respect pause state before accepting new requests.
-            async with self._pause_cond:
-                await self._pause_cond.wait_for(lambda: not self._paused)
-
-            if tokenization_kwargs is None:
-                tokenization_kwargs = {}
-
             if truncate_prompt_tokens is not None:
                 warnings.warn(
                     "The `truncate_prompt_tokens` parameter in `AsyncLLM.encode()` "
@@ -660,12 +650,6 @@ class AsyncLLM(EngineClient):
                     stacklevel=2,
                 )
 
-            _validate_truncation_size(
-                self.model_config.max_model_len,
-                pooling_params.truncate_prompt_tokens,
-                tokenization_kwargs,
-            )
-
             q = await self.add_request(
                 request_id,
                 prompt,