From fdd6f2ad58b113fe0fdc3fd9998e63d6064b5f16 Mon Sep 17 00:00:00 2001 From: Reagan Lee <96998476+reaganjlee@users.noreply.github.com> Date: Tue, 10 Feb 2026 11:44:31 -0800 Subject: [PATCH] Convert online APIs to use Renderer (#34084) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Reagan Lee <reaganjlee@gmail.com> Co-authored-by: Reagan Lee <reaganjlee@gmail.com> --- .../openai/speech_to_text/speech_to_text.py | 26 +++++++++++++++---- vllm/entrypoints/serve/disagg/serving.py | 16 +++++++++--- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/vllm/entrypoints/openai/speech_to_text/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text/speech_to_text.py index 454359ffd..8d8f0e6b7 100644 --- a/vllm/entrypoints/openai/speech_to_text/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text/speech_to_text.py @@ -471,15 +471,31 @@ class OpenAISpeechToText(OpenAIServing): lora_request=lora_request, ) - list_result_generator = [ - self.engine_client.generate( + trace_headers = ( + None + if raw_request is None + else await self._get_trace_headers(raw_request.headers) + ) + + list_result_generator = [] + for i, prompt in enumerate(prompts): + request_id_item = f"{request_id}_{i}" + engine_request = self.input_processor.process_inputs( + request_id_item, prompt, sampling_params, - f"{request_id}_{i}", lora_request=lora_request, + trace_headers=trace_headers, + priority=0, + ) + list_result_generator.append( + self.engine_client.generate( + engine_request, + sampling_params, + request_id_item, + lora_request=lora_request, + ) ) - for i, prompt in enumerate(prompts) - ] except ValueError as e: return self.create_error_response(e) diff --git a/vllm/entrypoints/serve/disagg/serving.py b/vllm/entrypoints/serve/disagg/serving.py index 0e61f5ec0..81fab153e 100644 --- a/vllm/entrypoints/serve/disagg/serving.py +++ b/vllm/entrypoints/serve/disagg/serving.py @@ -99,8 +99,6 @@ class 
ServingTokens(OpenAIServing): if raw_request: raw_request.state.request_metadata = request_metadata - # TODO(NickLucche): Change to EngineCoreRequest once Renderer work is - # completed engine_prompts = await self._preprocess_completion( request, prompt_input=request.token_ids, @@ -132,16 +130,26 @@ class ServingTokens(OpenAIServing): tok_params = request.build_tok_params(self.model_config) tokenization_kwargs = tok_params.get_encode_kwargs() - result_generator = self.engine_client.generate( + engine_request = self.input_processor.process_inputs( + request_id, engine_prompt, sampling_params, - request_id, lora_request=lora_request, tokenization_kwargs=tokenization_kwargs, trace_headers=trace_headers, priority=request.priority, ) + result_generator = self.engine_client.generate( + engine_request, + sampling_params, + request_id, + lora_request=lora_request, + trace_headers=trace_headers, + priority=request.priority, + tokenization_kwargs=tokenization_kwargs, + ) + except ValueError as e: return self.create_error_response(str(e))