[V1] Avoid redundant input processing in n>1 case (#14985)

Signed-off-by: Nick Hill <nhill@redhat.com>
Authored by Nick Hill on 2025-03-20 22:24:10 -07:00; committed by GitHub
parent 7297941b38
commit da6ea29f7a
13 changed files with 85 additions and 145 deletions
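The hunks below all make the same mechanical change: `request_id` is dropped from every `InputPreprocessor` method, so preprocessing results no longer depend on which request asked for them. A minimal sketch of why that matters for the n>1 case, assuming a hypothetical engine loop (the `scheduler.add_request` call and the `f"{request_id}_{i}"` child-ID scheme are illustrative, not part of this diff):

```python
def add_request_with_n_samples(preprocessor, scheduler, request_id: str,
                               prompt, sampling_params, lora_request=None):
    # Preprocess (tokenize, run multimodal processing) exactly once...
    processed = preprocessor.preprocess(prompt, lora_request=lora_request)
    # ...then share the result across all n sampled child sequences,
    # instead of redundantly re-processing the same prompt n times.
    for i in range(sampling_params.n):
        scheduler.add_request(f"{request_id}_{i}", processed, sampling_params)
```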


@@ -182,7 +182,6 @@ class InputPreprocessor:
     def _tokenize_prompt(
         self,
         prompt: str,
-        request_id: str,
         lora_request: Optional[LoRARequest],
     ) -> list[int]:
         """
@@ -202,15 +201,13 @@ class InputPreprocessor:
                     "do_lower_case", False)):
             prompt = prompt.lower()
 
-        return tokenizer.encode(request_id=request_id,
-                                prompt=prompt,
+        return tokenizer.encode(prompt=prompt,
                                 lora_request=lora_request,
                                 add_special_tokens=add_special_tokens)
 
     async def _tokenize_prompt_async(
         self,
         prompt: str,
-        request_id: str,
         lora_request: Optional[LoRARequest],
     ) -> list[int]:
         """Async version of :meth:`_tokenize_prompt`."""
@@ -222,7 +219,6 @@ class InputPreprocessor:
             # appending an EOS token to the prompt which disrupts generation.
             add_special_tokens = False
         return await tokenizer.encode_async(
-            request_id=request_id,
             prompt=prompt,
             lora_request=lora_request,
             add_special_tokens=add_special_tokens)
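After this change the tokenization helpers depend only on the prompt text and the LoRA adapter. A usage sketch of the new signatures, assuming an already constructed `InputPreprocessor` instance named `preprocessor`:

```python
# Synchronous path: no request_id threaded through anymore.
token_ids = preprocessor._tokenize_prompt(
    "Hello, world!",
    lora_request=None,
)

# The async path mirrors it (inside a coroutine):
token_ids = await preprocessor._tokenize_prompt_async(
    "Hello, world!",
    lora_request=None,
)
```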
@@ -309,7 +305,6 @@ class InputPreprocessor:
     def _prompt_to_llm_inputs(
         self,
         prompt: SingletonPrompt,
-        request_id: str,
         lora_request: Optional[LoRARequest] = None,
         return_mm_hashes: bool = False,
     ) -> SingletonInputs:
@@ -318,7 +313,6 @@ class InputPreprocessor:
         Arguments:
 
-        * request_id
         * prompt: single encoder or decoder input prompt
         * lora_request: this is only valid for decoder prompts
         * return_mm_hashes: whether to return multimodal hashes
@@ -333,7 +327,6 @@ class InputPreprocessor:
             prompt_text = parsed["content"]
 
             prompt_token_ids = self._tokenize_prompt(
                 prompt_text,
-                request_id=request_id,
                 lora_request=lora_request,
             )
@@ -384,7 +377,6 @@ class InputPreprocessor:
             prompt_token_ids = self._tokenize_prompt(
                 prompt_text,
-                request_id=request_id,
                 lora_request=lora_request,
             )
@@ -400,7 +392,6 @@ class InputPreprocessor:
     async def _prompt_to_llm_inputs_async(
         self,
         prompt: SingletonPrompt,
-        request_id: str,
         lora_request: Optional[LoRARequest] = None,
         return_mm_hashes: bool = False,
     ) -> SingletonInputs:
@@ -411,7 +402,6 @@ class InputPreprocessor:
             prompt_text = parsed["content"]
 
             prompt_token_ids = await self._tokenize_prompt_async(
                 prompt_text,
-                request_id=request_id,
                 lora_request=lora_request,
             )
@@ -460,7 +450,6 @@ class InputPreprocessor:
             prompt_token_ids = await self._tokenize_prompt_async(
                 prompt_text,
-                request_id=request_id,
                 lora_request=lora_request,
             )
@@ -560,7 +549,6 @@ class InputPreprocessor:
     def _process_encoder_decoder_prompt(
         self,
         prompt: PromptType,
-        request_id: str,
     ) -> EncoderDecoderInputs:
         """
         For encoder/decoder models only:
@@ -587,7 +575,6 @@ class InputPreprocessor:
         Arguments:
 
         * prompt: an input prompt
-        * request_id
 
         Returns:
@@ -598,16 +585,11 @@ class InputPreprocessor:
         if is_explicit_encoder_decoder_prompt(prompt):
             encoder_inputs = self._prompt_to_llm_inputs(
-                prompt["encoder_prompt"],
-                request_id=request_id,
-            )
+                prompt["encoder_prompt"])
 
             if (decoder_input := prompt["decoder_prompt"]) is None:
                 decoder_inputs = None
             else:
-                decoder_inputs = self._prompt_to_llm_inputs(
-                    decoder_input,
-                    request_id=request_id,
-                )
+                decoder_inputs = self._prompt_to_llm_inputs(decoder_input)
 
             # For multimodal model, override decoder prompt from processor
             # with explicit decoder prompt.
             if self.model_config.is_multimodal_model and (
@@ -616,10 +598,7 @@ class InputPreprocessor:
                     self._separate_enc_dec_inputs_from_mm_processor_outputs(
                         encoder_inputs, decoder_inputs))
         else:
-            inputs = self._prompt_to_llm_inputs(
-                prompt,
-                request_id=request_id,
-            )
+            inputs = self._prompt_to_llm_inputs(prompt)
 
             if self.model_config.is_multimodal_model and (
                     self._can_process_multimodal()):
                 # Encoder-Decoder Multimodal model
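For reference, the explicit encoder/decoder form handled by the `is_explicit_encoder_decoder_prompt` branch above has this shape; a sketch with illustrative prompt strings, again assuming an `InputPreprocessor` instance named `preprocessor`:

```python
enc_dec_prompt = {
    "encoder_prompt": "Translate to German: The weather is nice.",
    # None lets the decoder inputs be derived/defaulted rather than given
    # explicitly; see the `decoder_input is None` branch above.
    "decoder_prompt": None,
}
inputs = preprocessor._process_encoder_decoder_prompt(enc_dec_prompt)
```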
@@ -636,7 +615,6 @@ class InputPreprocessor:
     async def _process_encoder_decoder_prompt_async(
         self,
         prompt: PromptType,
-        request_id: str,
     ) -> EncoderDecoderInputs:
         """Async version of :meth:`_process_encoder_decoder_prompt`."""
         encoder_inputs: SingletonInputs
@@ -644,18 +622,13 @@ class InputPreprocessor:
         if is_explicit_encoder_decoder_prompt(prompt):
             encoder_task = self._prompt_to_llm_inputs_async(
-                prompt["encoder_prompt"],
-                request_id=request_id,
-            )
+                prompt["encoder_prompt"])
 
             if (decoder_input := prompt["decoder_prompt"]) is None:
                 encoder_inputs = await encoder_task
                 decoder_inputs = None
             else:
-                decoder_task = self._prompt_to_llm_inputs_async(
-                    decoder_input,
-                    request_id=request_id,
-                )
+                decoder_task = self._prompt_to_llm_inputs_async(decoder_input)
 
                 encoder_inputs, decoder_inputs = await asyncio.gather(
                     encoder_task, decoder_task)
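Note the pattern in the async branch above: both coroutines are created first and then awaited together via `asyncio.gather`, so the encoder and decoder prompts are preprocessed concurrently rather than back-to-back. The same pattern in isolation:

```python
import asyncio

async def preprocess_pair(preprocessor, enc_prompt, dec_prompt):
    # Creating the coroutines does not run them yet; asyncio.gather
    # schedules both, so the two preprocessing calls overlap.
    enc_task = preprocessor._prompt_to_llm_inputs_async(enc_prompt)
    dec_task = preprocessor._prompt_to_llm_inputs_async(dec_prompt)
    return await asyncio.gather(enc_task, dec_task)
```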
@@ -668,10 +641,7 @@ class InputPreprocessor:
                     self._separate_enc_dec_inputs_from_mm_processor_outputs(
                         encoder_inputs, decoder_inputs))
         else:
-            inputs = await self._prompt_to_llm_inputs_async(
-                prompt,
-                request_id=request_id,
-            )
+            inputs = await self._prompt_to_llm_inputs_async(prompt)
 
             if self.model_config.is_multimodal_model and (
                     self._can_process_multimodal()):
                 # Encoder-Decoder Multimodal model
@@ -704,7 +674,6 @@ class InputPreprocessor:
     def _process_decoder_only_prompt(
         self,
         prompt: SingletonPrompt,
-        request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         return_mm_hashes: bool = False,
@@ -716,7 +685,6 @@ class InputPreprocessor:
         Arguments:
 
         * prompt: input prompt
-        * request_id
         * lora_request
         * prompt_adapter_request
         * return_mm_hashes
@@ -728,7 +696,6 @@ class InputPreprocessor:
         prompt_comps = self._prompt_to_llm_inputs(
             prompt,
-            request_id=request_id,
             lora_request=lora_request,
             return_mm_hashes=return_mm_hashes,
         )
@@ -741,7 +708,6 @@ class InputPreprocessor:
     async def _process_decoder_only_prompt_async(
         self,
         prompt: SingletonPrompt,
-        request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         return_mm_hashes: bool = False,
@@ -749,7 +715,6 @@ class InputPreprocessor:
         """Async version of :meth:`_process_decoder_only_prompt`."""
         prompt_comps = await self._prompt_to_llm_inputs_async(
             prompt,
-            request_id=request_id,
             lora_request=lora_request,
             return_mm_hashes=return_mm_hashes,
         )
@@ -762,7 +727,6 @@ class InputPreprocessor:
     def preprocess(
         self,
         prompt: PromptType,
-        request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         return_mm_hashes: bool = False,
@@ -774,10 +738,7 @@ class InputPreprocessor:
                 "returned until they are supported on vLLM V1.")
             # Encoder-decoder model requires special mapping of
             # input prompts to encoder & decoder
-            return self._process_encoder_decoder_prompt(
-                prompt,
-                request_id=request_id,
-            )
+            return self._process_encoder_decoder_prompt(prompt)
 
         if is_explicit_encoder_decoder_prompt(prompt):
             raise ValueError("Cannot pass encoder-decoder prompt "
@@ -786,7 +747,6 @@ class InputPreprocessor:
         # Decoder-only operation
         return self._process_decoder_only_prompt(
             prompt,
-            request_id=request_id,
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request,
             return_mm_hashes=return_mm_hashes,
@@ -795,7 +755,6 @@ class InputPreprocessor:
     async def preprocess_async(
         self,
         prompt: PromptType,
-        request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         return_mm_hashes: bool = False,
@@ -807,10 +766,7 @@ class InputPreprocessor:
                 "returned until they are supported on vLLM V1.")
             # Encoder-decoder model requires special mapping of
             # input prompts to encoder & decoder
-            return await self._process_encoder_decoder_prompt_async(
-                prompt,
-                request_id=request_id,
-            )
+            return await self._process_encoder_decoder_prompt_async(prompt)
 
         if is_explicit_encoder_decoder_prompt(prompt):
             raise ValueError("Cannot pass encoder-decoder prompt "
@@ -819,7 +775,6 @@ class InputPreprocessor:
         # Decoder-only operation
         return await self._process_decoder_only_prompt_async(
             prompt,
-            request_id=request_id,
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request,
             return_mm_hashes=return_mm_hashes,
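With the new signatures, callers of the public entry points simply drop the `request_id` argument; a decoder-only usage sketch (again assuming an `InputPreprocessor` instance named `preprocessor`):

```python
# Preprocess a plain text prompt; the result carries no request identity,
# so it can be cached or shared across sampled sequences.
processed_inputs = preprocessor.preprocess(
    "What is the capital of France?",
    lora_request=None,
    prompt_adapter_request=None,
    return_mm_hashes=False,
)
```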