[V1] Avoid redundant input processing in n>1 case (#14985)
Signed-off-by: Nick Hill <nhill@redhat.com>
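
The hunks below drop the request_id parameter from InputPreprocessor's
tokenization and prompt-processing helpers, so a request ID is no longer
threaded through tokenizer.encode()/encode_async() or the
_prompt_to_llm_inputs / _process_*_prompt call paths. A minimal,
self-contained sketch of the resulting call shape; the stub tokenizer and
helper names here are illustrative only and are not part of this diff:

# Sketch only (assumed names): after this change the tokenize helper
# passes no request_id keyword to the tokenizer.
from typing import Optional

class StubTokenizer:
    # Stand-in for the tokenizer object used by InputPreprocessor.
    def encode(self,
               prompt: str,
               lora_request: Optional[object] = None,
               add_special_tokens: bool = True) -> list[int]:
        ids = [ord(ch) for ch in prompt]  # toy tokenization
        return [1] + ids if add_special_tokens else ids

def tokenize_prompt(tokenizer: StubTokenizer,
                    prompt: str,
                    lora_request: Optional[object] = None) -> list[int]:
    # Mirrors the new _tokenize_prompt signature: prompt and lora_request
    # only, with no request_id argument.
    return tokenizer.encode(prompt=prompt,
                            lora_request=lora_request,
                            add_special_tokens=True)

print(tokenize_prompt(StubTokenizer(), "hello"))
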
@@ -182,7 +182,6 @@ class InputPreprocessor:
     def _tokenize_prompt(
         self,
         prompt: str,
-        request_id: str,
         lora_request: Optional[LoRARequest],
     ) -> list[int]:
         """
@@ -202,15 +201,13 @@ class InputPreprocessor:
                     "do_lower_case", False)):
             prompt = prompt.lower()
 
-        return tokenizer.encode(request_id=request_id,
-                                prompt=prompt,
+        return tokenizer.encode(prompt=prompt,
                                 lora_request=lora_request,
                                 add_special_tokens=add_special_tokens)
 
     async def _tokenize_prompt_async(
         self,
         prompt: str,
-        request_id: str,
         lora_request: Optional[LoRARequest],
     ) -> list[int]:
         """Async version of :meth:`_tokenize_prompt`."""
@@ -222,7 +219,6 @@ class InputPreprocessor:
             # appending an EOS token to the prompt which disrupts generation.
             add_special_tokens = False
         return await tokenizer.encode_async(
-            request_id=request_id,
             prompt=prompt,
             lora_request=lora_request,
             add_special_tokens=add_special_tokens)
@@ -309,7 +305,6 @@ class InputPreprocessor:
     def _prompt_to_llm_inputs(
         self,
         prompt: SingletonPrompt,
-        request_id: str,
         lora_request: Optional[LoRARequest] = None,
         return_mm_hashes: bool = False,
     ) -> SingletonInputs:
@@ -318,7 +313,6 @@ class InputPreprocessor:
 
         Arguments:
 
-        * request_id
         * prompt: single encoder or decoder input prompt
         * lora_request: this is only valid for decoder prompts
         * return_mm_hashes: whether to return multimodal hashes
@@ -333,7 +327,6 @@ class InputPreprocessor:
             prompt_text = parsed["content"]
             prompt_token_ids = self._tokenize_prompt(
                 prompt_text,
-                request_id=request_id,
                 lora_request=lora_request,
             )
 
@@ -384,7 +377,6 @@ class InputPreprocessor:
 
             prompt_token_ids = self._tokenize_prompt(
                 prompt_text,
-                request_id=request_id,
                 lora_request=lora_request,
             )
 
@@ -400,7 +392,6 @@ class InputPreprocessor:
     async def _prompt_to_llm_inputs_async(
         self,
         prompt: SingletonPrompt,
-        request_id: str,
         lora_request: Optional[LoRARequest] = None,
         return_mm_hashes: bool = False,
     ) -> SingletonInputs:
@@ -411,7 +402,6 @@ class InputPreprocessor:
             prompt_text = parsed["content"]
             prompt_token_ids = await self._tokenize_prompt_async(
                 prompt_text,
-                request_id=request_id,
                 lora_request=lora_request,
             )
 
@@ -460,7 +450,6 @@ class InputPreprocessor:
 
             prompt_token_ids = await self._tokenize_prompt_async(
                 prompt_text,
-                request_id=request_id,
                 lora_request=lora_request,
             )
 
@@ -560,7 +549,6 @@ class InputPreprocessor:
     def _process_encoder_decoder_prompt(
         self,
         prompt: PromptType,
-        request_id: str,
     ) -> EncoderDecoderInputs:
         """
         For encoder/decoder models only:
@@ -587,7 +575,6 @@ class InputPreprocessor:
         Arguments:
 
         * prompt: an input prompt
-        * request_id
 
         Returns:
 
@@ -598,16 +585,11 @@ class InputPreprocessor:
 
         if is_explicit_encoder_decoder_prompt(prompt):
             encoder_inputs = self._prompt_to_llm_inputs(
-                prompt["encoder_prompt"],
-                request_id=request_id,
-            )
+                prompt["encoder_prompt"])
             if (decoder_input := prompt["decoder_prompt"]) is None:
                 decoder_inputs = None
             else:
-                decoder_inputs = self._prompt_to_llm_inputs(
-                    decoder_input,
-                    request_id=request_id,
-                )
+                decoder_inputs = self._prompt_to_llm_inputs(decoder_input)
             # For multimodal model, override decoder prompt from processor
             # with explicit decoder prompt.
             if self.model_config.is_multimodal_model and (
@@ -616,10 +598,7 @@ class InputPreprocessor:
                     self._separate_enc_dec_inputs_from_mm_processor_outputs(
                         encoder_inputs, decoder_inputs))
         else:
-            inputs = self._prompt_to_llm_inputs(
-                prompt,
-                request_id=request_id,
-            )
+            inputs = self._prompt_to_llm_inputs(prompt)
             if self.model_config.is_multimodal_model and (
                     self._can_process_multimodal()):
                 # Encoder-Decoder Multimodal model
@@ -636,7 +615,6 @@ class InputPreprocessor:
     async def _process_encoder_decoder_prompt_async(
         self,
         prompt: PromptType,
-        request_id: str,
     ) -> EncoderDecoderInputs:
         """Async version of :meth:`_process_encoder_decoder_prompt`."""
         encoder_inputs: SingletonInputs
@@ -644,18 +622,13 @@ class InputPreprocessor:
 
         if is_explicit_encoder_decoder_prompt(prompt):
             encoder_task = self._prompt_to_llm_inputs_async(
-                prompt["encoder_prompt"],
-                request_id=request_id,
-            )
+                prompt["encoder_prompt"])
 
             if (decoder_input := prompt["decoder_prompt"]) is None:
                 encoder_inputs = await encoder_task
                 decoder_inputs = None
             else:
-                decoder_task = self._prompt_to_llm_inputs_async(
-                    decoder_input,
-                    request_id=request_id,
-                )
+                decoder_task = self._prompt_to_llm_inputs_async(decoder_input)
 
                 encoder_inputs, decoder_inputs = await asyncio.gather(
                     encoder_task, decoder_task)
@@ -668,10 +641,7 @@ class InputPreprocessor:
                     self._separate_enc_dec_inputs_from_mm_processor_outputs(
                         encoder_inputs, decoder_inputs))
         else:
-            inputs = await self._prompt_to_llm_inputs_async(
-                prompt,
-                request_id=request_id,
-            )
+            inputs = await self._prompt_to_llm_inputs_async(prompt)
             if self.model_config.is_multimodal_model and (
                     self._can_process_multimodal()):
                 # Encoder-Decoder Multimodal model
@@ -704,7 +674,6 @@ class InputPreprocessor:
     def _process_decoder_only_prompt(
         self,
         prompt: SingletonPrompt,
-        request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         return_mm_hashes: bool = False,
@@ -716,7 +685,6 @@ class InputPreprocessor:
         Arguments:
 
         * prompt: input prompt
-        * request_id
         * lora_request
         * prompt_adapter_request
         * return_mm_hashes
@@ -728,7 +696,6 @@ class InputPreprocessor:
 
         prompt_comps = self._prompt_to_llm_inputs(
             prompt,
-            request_id=request_id,
             lora_request=lora_request,
             return_mm_hashes=return_mm_hashes,
         )
@@ -741,7 +708,6 @@ class InputPreprocessor:
     async def _process_decoder_only_prompt_async(
         self,
         prompt: SingletonPrompt,
-        request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         return_mm_hashes: bool = False,
@@ -749,7 +715,6 @@ class InputPreprocessor:
         """Async version of :meth:`_process_decoder_only_prompt`."""
         prompt_comps = await self._prompt_to_llm_inputs_async(
             prompt,
-            request_id=request_id,
             lora_request=lora_request,
             return_mm_hashes=return_mm_hashes,
         )
@@ -762,7 +727,6 @@ class InputPreprocessor:
     def preprocess(
         self,
         prompt: PromptType,
-        request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         return_mm_hashes: bool = False,
@@ -774,10 +738,7 @@ class InputPreprocessor:
                 "returned until they are supported on vLLM V1.")
             # Encoder-decoder model requires special mapping of
             # input prompts to encoder & decoder
-            return self._process_encoder_decoder_prompt(
-                prompt,
-                request_id=request_id,
-            )
+            return self._process_encoder_decoder_prompt(prompt)
 
         if is_explicit_encoder_decoder_prompt(prompt):
             raise ValueError("Cannot pass encoder-decoder prompt "
@@ -786,7 +747,6 @@ class InputPreprocessor:
         # Decoder-only operation
         return self._process_decoder_only_prompt(
             prompt,
-            request_id=request_id,
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request,
             return_mm_hashes=return_mm_hashes,
@@ -795,7 +755,6 @@ class InputPreprocessor:
     async def preprocess_async(
         self,
         prompt: PromptType,
-        request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         return_mm_hashes: bool = False,
@@ -807,10 +766,7 @@ class InputPreprocessor:
                 "returned until they are supported on vLLM V1.")
             # Encoder-decoder model requires special mapping of
             # input prompts to encoder & decoder
-            return await self._process_encoder_decoder_prompt_async(
-                prompt,
-                request_id=request_id,
-            )
+            return await self._process_encoder_decoder_prompt_async(prompt)
 
         if is_explicit_encoder_decoder_prompt(prompt):
             raise ValueError("Cannot pass encoder-decoder prompt "
@@ -819,7 +775,6 @@ class InputPreprocessor:
         # Decoder-only operation
         return await self._process_decoder_only_prompt_async(
             prompt,
-            request_id=request_id,
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request,
             return_mm_hashes=return_mm_hashes,