Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
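
The hunks below are mechanical: yapf's wrapping style (continuation lines aligned under the opening parenthesis, with the closing parenthesis attached to the last argument) is rewritten into the black-compatible style enforced by `ruff format` (arguments indented one level, a magic trailing comma, and a dedented closing parenthesis). A minimal before/after sketch of the two layouts, using an illustrative helper that is not part of this diff:

# Hypothetical example; only the wrapping style differs, behaviour is identical.
def check_logprobs(num_logprobs: int, max_logprobs: int) -> None:
    # Before (yapf): the closing parenthesis rides on the last argument:
    #     raise ValueError(f"Requested sample logprobs of {num_logprobs}, "
    #                      f"which is greater than max allowed: {max_logprobs}")
    # After (ruff format): one extra indent level and a dedented closing parenthesis:
    if num_logprobs > max_logprobs:
        raise ValueError(
            f"Requested sample logprobs of {num_logprobs}, "
            f"which is greater than max allowed: {max_logprobs}"
        )

check_logprobs(5, 20)  # within the limit, no error raised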
@@ -21,27 +21,25 @@ from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import length_from_prompt_token_ids_or_embeds
 from vllm.v1.engine import EngineCoreRequest
-from vllm.v1.structured_output.backend_guidance import (
-    validate_guidance_grammar)
+from vllm.v1.structured_output.backend_guidance import validate_guidance_grammar
 from vllm.v1.structured_output.backend_lm_format_enforcer import (
-    validate_structured_output_request_lm_format_enforcer)
+    validate_structured_output_request_lm_format_enforcer,
+)
 from vllm.v1.structured_output.backend_outlines import (
-    validate_structured_output_request_outlines)
-from vllm.v1.structured_output.backend_xgrammar import (
-    validate_xgrammar_grammar)
+    validate_structured_output_request_outlines,
+)
+from vllm.v1.structured_output.backend_xgrammar import validate_xgrammar_grammar
 
 logger = init_logger(__name__)
 
 
 class Processor:
-
     def __init__(
         self,
         vllm_config: VllmConfig,
         tokenizer: AnyTokenizer,
         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
     ):
-
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
         self.cache_config = vllm_config.cache_config
@@ -49,12 +47,10 @@ class Processor:
         self.structured_outputs_config = vllm_config.structured_outputs_config
         self.tokenizer = tokenizer
 
-        self.generation_config_fields = (
-            self.model_config.try_get_generation_config())
+        self.generation_config_fields = self.model_config.try_get_generation_config()
 
         self.mm_registry = mm_registry
-        self.mm_processor_cache = processor_cache_from_config(
-            vllm_config, mm_registry)
+        self.mm_processor_cache = processor_cache_from_config(vllm_config, mm_registry)
 
         self.input_preprocessor = InputPreprocessor(
             self.model_config,
@@ -79,7 +75,8 @@ class Processor:
             if num_logprobs > max_logprobs:
                 raise ValueError(
                     f"Requested sample logprobs of {num_logprobs}, "
-                    f"which is greater than max allowed: {max_logprobs}")
+                    f"which is greater than max allowed: {max_logprobs}"
+                )
 
         # Validate prompt logprobs.
         if params.prompt_logprobs:
@@ -89,7 +86,8 @@ class Processor:
             if num_prompt_logprobs > max_logprobs:
                 raise ValueError(
                     f"Requested prompt logprobs of {num_prompt_logprobs}, "
-                    f"which is greater than max allowed: {max_logprobs}")
+                    f"which is greater than max allowed: {max_logprobs}"
+                )
 
     def _validate_sampling_params(
         self,
@@ -108,8 +106,7 @@ class Processor:
             return
         vocab_size = len(self.tokenizer)
         if not all(0 <= tid < vocab_size for tid in params.allowed_token_ids):
-            raise ValueError(
-                "allowed_token_ids contains out-of-vocab token id!")
+            raise ValueError("allowed_token_ids contains out-of-vocab token id!")
 
     def _validate_logit_bias(
         self,
@@ -129,7 +126,8 @@ class Processor:
         if invalid_token_ids:
             raise ValueError(
                 f"token_id(s) {invalid_token_ids} in logit_bias contain "
-                f"out-of-vocab token ids. Vocabulary size: {vocab_size}")
+                f"out-of-vocab token ids. Vocabulary size: {vocab_size}"
+            )
 
     def _validate_supported_sampling_params(
         self,
@@ -140,8 +138,9 @@ class Processor:
             raise ValueError("vLLM V1 does not yet support best_of.")
         # Logits processors not supported.
         if params.logits_processors:
-            raise ValueError("vLLM V1 does not support per request "
-                             "user provided logits processors.")
+            raise ValueError(
+                "vLLM V1 does not support per request user provided logits processors."
+            )
 
     def _validate_params(
         self,
@@ -178,18 +177,23 @@ class Processor:
            for modality, items in mm_data.items():
                if modality in mm_uuids:
                    data_len = len(items) if isinstance(items, list) else 1
-                    uuid_len = len(mm_uuids[modality]) if isinstance(
-                        mm_uuids[modality], list) else 1
+                    uuid_len = (
+                        len(mm_uuids[modality])
+                        if isinstance(mm_uuids[modality], list)
+                        else 1
+                    )
                    if uuid_len != data_len:
                        raise ValueError(
                            f"multi_modal_uuids for modality '{modality}' "
                            "must have same length as data: got "
                            f"{uuid_len} uuids vs "
-                            f"{data_len} items.")
+                            f"{data_len} items."
+                        )
                else:
                    raise ValueError(
                        f"multi_modal_uuids for modality '{modality}' must "
-                        "be provided if multi_modal_data is provided.")
+                        "be provided if multi_modal_data is provided."
+                    )
 
        # Handle explicit encoder/decoder prompts or singleton prompt
        if isinstance(prompt, dict) and "encoder_prompt" in prompt:
@@ -208,8 +212,9 @@ class Processor:
 
         # LoRA request passed in while LoRA is not enabled
         if not self.lora_config:
-            raise ValueError(f"Got lora_request {lora_request} but LoRA is "
-                             "not enabled!")
+            raise ValueError(
+                f"Got lora_request {lora_request} but LoRA is not enabled!"
+            )
 
         if self.tokenizer is not None:
             logger.warning_once(
@@ -217,7 +222,8 @@ class Processor:
                 "tokenizers for different LoRAs. By default, vLLM uses base "
                 "model's tokenizer. If you are using a LoRA "
                 "with its own tokenizer, consider specifying `--tokenizer "
-                "[lora_path]` to use the LoRA tokenizer.")
+                "[lora_path]` to use the LoRA tokenizer."
+            )
 
     def _validate_structured_output(self, params: SamplingParams) -> None:
         if not params.structured_outputs or not self.structured_outputs_config:
@@ -235,20 +241,23 @@ class Processor:
             # to a specific backend based on `auto` behavior in a previous
             # request. We remember that it was set as a result of `auto`
             # using the `_backend_was_auto` field set in the params.
-            if (backend != _backend
-                    and not (backend == "auto"
-                             and params.structured_outputs._backend_was_auto)):
+            if backend != _backend and not (
+                backend == "auto" and params.structured_outputs._backend_was_auto
+            ):
                 raise ValueError(
                     "Request-level structured output backend selection is not "
                     f"supported. The request specified '{_backend}', but vLLM "
                     f"was initialised with '{backend}'. This error can be "
-                    "resolved by removing '_backend' from the request.")
+                    "resolved by removing '_backend' from the request."
+                )
         else:
             params.structured_outputs._backend = backend
 
         # Request content validation
-        if (isinstance(params.structured_outputs.choice, list)
-                and not params.structured_outputs.choice):
+        if (
+            isinstance(params.structured_outputs.choice, list)
+            and not params.structured_outputs.choice
+        ):
             # It is invalid for choice to be an empty list
             raise ValueError(
                 f"Choice '{params.structured_outputs.choice}' cannot be an empty list"  # noqa: E501
@@ -318,9 +327,7 @@ class Processor:
         mm_uuids: MultiModalUUIDDict = {}
         for modality, data in mm_data.items():
             n = len(data) if isinstance(data, list) else 1
-            mm_uuids[modality] = [
-                f"{request_id}-{modality}-{i}" for i in range(n)
-            ]
+            mm_uuids[modality] = [f"{request_id}-{modality}-{i}" for i in range(n)]
         return mm_uuids
 
     def process_inputs(
@@ -339,10 +346,13 @@ class Processor:
         self._validate_params(params)
 
         data_parallel_size = self.vllm_config.parallel_config.data_parallel_size
-        if data_parallel_rank is not None and not (0 <= data_parallel_rank <
-                                                   data_parallel_size):
-            raise ValueError(f"data_parallel_rank {data_parallel_rank} "
-                             f"is out of range [0, {data_parallel_size}).")
+        if data_parallel_rank is not None and not (
+            0 <= data_parallel_rank < data_parallel_size
+        ):
+            raise ValueError(
+                f"data_parallel_rank {data_parallel_rank} "
+                f"is out of range [0, {data_parallel_size})."
+            )
 
         if arrival_time is None:
             arrival_time = time.time()
@@ -355,9 +365,11 @@ class Processor:
         # reused across requests, therefore identifying multimodal data items
         # by their content is no longer necessary, and we create uuids with
         # request id-modality-index as multimodal hash overrides.
-        if (self.model_config.multimodal_config and
-                self.model_config.multimodal_config.mm_processor_cache_gb == 0
-                and not self.cache_config.enable_prefix_caching):
+        if (
+            self.model_config.multimodal_config
+            and self.model_config.multimodal_config.mm_processor_cache_gb == 0
+            and not self.cache_config.enable_prefix_caching
+        ):
             mm_uuids = self._maybe_build_mm_uuids(request_id, prompt)
         else:
             # Otherwise, use user-provided uuids as multimodal hash overrides
@@ -378,6 +390,7 @@ class Processor:
             mm_uuids=mm_uuids,
         )
         from vllm.platforms import current_platform
+
         current_platform.validate_request(
             prompt=prompt,
             params=params,
@@ -393,10 +406,16 @@ class Processor:
         # discriminated unions of TypedDicts, because of how it handles
         # inheritance of TypedDict. If we explicitly extract the items we want
         # we can avoid type errors from using `dict.get` later in the method.
-        prompt_token_ids = decoder_inputs[
-            "prompt_token_ids"] if decoder_inputs["type"] != "embeds" else None
-        prompt_embeds = decoder_inputs["prompt_embeds"] if decoder_inputs[
-            "type"] == "embeds" else None
+        prompt_token_ids = (
+            decoder_inputs["prompt_token_ids"]
+            if decoder_inputs["type"] != "embeds"
+            else None
+        )
+        prompt_embeds = (
+            decoder_inputs["prompt_embeds"]
+            if decoder_inputs["type"] == "embeds"
+            else None
+        )
 
         sampling_params = None
         pooling_params = None
@@ -406,11 +425,12 @@ class Processor:
             # If unset max tokens, then generate up to the max_model_len.
             if sampling_params.max_tokens is None:
                 seq_len = length_from_prompt_token_ids_or_embeds(
-                    prompt_token_ids, prompt_embeds)
-                sampling_params.max_tokens = \
-                    self.model_config.max_model_len - seq_len
+                    prompt_token_ids, prompt_embeds
+                )
+                sampling_params.max_tokens = self.model_config.max_model_len - seq_len
             sampling_params.update_from_generation_config(
-                self.generation_config_fields, eos_token_id)
+                self.generation_config_fields, eos_token_id
+            )
             if self.tokenizer is not None:
                 sampling_params.update_from_tokenizer(self.tokenizer)
         else:
@@ -436,7 +456,9 @@ class Processor:
                            data=decoder_mm_inputs[modality][idx],
                            modality=modality,
                            identifier=decoder_mm_hashes[modality][idx],
-                            mm_position=decoder_mm_positions[modality][idx]))
+                            mm_position=decoder_mm_positions[modality][idx],
+                        )
+                    )
 
        return EngineCoreRequest(
            request_id=request_id,
@@ -454,8 +476,9 @@ class Processor:
             trace_headers=trace_headers,
         )
 
-    def _validate_model_inputs(self, encoder_inputs: Optional[SingletonInputs],
-                               decoder_inputs: SingletonInputs):
+    def _validate_model_inputs(
+        self, encoder_inputs: Optional[SingletonInputs], decoder_inputs: SingletonInputs
+    ):
         if encoder_inputs is not None:
             self._validate_model_input(encoder_inputs, prompt_type="encoder")
 
@@ -469,12 +492,17 @@ class Processor:
     ):
         model_config = self.model_config
 
-        prompt_ids = None if prompt_inputs[
-            "type"] == "embeds" else prompt_inputs["prompt_token_ids"]
-        prompt_embeds = prompt_inputs["prompt_embeds"] if prompt_inputs[
-            "type"] == "embeds" else None
-        prompt_len = length_from_prompt_token_ids_or_embeds(
-            prompt_ids, prompt_embeds)
+        prompt_ids = (
+            None
+            if prompt_inputs["type"] == "embeds"
+            else prompt_inputs["prompt_token_ids"]
+        )
+        prompt_embeds = (
+            prompt_inputs["prompt_embeds"]
+            if prompt_inputs["type"] == "embeds"
+            else None
+        )
+        prompt_len = length_from_prompt_token_ids_or_embeds(prompt_ids, prompt_embeds)
         if not prompt_ids:
             if prompt_type == "encoder" and model_config.is_multimodal_model:
                 pass  # Mllama may have empty encoder inputs for text-only data
@@ -499,10 +527,10 @@ class Processor:
 
             # Here we take the max of the two to determine if a token id is
             # truly out-of-vocabulary.
-            if max_input_id > max(tokenizer.max_token_id,
-                                  self.model_config.get_vocab_size() - 1):
-                raise ValueError(
-                    f"Token id {max_input_id} is out of vocabulary")
+            if max_input_id > max(
+                tokenizer.max_token_id, self.model_config.get_vocab_size() - 1
+            ):
+                raise ValueError(f"Token id {max_input_id} is out of vocabulary")
 
         max_prompt_len = self.model_config.max_model_len
         if prompt_len > max_prompt_len:
@@ -522,16 +550,19 @@ class Processor:
                     "Make sure that `max_model_len` is no smaller than the "
                     "number of text tokens plus multimodal tokens. For image "
                     "inputs, the number of image tokens depends on the number "
-                    "of images, and possibly their aspect ratios as well.")
+                    "of images, and possibly their aspect ratios as well."
+                )
             else:
                 suggestion = (
                     "Make sure that `max_model_len` is no smaller than the "
-                    "number of text tokens.")
+                    "number of text tokens."
+                )
 
             raise ValueError(
                 f"The {prompt_type} prompt (length {prompt_len}) is "
                 f"longer than the maximum model length of {max_prompt_len}. "
-                f"{suggestion}")
+                f"{suggestion}"
+            )
 
         # TODO: Find out how many placeholder tokens are there so we can
         # check that chunked prefill does not truncate them