Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Author: Harry Mellor
Date: 2025-10-05 15:06:22 +01:00
Committed by: GitHub
Parent: 17edd8a807
Commit: d6953beb91
1508 changed files with 115244 additions and 94146 deletions
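The style shift this commit applies across the tree is easiest to see in miniature. Below is a minimal sketch of the before/after wrapping (illustrative only; build_request is a hypothetical helper, not part of vLLM or of this commit):

# Illustrative sketch only, not taken from this diff.
def build_request(request_id: str, prompt: str, arrival_time: float) -> dict:
    return {"request_id": request_id, "prompt": prompt, "arrival_time": arrival_time}

# yapf + isort (before): continuation arguments hang under the opening parenthesis
# and the closing parenthesis stays on the last argument's line.
request_old_style = build_request("request-0",
                                  "An example prompt that forces wrapping",
                                  arrival_time=0.0)

# ruff format (after): one argument per line once the call exceeds the line limit,
# a trailing comma, and the closing parenthesis dedented onto its own line.
request_new_style = build_request(
    "request-0",
    "An example prompt that forces wrapping",
    arrival_time=0.0,
)

The same pattern repeats throughout the file diff that follows: hanging-indent continuations and same-line closing parentheses become one-item-per-line calls and conditions with dedented closing brackets.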


@@ -21,27 +21,25 @@ from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import length_from_prompt_token_ids_or_embeds
 from vllm.v1.engine import EngineCoreRequest
-from vllm.v1.structured_output.backend_guidance import (
-    validate_guidance_grammar)
+from vllm.v1.structured_output.backend_guidance import validate_guidance_grammar
 from vllm.v1.structured_output.backend_lm_format_enforcer import (
-    validate_structured_output_request_lm_format_enforcer)
+    validate_structured_output_request_lm_format_enforcer,
+)
 from vllm.v1.structured_output.backend_outlines import (
-    validate_structured_output_request_outlines)
-from vllm.v1.structured_output.backend_xgrammar import (
-    validate_xgrammar_grammar)
+    validate_structured_output_request_outlines,
+)
+from vllm.v1.structured_output.backend_xgrammar import validate_xgrammar_grammar
 
 logger = init_logger(__name__)
 
 
 class Processor:
     def __init__(
         self,
         vllm_config: VllmConfig,
         tokenizer: AnyTokenizer,
         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
     ):
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
         self.cache_config = vllm_config.cache_config
@@ -49,12 +47,10 @@ class Processor:
         self.structured_outputs_config = vllm_config.structured_outputs_config
         self.tokenizer = tokenizer
 
-        self.generation_config_fields = (
-            self.model_config.try_get_generation_config())
+        self.generation_config_fields = self.model_config.try_get_generation_config()
 
         self.mm_registry = mm_registry
-        self.mm_processor_cache = processor_cache_from_config(
-            vllm_config, mm_registry)
+        self.mm_processor_cache = processor_cache_from_config(vllm_config, mm_registry)
 
         self.input_preprocessor = InputPreprocessor(
             self.model_config,
@@ -79,7 +75,8 @@ class Processor:
             if num_logprobs > max_logprobs:
                 raise ValueError(
                     f"Requested sample logprobs of {num_logprobs}, "
-                    f"which is greater than max allowed: {max_logprobs}")
+                    f"which is greater than max allowed: {max_logprobs}"
+                )
 
         # Validate prompt logprobs.
         if params.prompt_logprobs:
@@ -89,7 +86,8 @@ class Processor:
             if num_prompt_logprobs > max_logprobs:
                 raise ValueError(
                     f"Requested prompt logprobs of {num_prompt_logprobs}, "
-                    f"which is greater than max allowed: {max_logprobs}")
+                    f"which is greater than max allowed: {max_logprobs}"
+                )
 
     def _validate_sampling_params(
         self,
@@ -108,8 +106,7 @@ class Processor:
             return
         vocab_size = len(self.tokenizer)
         if not all(0 <= tid < vocab_size for tid in params.allowed_token_ids):
-            raise ValueError(
-                "allowed_token_ids contains out-of-vocab token id!")
+            raise ValueError("allowed_token_ids contains out-of-vocab token id!")
 
     def _validate_logit_bias(
         self,
@@ -129,7 +126,8 @@ class Processor:
         if invalid_token_ids:
             raise ValueError(
                 f"token_id(s) {invalid_token_ids} in logit_bias contain "
-                f"out-of-vocab token ids. Vocabulary size: {vocab_size}")
+                f"out-of-vocab token ids. Vocabulary size: {vocab_size}"
+            )
 
     def _validate_supported_sampling_params(
         self,
@@ -140,8 +138,9 @@ class Processor:
raise ValueError("vLLM V1 does not yet support best_of.")
# Logits processors not supported.
if params.logits_processors:
raise ValueError("vLLM V1 does not support per request "
"user provided logits processors.")
raise ValueError(
"vLLM V1 does not support per request user provided logits processors."
)
def _validate_params(
self,
@@ -178,18 +177,23 @@ class Processor:
             for modality, items in mm_data.items():
                 if modality in mm_uuids:
                     data_len = len(items) if isinstance(items, list) else 1
-                    uuid_len = len(mm_uuids[modality]) if isinstance(
-                        mm_uuids[modality], list) else 1
+                    uuid_len = (
+                        len(mm_uuids[modality])
+                        if isinstance(mm_uuids[modality], list)
+                        else 1
+                    )
                     if uuid_len != data_len:
                         raise ValueError(
                             f"multi_modal_uuids for modality '{modality}' "
                             "must have same length as data: got "
                             f"{uuid_len} uuids vs "
-                            f"{data_len} items.")
+                            f"{data_len} items."
+                        )
                 else:
                     raise ValueError(
                         f"multi_modal_uuids for modality '{modality}' must "
-                        "be provided if multi_modal_data is provided.")
+                        "be provided if multi_modal_data is provided."
+                    )
 
         # Handle explicit encoder/decoder prompts or singleton prompt
         if isinstance(prompt, dict) and "encoder_prompt" in prompt:
@@ -208,8 +212,9 @@ class Processor:
 
         # LoRA request passed in while LoRA is not enabled
         if not self.lora_config:
-            raise ValueError(f"Got lora_request {lora_request} but LoRA is "
-                             "not enabled!")
+            raise ValueError(
+                f"Got lora_request {lora_request} but LoRA is not enabled!"
+            )
 
         if self.tokenizer is not None:
             logger.warning_once(
@@ -217,7 +222,8 @@ class Processor:
"tokenizers for different LoRAs. By default, vLLM uses base "
"model's tokenizer. If you are using a LoRA "
"with its own tokenizer, consider specifying `--tokenizer "
"[lora_path]` to use the LoRA tokenizer.")
"[lora_path]` to use the LoRA tokenizer."
)
def _validate_structured_output(self, params: SamplingParams) -> None:
if not params.structured_outputs or not self.structured_outputs_config:
@@ -235,20 +241,23 @@ class Processor:
             # to a specific backend based on `auto` behavior in a previous
             # request. We remember that it was set as a result of `auto`
             # using the `_backend_was_auto` field set in the params.
-            if (backend != _backend
-                    and not (backend == "auto"
-                             and params.structured_outputs._backend_was_auto)):
+            if backend != _backend and not (
+                backend == "auto" and params.structured_outputs._backend_was_auto
+            ):
                 raise ValueError(
                     "Request-level structured output backend selection is not "
                     f"supported. The request specified '{_backend}', but vLLM "
                     f"was initialised with '{backend}'. This error can be "
-                    "resolved by removing '_backend' from the request.")
+                    "resolved by removing '_backend' from the request."
+                )
         else:
             params.structured_outputs._backend = backend
 
         # Request content validation
-        if (isinstance(params.structured_outputs.choice, list)
-                and not params.structured_outputs.choice):
+        if (
+            isinstance(params.structured_outputs.choice, list)
+            and not params.structured_outputs.choice
+        ):
             # It is invalid for choice to be an empty list
             raise ValueError(
                 f"Choice '{params.structured_outputs.choice}' cannot be an empty list"  # noqa: E501
@@ -318,9 +327,7 @@ class Processor:
         mm_uuids: MultiModalUUIDDict = {}
         for modality, data in mm_data.items():
             n = len(data) if isinstance(data, list) else 1
-            mm_uuids[modality] = [
-                f"{request_id}-{modality}-{i}" for i in range(n)
-            ]
+            mm_uuids[modality] = [f"{request_id}-{modality}-{i}" for i in range(n)]
         return mm_uuids
 
     def process_inputs(
@@ -339,10 +346,13 @@ class Processor:
         self._validate_params(params)
 
         data_parallel_size = self.vllm_config.parallel_config.data_parallel_size
-        if data_parallel_rank is not None and not (0 <= data_parallel_rank <
-                                                   data_parallel_size):
-            raise ValueError(f"data_parallel_rank {data_parallel_rank} "
-                             f"is out of range [0, {data_parallel_size}).")
+        if data_parallel_rank is not None and not (
+            0 <= data_parallel_rank < data_parallel_size
+        ):
+            raise ValueError(
+                f"data_parallel_rank {data_parallel_rank} "
+                f"is out of range [0, {data_parallel_size})."
+            )
 
         if arrival_time is None:
             arrival_time = time.time()
@@ -355,9 +365,11 @@ class Processor:
         # reused across requests, therefore identifying multimodal data items
         # by their content is no longer necessary, and we create uuids with
         # request id-modality-index as multimodal hash overrides.
-        if (self.model_config.multimodal_config and
-                self.model_config.multimodal_config.mm_processor_cache_gb == 0
-                and not self.cache_config.enable_prefix_caching):
+        if (
+            self.model_config.multimodal_config
+            and self.model_config.multimodal_config.mm_processor_cache_gb == 0
+            and not self.cache_config.enable_prefix_caching
+        ):
             mm_uuids = self._maybe_build_mm_uuids(request_id, prompt)
         else:
             # Otherwise, use user-provided uuids as multimodal hash overrides
@@ -378,6 +390,7 @@ class Processor:
             mm_uuids=mm_uuids,
         )
         from vllm.platforms import current_platform
+
         current_platform.validate_request(
             prompt=prompt,
             params=params,
@@ -393,10 +406,16 @@ class Processor:
         # discriminated unions of TypedDicts, because of how it handles
         # inheritance of TypedDict. If we explicitly extract the items we want
         # we can avoid type errors from using `dict.get` later in the method.
-        prompt_token_ids = decoder_inputs[
-            "prompt_token_ids"] if decoder_inputs["type"] != "embeds" else None
-        prompt_embeds = decoder_inputs["prompt_embeds"] if decoder_inputs[
-            "type"] == "embeds" else None
+        prompt_token_ids = (
+            decoder_inputs["prompt_token_ids"]
+            if decoder_inputs["type"] != "embeds"
+            else None
+        )
+        prompt_embeds = (
+            decoder_inputs["prompt_embeds"]
+            if decoder_inputs["type"] == "embeds"
+            else None
+        )
 
         sampling_params = None
         pooling_params = None
@@ -406,11 +425,12 @@ class Processor:
             # If unset max tokens, then generate up to the max_model_len.
             if sampling_params.max_tokens is None:
                 seq_len = length_from_prompt_token_ids_or_embeds(
-                    prompt_token_ids, prompt_embeds)
-                sampling_params.max_tokens = \
-                    self.model_config.max_model_len - seq_len
+                    prompt_token_ids, prompt_embeds
+                )
+                sampling_params.max_tokens = self.model_config.max_model_len - seq_len
             sampling_params.update_from_generation_config(
-                self.generation_config_fields, eos_token_id)
+                self.generation_config_fields, eos_token_id
+            )
             if self.tokenizer is not None:
                 sampling_params.update_from_tokenizer(self.tokenizer)
             else:
@@ -436,7 +456,9 @@ class Processor:
                             data=decoder_mm_inputs[modality][idx],
                             modality=modality,
                             identifier=decoder_mm_hashes[modality][idx],
-                            mm_position=decoder_mm_positions[modality][idx]))
+                            mm_position=decoder_mm_positions[modality][idx],
+                        )
+                    )
 
         return EngineCoreRequest(
             request_id=request_id,
@@ -454,8 +476,9 @@ class Processor:
             trace_headers=trace_headers,
         )
 
-    def _validate_model_inputs(self, encoder_inputs: Optional[SingletonInputs],
-                               decoder_inputs: SingletonInputs):
+    def _validate_model_inputs(
+        self, encoder_inputs: Optional[SingletonInputs], decoder_inputs: SingletonInputs
+    ):
         if encoder_inputs is not None:
             self._validate_model_input(encoder_inputs, prompt_type="encoder")
 
@@ -469,12 +492,17 @@ class Processor:
     ):
         model_config = self.model_config
 
-        prompt_ids = None if prompt_inputs[
-            "type"] == "embeds" else prompt_inputs["prompt_token_ids"]
-        prompt_embeds = prompt_inputs["prompt_embeds"] if prompt_inputs[
-            "type"] == "embeds" else None
-        prompt_len = length_from_prompt_token_ids_or_embeds(
-            prompt_ids, prompt_embeds)
+        prompt_ids = (
+            None
+            if prompt_inputs["type"] == "embeds"
+            else prompt_inputs["prompt_token_ids"]
+        )
+        prompt_embeds = (
+            prompt_inputs["prompt_embeds"]
+            if prompt_inputs["type"] == "embeds"
+            else None
+        )
+        prompt_len = length_from_prompt_token_ids_or_embeds(prompt_ids, prompt_embeds)
         if not prompt_ids:
             if prompt_type == "encoder" and model_config.is_multimodal_model:
                 pass  # Mllama may have empty encoder inputs for text-only data
@@ -499,10 +527,10 @@ class Processor:
 
             # Here we take the max of the two to determine if a token id is
             # truly out-of-vocabulary.
-            if max_input_id > max(tokenizer.max_token_id,
-                                  self.model_config.get_vocab_size() - 1):
-                raise ValueError(
-                    f"Token id {max_input_id} is out of vocabulary")
+            if max_input_id > max(
+                tokenizer.max_token_id, self.model_config.get_vocab_size() - 1
+            ):
+                raise ValueError(f"Token id {max_input_id} is out of vocabulary")
 
         max_prompt_len = self.model_config.max_model_len
         if prompt_len > max_prompt_len:
@@ -522,16 +550,19 @@ class Processor:
"Make sure that `max_model_len` is no smaller than the "
"number of text tokens plus multimodal tokens. For image "
"inputs, the number of image tokens depends on the number "
"of images, and possibly their aspect ratios as well.")
"of images, and possibly their aspect ratios as well."
)
else:
suggestion = (
"Make sure that `max_model_len` is no smaller than the "
"number of text tokens.")
"number of text tokens."
)
raise ValueError(
f"The {prompt_type} prompt (length {prompt_len}) is "
f"longer than the maximum model length of {max_prompt_len}. "
f"{suggestion}")
f"{suggestion}"
)
# TODO: Find out how many placeholder tokens are there so we can
# check that chunked prefill does not truncate them