diff --git a/tests/entrypoints/openai/test_chat_error.py b/tests/entrypoints/openai/test_chat_error.py index 8a2894154..760ec8acb 100644 --- a/tests/entrypoints/openai/test_chat_error.py +++ b/tests/entrypoints/openai/test_chat_error.py @@ -45,7 +45,6 @@ class MockModelConfig: multimodal_config = MultiModalConfig() hf_config = MockHFConfig() hf_text_config = MockHFConfig() - logits_processor_pattern = None logits_processors: list[str] | None = None diff_sampling_param: dict | None = None allowed_local_media_path: str = "" diff --git a/tests/entrypoints/openai/test_completion_error.py b/tests/entrypoints/openai/test_completion_error.py index bbf97534f..800bf75f0 100644 --- a/tests/entrypoints/openai/test_completion_error.py +++ b/tests/entrypoints/openai/test_completion_error.py @@ -44,7 +44,6 @@ class MockModelConfig: tokenizer_revision = None multimodal_config = MultiModalConfig() hf_config = MockHFConfig() - logits_processor_pattern = None logits_processors: list[str] | None = None diff_sampling_param: dict | None = None allowed_local_media_path: str = "" diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index db7fbe2f8..56fe31556 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -45,7 +45,6 @@ class MockModelConfig: multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig) hf_config: MockHFConfig = field(default_factory=MockHFConfig) logits_processors: list[str] | None = None - logits_processor_pattern: str | None = None diff_sampling_param: dict | None = None allowed_local_media_path: str = "" allowed_media_domains: list[str] | None = None diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index ef9d944ab..b57f00ab7 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -521,7 +521,6 @@ class MockModelConfig: hf_config = MockHFConfig() hf_text_config = MockHFConfig() logits_processors: list[str] | None = None - logits_processor_pattern = None diff_sampling_param: dict | None = None allowed_local_media_path: str = "" allowed_media_domains: list[str] | None = None diff --git a/tests/v1/sample/test_sampling_params_e2e.py b/tests/v1/sample/test_sampling_params_e2e.py index a75a37bef..fff953323 100644 --- a/tests/v1/sample/test_sampling_params_e2e.py +++ b/tests/v1/sample/test_sampling_params_e2e.py @@ -144,20 +144,6 @@ def test_bad_words(llm): assert not contains_bad_word(new_text, new_tokens, bad_words_2) -def test_logits_processor(llm): - """Check that we reject logits processor.""" - - # This sample logits processor gives infinite score to the i-th token, - # where i is the length of the input sequence. - # We therefore expect the output token sequence to be [0, 1, 2, ...] - def pick_ith(token_ids, logits): - logits[len(token_ids)] = float("inf") - return logits - - with pytest.raises(ValueError): - _ = llm.generate(PROMPT, SamplingParams(logits_processors=[pick_ith])) - - def test_allowed_token_ids(llm): """Check that we can use allowed_token_ids.""" diff --git a/vllm/config/model.py b/vllm/config/model.py index 5fd7d2d73..0a5ff385f 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -252,10 +252,6 @@ class ModelConfig: hf_overrides: HfOverrides = field(default_factory=dict) """If a dictionary, contains arguments to be forwarded to the Hugging Face config. 
If a callable, it is called to update the HuggingFace config.""" - logits_processor_pattern: str | None = None - """Optional regex pattern specifying valid logits processor qualified names - that can be passed with the `logits_processors` extra completion argument. - Defaults to `None`, which allows no processors.""" generation_config: str = "auto" """The folder path to the generation config. Defaults to `"auto"`, the generation config will be loaded from model path. If set to `"vllm"`, no @@ -342,7 +338,6 @@ class ModelConfig: "config_format", "hf_token", "hf_overrides", - "logits_processor_pattern", "override_attention_dtype", "logits_processors", "io_processor_plugin", diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 2d1e2feb9..84176e207 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -508,8 +508,6 @@ class EngineArgs: reasoning_parser: str = StructuredOutputsConfig.reasoning_parser reasoning_parser_plugin: str | None = None - logits_processor_pattern: str | None = ModelConfig.logits_processor_pattern - speculative_config: dict[str, Any] | None = None show_hidden_metrics_for_version: str | None = ( @@ -710,9 +708,6 @@ class EngineArgs: ) model_group.add_argument("--hf-overrides", **model_kwargs["hf_overrides"]) model_group.add_argument("--pooler-config", **model_kwargs["pooler_config"]) - model_group.add_argument( - "--logits-processor-pattern", **model_kwargs["logits_processor_pattern"] - ) model_group.add_argument( "--generation-config", **model_kwargs["generation_config"] ) @@ -1320,7 +1315,6 @@ class EngineArgs: mm_encoder_tp_mode=self.mm_encoder_tp_mode, mm_encoder_attn_backend=self.mm_encoder_attn_backend, pooler_config=self.pooler_config, - logits_processor_pattern=self.logits_processor_pattern, generation_config=self.generation_config, override_generation_config=self.override_generation_config, enable_sleep_mode=self.enable_sleep_mode, @@ -1429,7 +1423,7 @@ class EngineArgs: self.model_weights = model_config.model_weights self.tokenizer = model_config.tokenizer - self._check_feature_supported(model_config) + self._check_feature_supported() self._set_default_chunked_prefill_and_prefix_caching_args(model_config) self._set_default_max_num_seqs_and_batched_tokens_args( usage_context, model_config @@ -1831,11 +1825,8 @@ class EngineArgs: return config - def _check_feature_supported(self, model_config: ModelConfig): + def _check_feature_supported(self): """Raise an error if the feature is not supported.""" - if self.logits_processor_pattern != EngineArgs.logits_processor_pattern: - _raise_unsupported_error(feature_name="--logits-processor-pattern") - # No Concurrent Partial Prefills so far. if ( self.max_num_partial_prefills != SchedulerConfig.max_num_partial_prefills diff --git a/vllm/entrypoints/openai/chat_completion/protocol.py b/vllm/entrypoints/openai/chat_completion/protocol.py index d905a59af..71e59152a 100644 --- a/vllm/entrypoints/openai/chat_completion/protocol.py +++ b/vllm/entrypoints/openai/chat_completion/protocol.py @@ -26,13 +26,11 @@ from vllm.entrypoints.openai.engine.protocol import ( FunctionCall, FunctionDefinition, LegacyStructuralTagResponseFormat, - LogitsProcessors, OpenAIBaseModel, StreamOptions, StructuralTagResponseFormat, ToolCall, UsageInfo, - get_logits_processors, ) from vllm.exceptions import VLLMValidationError from vllm.logger import init_logger @@ -293,19 +291,7 @@ class ChatCompletionRequest(OpenAIBaseModel): "through out the inference process and return in response." 
), ) - logits_processors: LogitsProcessors | None = Field( - default=None, - description=( - "A list of either qualified names of logits processors, or " - "constructor objects, to apply when sampling. A constructor is " - "a JSON object with a required 'qualname' field specifying the " - "qualified name of the processor class/factory, and optional " - "'args' and 'kwargs' fields containing positional and keyword " - "arguments. For example: {'qualname': " - "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': " - "{'param': 'value'}}." - ), - ) + return_tokens_as_token_ids: bool | None = Field( default=None, description=( @@ -324,6 +310,7 @@ class ChatCompletionRequest(OpenAIBaseModel): "need to map generated text back to input tokens." ), ) + cache_salt: str | None = Field( default=None, description=( @@ -335,6 +322,7 @@ class ChatCompletionRequest(OpenAIBaseModel): "to 256 bit)." ), ) + kv_transfer_params: dict[str, Any] | None = Field( default=None, description="KVTransfer parameters used for disaggregated serving.", @@ -417,7 +405,6 @@ class ChatCompletionRequest(OpenAIBaseModel): def to_sampling_params( self, max_tokens: int, - logits_processor_pattern: str | None, default_sampling_params: dict, ) -> SamplingParams: # Default parameters @@ -502,9 +489,6 @@ class ChatCompletionRequest(OpenAIBaseModel): min_tokens=self.min_tokens, skip_special_tokens=self.skip_special_tokens, spaces_between_special_tokens=self.spaces_between_special_tokens, - logits_processors=get_logits_processors( - self.logits_processors, logits_processor_pattern - ), include_stop_str_in_output=self.include_stop_str_in_output, truncate_prompt_tokens=self.truncate_prompt_tokens, output_kind=RequestOutputKind.DELTA diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py index 761ae9a50..7b54e6daf 100644 --- a/vllm/entrypoints/openai/chat_completion/serving.py +++ b/vllm/entrypoints/openai/chat_completion/serving.py @@ -86,7 +86,6 @@ from vllm.tool_parsers import ToolParser from vllm.tool_parsers.mistral_tool_parser import MistralToolCall from vllm.tool_parsers.utils import partial_json_loads from vllm.utils.collection_utils import as_list -from vllm.v1.sample.logits_processor import validate_logits_processors_parameters logger = init_logger(__name__) @@ -130,9 +129,6 @@ class OpenAIServingChat(OpenAIServing): self.enable_log_outputs = enable_log_outputs self.enable_log_deltas = enable_log_deltas - # set up logits processors - self.logits_processors = self.model_config.logits_processors - # set up reasoning parser self.reasoning_parser_cls = ParserManager.get_reasoning_parser( reasoning_parser_name=reasoning_parser @@ -403,13 +399,8 @@ class OpenAIServingChat(OpenAIServing): else: sampling_params = request.to_sampling_params( max_tokens, - self.model_config.logits_processor_pattern, self.default_sampling_params, ) - validate_logits_processors_parameters( - self.logits_processors, - sampling_params, - ) self._log_inputs( sub_request_id, diff --git a/vllm/entrypoints/openai/completion/protocol.py b/vllm/entrypoints/openai/completion/protocol.py index aab733082..904c9eca4 100644 --- a/vllm/entrypoints/openai/completion/protocol.py +++ b/vllm/entrypoints/openai/completion/protocol.py @@ -15,12 +15,10 @@ from vllm.config import ModelConfig from vllm.entrypoints.openai.engine.protocol import ( AnyResponseFormat, LegacyStructuralTagResponseFormat, - LogitsProcessors, OpenAIBaseModel, StreamOptions, StructuralTagResponseFormat, UsageInfo, - 
get_logits_processors, ) from vllm.exceptions import VLLMValidationError from vllm.logger import init_logger @@ -117,19 +115,6 @@ class CompletionRequest(OpenAIBaseModel): "through out the inference process and return in response." ), ) - logits_processors: LogitsProcessors | None = Field( - default=None, - description=( - "A list of either qualified names of logits processors, or " - "constructor objects, to apply when sampling. A constructor is " - "a JSON object with a required 'qualname' field specifying the " - "qualified name of the processor class/factory, and optional " - "'args' and 'kwargs' fields containing positional and keyword " - "arguments. For example: {'qualname': " - "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': " - "{'param': 'value'}}." - ), - ) return_tokens_as_token_ids: bool | None = Field( default=None, @@ -221,7 +206,6 @@ class CompletionRequest(OpenAIBaseModel): def to_sampling_params( self, max_tokens: int, - logits_processor_pattern: str | None, default_sampling_params: dict | None = None, ) -> SamplingParams: if default_sampling_params is None: @@ -312,9 +296,6 @@ class CompletionRequest(OpenAIBaseModel): skip_special_tokens=self.skip_special_tokens, spaces_between_special_tokens=self.spaces_between_special_tokens, include_stop_str_in_output=self.include_stop_str_in_output, - logits_processors=get_logits_processors( - self.logits_processors, logits_processor_pattern - ), truncate_prompt_tokens=self.truncate_prompt_tokens, output_kind=RequestOutputKind.DELTA if self.stream diff --git a/vllm/entrypoints/openai/completion/serving.py b/vllm/entrypoints/openai/completion/serving.py index 0353625fe..994cc094a 100644 --- a/vllm/entrypoints/openai/completion/serving.py +++ b/vllm/entrypoints/openai/completion/serving.py @@ -42,7 +42,6 @@ from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.tokenizers import TokenizerLike from vllm.utils.async_utils import merge_async_iterators from vllm.utils.collection_utils import as_list -from vllm.v1.sample.logits_processor import validate_logits_processors_parameters logger = init_logger(__name__) @@ -67,9 +66,6 @@ class OpenAIServingCompletion(OpenAIServing): log_error_stack=log_error_stack, ) - # set up logits processors - self.logits_processors = self.model_config.logits_processors - self.enable_prompt_tokens_details = enable_prompt_tokens_details self.enable_force_include_usage = enable_force_include_usage @@ -178,13 +174,8 @@ class OpenAIServingCompletion(OpenAIServing): else: sampling_params = request.to_sampling_params( max_tokens, - self.model_config.logits_processor_pattern, self.default_sampling_params, ) - validate_logits_processors_parameters( - self.logits_processors, - sampling_params, - ) request_id_item = f"{request_id}-{i}" diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index dd354190f..5603e5dc4 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -15,7 +15,6 @@ from pydantic.dataclasses import dataclass from vllm.config import ModelConfig, SpeculativeConfig, StructuredOutputsConfig from vllm.exceptions import VLLMValidationError from vllm.logger import init_logger -from vllm.logits_process import LogitsProcessor from vllm.tokenizers import TokenizerLike from vllm.v1.serial_utils import PydanticMsgspecMixin @@ -207,11 +206,6 @@ class SamplingParams( """Whether to skip special tokens in the output.""" spaces_between_special_tokens: bool = True """Whether to add spaces between special tokens in the output.""" - # `list[LogitsProcessor] | None` 
type. We use Any here because - # `list[LogitsProcessor] | None` type is not supported by msgspec. - logits_processors: Any | None = None - """Functions that modify logits based on previously generated tokens, and - optionally prompt tokens as a first argument.""" include_stop_str_in_output: bool = False """Whether to include the stop strings in output text.""" truncate_prompt_tokens: Annotated[int, msgspec.Meta(ge=-1)] | None = None @@ -277,7 +271,6 @@ class SamplingParams( detokenize: bool = True, skip_special_tokens: bool = True, spaces_between_special_tokens: bool = True, - logits_processors: list[LogitsProcessor] | None = None, truncate_prompt_tokens: Annotated[int, msgspec.Meta(ge=-1)] | None = None, output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE, structured_outputs: StructuredOutputsParams | None = None, @@ -318,7 +311,6 @@ class SamplingParams( detokenize=detokenize, skip_special_tokens=skip_special_tokens, spaces_between_special_tokens=spaces_between_special_tokens, - logits_processors=logits_processors, truncate_prompt_tokens=truncate_prompt_tokens, output_kind=output_kind, structured_outputs=structured_outputs, @@ -455,11 +447,6 @@ class SamplingParams( parameter="prompt_logprobs", value=self.prompt_logprobs, ) - if self.logits_processors: - # TODO: Remove `logits_processors` attribute - raise ValueError( - "vLLM V1 does not support per request user-provided logits processors." - ) if self.truncate_prompt_tokens is not None and ( self.truncate_prompt_tokens == 0 or self.truncate_prompt_tokens < -1 ): @@ -573,28 +560,11 @@ class SamplingParams( return self._bad_words_token_ids def clone(self) -> "SamplingParams": - """Deep copy, but maybe not the LogitsProcessor objects. - - LogitsProcessor objects may contain an arbitrary, nontrivial amount of - data that is expensive to copy. However, if not copied, the processor - needs to support parallel decoding for multiple sequences - See https://github.com/vllm-project/vllm/issues/3087 - - If skip_clone is True, uses shallow copy instead of deep copy. - """ - + """If skip_clone is True, uses shallow copy instead of deep copy.""" if self.skip_clone: return copy.copy(self) - logit_processor_refs = ( - None - if self.logits_processors is None - else { - id(lp): lp.clone() if hasattr(lp, "clone") else lp - for lp in self.logits_processors - } - ) - return copy.deepcopy(self, memo=logit_processor_refs) + return copy.deepcopy(self) def verify( self, @@ -605,6 +575,7 @@ class SamplingParams( ) -> None: self._validate_logprobs(model_config) self._validate_logit_bias(model_config) + self._validate_logits_processors(model_config) self._validate_allowed_token_ids(tokenizer) self._validate_spec_decode(speculative_config) self._validate_structured_outputs(structured_outputs_config, tokenizer) @@ -658,6 +629,13 @@ class SamplingParams( value=invalid_token_ids, ) + def _validate_logits_processors(self, model_config: ModelConfig) -> None: + from vllm.v1.sample.logits_processor import ( + validate_logits_processors_parameters, + ) + + validate_logits_processors_parameters(model_config.logits_processors, self) + def _validate_allowed_token_ids(self, tokenizer: TokenizerLike | None) -> None: allowed_token_ids = self.allowed_token_ids if allowed_token_ids is None:
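
Note (not part of the diff): the deleted test_logits_processor covered the old path where a per-request logits processor was rejected with a ValueError at generate time. With the logits_processors field and constructor argument removed from SamplingParams, the rejection now happens at construction time, since the keyword is simply no longer accepted. Below is a minimal sketch of an equivalent check; the test name is hypothetical and the expected exception is an assumption (pydantic dataclasses and msgspec structs typically raise TypeError for unexpected keywords, pydantic validation errors subclass ValueError), so it may need adjusting to the project's actual behavior.

import pytest

from vllm import SamplingParams


def pick_ith(token_ids, logits):
    # Same toy processor used by the removed test: give infinite score to
    # the i-th token, where i is the length of the input sequence.
    logits[len(token_ids)] = float("inf")
    return logits


def test_per_request_logits_processors_removed():
    # After this change, `logits_processors` is not a SamplingParams field,
    # so passing it per request should fail when the params are constructed.
    with pytest.raises((TypeError, ValueError)):
        SamplingParams(logits_processors=[pick_ith])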