[Misc] Reorganize inputs (#35182)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author: Cyrus Leung
Date: 2026-03-26 01:22:54 +08:00
Committed by: GitHub
parent 678b3c99e8
commit ba2f0acc2d
142 changed files with 1212 additions and 1342 deletions


@@ -57,9 +57,9 @@ from vllm.entrypoints.pooling.score.utils import (
     validate_score_input,
 )
 from vllm.entrypoints.utils import log_non_default_args
-from vllm.inputs.data import (
+from vllm.inputs import (
     DataPrompt,
-    ProcessorInputs,
+    EngineInput,
     PromptType,
     SingletonPrompt,
     TextPrompt,
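
For downstream code, the change in this hunk is mechanical: the import moves from `vllm.inputs.data` to `vllm.inputs`, and `ProcessorInputs` is renamed to `EngineInput`. A minimal compatibility sketch, assuming only the two spellings shown above, for code that must run on both sides of this commit:

    # Hedged shim: try the post-#35182 spelling first, fall back to the old one.
    try:
        from vllm.inputs import EngineInput
    except ImportError:
        from vllm.inputs.data import ProcessorInputs as EngineInput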
@@ -589,7 +589,7 @@ class LLM:
     def _resolve_mm_lora(
         self,
-        prompt: ProcessorInputs,
+        prompt: EngineInput,
         lora_request: LoRARequest | None,
     ) -> LoRARequest | None:
         if prompt["type"] != "multimodal":
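
The guard above works because `EngineInput` is a dict-like value discriminated by its "type" key; this diff shows "multimodal" (here) and "embeds" (in beam search below) among the tag values. A small sketch of checking that tag, with hypothetical helper names:

    # Only the "type" key and the "multimodal"/"embeds" values are confirmed
    # by this diff; the helper names are illustrative.
    def is_multimodal_input(engine_input: dict) -> bool:
        return engine_input["type"] == "multimodal"

    def is_embeds_input(engine_input: dict) -> bool:
        return engine_input["type"] == "embeds"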
@@ -716,8 +716,8 @@ class LLM:
         eos_token_id = tokenizer.eos_token_id
         sort_beams_key = create_sort_beams_key_function(eos_token_id, length_penalty)

-        engine_prompts = self._preprocess_cmpl(prompts)
-        lora_requests = self._lora_request_to_seq(lora_request, len(engine_prompts))
+        engine_inputs = self._preprocess_cmpl(prompts)
+        lora_requests = self._lora_request_to_seq(lora_request, len(engine_inputs))

         if use_tqdm and concurrency_limit is not None:
             logger.warning(
@@ -727,7 +727,7 @@ class LLM:
             use_tqdm = False

         if concurrency_limit is None:
-            concurrency_limit = len(engine_prompts)
+            concurrency_limit = len(engine_inputs)

         # generate 2 * beam_width candidates at each step
         # following the huggingface transformers implementation
@@ -740,7 +740,7 @@ class LLM:
         )

         instances: list[BeamSearchInstance] = []
-        for lora_req, prompt in zip(lora_requests, engine_prompts):
+        for lora_req, prompt in zip(lora_requests, engine_inputs):
             if prompt["type"] == "embeds":
                 raise NotImplementedError(
                     "Embedding prompt not supported for beam search"
@@ -845,7 +845,7 @@ class LLM:
         self,
         prompts: Sequence[PromptType],
         tokenization_kwargs: dict[str, Any] | None = None,
-    ) -> Sequence[ProcessorInputs]:
+    ) -> Sequence[EngineInput]:
         """
         Convert prompt inputs from LLM APIs (other than [LLM.chat][]) into
         a format that can be passed to `_add_request`.
@@ -853,7 +853,7 @@ class LLM:
         Refer to [LLM.generate][] for a complete description of the arguments.

         Returns:
-            A list of `ProcessorInputs` objects ready to be passed into LLMEngine.
+            A list of `EngineInput` objects ready to be passed into LLMEngine.
         """
         renderer = self.renderer
         model_config = self.model_config
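
`_preprocess_cmpl` is the funnel that turns user-facing `PromptType` values into `EngineInput` dicts before they reach `_add_request`. Since it is a private method, the following usage sketch is illustrative rather than a supported API, and the model id is a placeholder:

    from vllm import LLM

    llm = LLM(model="<your-model>")  # placeholder model id
    # One EngineInput per prompt, each carrying a "type" discriminator.
    engine_inputs = llm._preprocess_cmpl(["Hello, world"])
    assert all("type" in engine_input for engine_input in engine_inputs)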
@@ -871,9 +871,9 @@ class LLM:
         self,
         prompt: PromptType,
         tokenization_kwargs: dict[str, Any] | None = None,
-    ) -> ProcessorInputs:
-        (engine_prompt,) = self._preprocess_cmpl([prompt], tokenization_kwargs)
-        return engine_prompt
+    ) -> EngineInput:
+        (engine_input,) = self._preprocess_cmpl([prompt], tokenization_kwargs)
+        return engine_input

     def _preprocess_chat(
         self,
@@ -886,7 +886,7 @@ class LLM:
         tools: list[dict[str, Any]] | None = None,
         tokenization_kwargs: dict[str, Any] | None = None,
         mm_processor_kwargs: dict[str, Any] | None = None,
-    ) -> Sequence[ProcessorInputs]:
+    ) -> Sequence[EngineInput]:
         """
         Convert a list of conversations into prompts so that they can then
         be used as input for other LLM APIs.
@@ -894,7 +894,7 @@ class LLM:
         Refer to [LLM.chat][] for a complete description of the arguments.

         Returns:
-            A list of `ProcessorInputs` objects ready to be passed into LLMEngine.
+            A list of `EngineInput` objects ready to be passed into LLMEngine.
         """
         renderer = self.renderer
@@ -915,14 +915,14 @@ class LLM:
             **(tokenization_kwargs or {})
         )

-        _, engine_prompts = renderer.render_chat(
+        _, engine_inputs = renderer.render_chat(
             conversations,
             chat_params,
             tok_params,
             prompt_extras={"mm_processor_kwargs": mm_processor_kwargs},
         )

-        return engine_prompts
+        return engine_inputs

     def _preprocess_chat_one(
         self,
@@ -935,8 +935,8 @@ class LLM:
         tools: list[dict[str, Any]] | None = None,
         tokenization_kwargs: dict[str, Any] | None = None,
         mm_processor_kwargs: dict[str, Any] | None = None,
-    ) -> ProcessorInputs:
-        (engine_prompt,) = self._preprocess_chat(
+    ) -> EngineInput:
+        (engine_input,) = self._preprocess_chat(
             [conversation],
             chat_template=chat_template,
             chat_template_content_format=chat_template_content_format,
@@ -948,7 +948,7 @@ class LLM:
             mm_processor_kwargs=mm_processor_kwargs,
         )

-        return engine_prompt
+        return engine_input

     def chat(
         self,
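
`_preprocess_chat_one` (and the analogous completion wrapper above) depends on the plural method returning exactly one `EngineInput` for one conversation; the `(engine_input,) = ...` unpacking turns any other cardinality into an immediate error. The same pattern in isolation:

    # One-element tuple unpacking: fails loudly unless exactly one item exists.
    def first_and_only(items):
        (item,) = items  # ValueError if items has zero or multiple elements
        return item

    first_and_only(["ok"])  # -> "ok"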
@@ -1909,7 +1909,7 @@ class LLM:
     def _render_and_run_requests(
         self,
-        prompts: Iterable[ProcessorInputs],
+        prompts: Iterable[EngineInput],
         params: Sequence[SamplingParams | PoolingParams],
         output_type: type[_O],
         *,
@@ -1938,7 +1938,7 @@ class LLM:
     def _render_and_add_requests(
         self,
-        prompts: Iterable[ProcessorInputs],
+        prompts: Iterable[EngineInput],
         params: Sequence[SamplingParams | PoolingParams],
         *,
         lora_requests: Sequence[LoRARequest | None] | None = None,
@@ -1967,7 +1967,7 @@ class LLM:
     def _add_request(
         self,
-        prompt: ProcessorInputs,
+        prompt: EngineInput,
         params: SamplingParams | PoolingParams,
         lora_request: LoRARequest | None = None,
         priority: int = 0,
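
With this final hunk, every entry point converges on the same shape: prompts become `EngineInput` values, are optionally paired with LoRA requests, and are handed to `_add_request` one at a time. A condensed, non-authoritative mirror of that fan-out, keeping only the method names visible in this diff (anything beyond the per-input dispatch is omitted):

    from collections.abc import Iterable, Sequence

    # Hedged sketch of the core of _render_and_add_requests.
    def render_and_add(llm, engine_inputs: Iterable[dict], params: Sequence,
                       lora_requests: Sequence | None = None) -> None:
        lora_requests = lora_requests or [None] * len(params)
        for engine_input, p, lora in zip(engine_inputs, params, lora_requests):
            llm._add_request(engine_input, p, lora_request=lora)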