[V1] Remove legacy input registry (#15673)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
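This commit removes the legacy InputRegistry plumbing from the V1 engine. The input_registry parameter (defaulting to INPUT_REGISTRY) disappears from the AsyncLLM, LLMEngine, and Processor constructors, and the SingletonInputsAdapter wrapper around decoder inputs is dropped, so processed inputs are read as plain mappings (decoder_inputs["..."]) instead of through adapter attributes. The hunks below apply the change to each affected class in turn.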
@@ -14,10 +14,11 @@ from vllm.config import ModelConfig, VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.protocol import EngineClient
 from vllm.envs import VLLM_V1_OUTPUT_PROC_CHUNK_SIZE
-from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType
+from vllm.inputs import PromptType
+from vllm.inputs.preprocess import InputPreprocessor
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.outputs import RequestOutput
 from vllm.pooling_params import PoolingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
@@ -48,7 +49,7 @@ class AsyncLLM(EngineClient):
         executor_class: type[Executor],
         log_stats: bool,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-        input_registry: InputRegistry = INPUT_REGISTRY,
         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
         use_cached_outputs: bool = False,
         log_requests: bool = True,
         start_engine_loop: bool = True,
@@ -90,7 +91,7 @@ class AsyncLLM(EngineClient):
         self.processor = Processor(
             vllm_config=vllm_config,
             tokenizer=self.tokenizer,
-            input_registry=input_registry,
             mm_registry=mm_registry,
         )

         # OutputProcessor (converts EngineCoreOutputs --> RequestOutput).
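For callers, the visible effect is a narrower constructor: input_registry is no longer accepted. A minimal sketch of the new call shape; vllm_config and executor_class are assumed to have been built elsewhere (e.g. from AsyncEngineArgs) and are not defined here:

    from vllm.multimodal import MULTIMODAL_REGISTRY
    from vllm.v1.engine.async_llm import AsyncLLM

    # Sketch only: vllm_config and executor_class are assumed to exist.
    engine = AsyncLLM(
        vllm_config=vllm_config,
        executor_class=executor_class,
        log_stats=False,
        # Passing input_registry= here would now raise a TypeError.
        mm_registry=MULTIMODAL_REGISTRY,
    )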
@@ -11,7 +11,7 @@ from vllm.config import ParallelConfig, VllmConfig
 from vllm.distributed import stateless_destroy_torch_distributed_process_group
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.metrics_types import StatLoggerBase
-from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType
+from vllm.inputs import PromptType
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
@@ -44,7 +44,6 @@ class LLMEngine:
         log_stats: bool,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
         stat_loggers: Optional[dict[str, StatLoggerBase]] = None,
-        input_registry: InputRegistry = INPUT_REGISTRY,
         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
         use_cached_outputs: bool = False,
         multiprocess_mode: bool = False,
@@ -80,7 +79,6 @@ class LLMEngine:
         # Processor (convert Inputs --> EngineCoreRequests)
         self.processor = Processor(vllm_config=vllm_config,
                                    tokenizer=self.tokenizer,
-                                   input_registry=input_registry,
                                    mm_registry=mm_registry)

         # OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
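The synchronous LLMEngine above mirrors the AsyncLLM change one-to-one: the same import is trimmed and the same keyword argument is deleted. The remaining hunks update Processor itself, where removing the SingletonInputsAdapter wrapper is what forces the switch from attribute access to key access on decoder_inputs.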
@@ -5,8 +5,7 @@ from collections.abc import Mapping
 from typing import Optional, Union

 from vllm.config import VllmConfig
-from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs,
-                         PromptType, SingletonInputsAdapter)
+from vllm.inputs import ProcessorInputs, PromptType
 from vllm.inputs.parse import split_enc_dec_inputs
 from vllm.inputs.preprocess import InputPreprocessor
 from vllm.lora.request import LoRARequest
@@ -31,7 +30,6 @@ class Processor:
         self,
         vllm_config: VllmConfig,
         tokenizer: BaseTokenizerGroup,
-        input_registry: InputRegistry = INPUT_REGISTRY,
         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
     ):

@@ -210,7 +208,6 @@ class Processor:
         self._validate_model_inputs(processed_inputs, lora_request)

         encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
-        decoder_inputs = SingletonInputsAdapter(decoder_inputs)

         # TODO: Impl encoder-decoder
         if encoder_inputs is not None:
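With the adapter gone, decoder_inputs is consumed as a plain mapping, which explains the access-pattern changes in the hunks that follow. A rough sketch with a hypothetical stand-in for the output of split_enc_dec_inputs() (real decoder inputs carry more keys than shown):

    # Hypothetical example; "token" and the key names follow the diff above.
    decoder_inputs = {
        "type": "token",
        "prompt": "Hello, world",
        "prompt_token_ids": [15496, 11, 995],
    }

    prompt = decoder_inputs.get("prompt")           # optional key -> .get()
    token_ids = decoder_inputs["prompt_token_ids"]  # required key -> [...]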
@@ -221,8 +218,9 @@ class Processor:
         sampling_params = params.clone()
         # If unset max tokens, then generate up to the max_model_len.
         if sampling_params.max_tokens is None:
-            sampling_params.max_tokens = (self.model_config.max_model_len -
-                                          len(decoder_inputs.prompt_token_ids))
+            sampling_params.max_tokens = (
+                self.model_config.max_model_len -
+                len(decoder_inputs["prompt_token_ids"]))
         sampling_params.update_from_generation_config(
             self.generation_config_fields, eos_token_id)
         sampling_params.update_from_tokenizer(
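The default-max_tokens rule itself is unchanged by the reformatting: an unset max_tokens becomes the model's context length minus the prompt length. With illustrative numbers (not from the source):

    # Hypothetical figures: a 4096-token context and a 1000-token prompt
    # leave room for at most 3096 generated tokens.
    max_model_len = 4096
    prompt_token_ids = list(range(1000))
    max_tokens = max_model_len - len(prompt_token_ids)
    assert max_tokens == 3096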
@@ -232,8 +230,8 @@ class Processor:
         sorted_mm_inputs: Optional[list[MultiModalKwargs]] = None
         sorted_mm_positions: Optional[list[PlaceholderRange]] = None
         sorted_mm_hashes: Optional[list[str]] = None
-        if (decoder_mm_inputs := decoder_inputs.multi_modal_data):
-            assert isinstance(decoder_mm_inputs, MultiModalKwargs)
+        if decoder_inputs["type"] == "multimodal":
+            decoder_mm_inputs = decoder_inputs["mm_kwargs"]

             # The output of merged multi-modal processor (`decoder_mm_inputs`)
             # contains the kwargs for all items from all modalities.
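The multimodal branch now dispatches on the "type" key instead of probing a wrapped attribute via a walrus assignment and an isinstance assert. A minimal sketch of the discriminated-union style, with hypothetical data (the real "mm_kwargs" value is a MultiModalKwargs object produced by the merged multi-modal processor):

    decoder_inputs = {"type": "multimodal", "mm_kwargs": {"pixel_values": ...}}

    if decoder_inputs["type"] == "multimodal":
        decoder_mm_inputs = decoder_inputs["mm_kwargs"]
    else:
        decoder_mm_inputs = None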
@@ -254,8 +252,8 @@ class Processor:
                 sorted_mm_positions,
                 sorted_mm_hashes,
             ) = merge_and_sort_multimodal_metadata(
-                decoder_inputs.multi_modal_placeholders,
-                decoder_inputs.multi_modal_hashes if self.use_hash else None,
+                decoder_inputs["mm_placeholders"],
+                decoder_inputs["mm_hashes"] if self.use_hash else None,
             )

             # NOTE: Sort multimodal inputs/kwargs ONLY IF there are multiple
@@ -281,8 +279,8 @@ class Processor:

         return EngineCoreRequest(
             request_id=request_id,
-            prompt=decoder_inputs.prompt,
-            prompt_token_ids=decoder_inputs.prompt_token_ids,
+            prompt=decoder_inputs.get("prompt"),
+            prompt_token_ids=decoder_inputs["prompt_token_ids"],
             mm_inputs=sorted_mm_inputs,
             mm_hashes=sorted_mm_hashes,
             mm_placeholders=sorted_mm_positions,
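Note the asymmetry in this last hunk: prompt is read with .get() while prompt_token_ids uses direct indexing, presumably because requests built from raw token IDs carry no prompt text, whereas the tokenized prompt IDs are always present by this point.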