[V0 Deprecation] Remove Prompt Adapters (#20588)
Signed-off-by: mgoin <mgoin64@gmail.com>
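This removes the prompt adapter feature from the V0 LLMEngine: the PromptAdapterRequest import, the prompt_adapter_config plumbing, the prompt_adapter_request parameters threaded through add_request() and the SequenceGroup helpers, and the engine-level add/remove/list_prompt_adapter methods. A minimal before/after sketch of the caller-facing effect (the argument values here are illustrative, not taken from this diff):

    # Before this change, the V0 engine accepted an optional prompt
    # adapter per request:
    engine.add_request(
        "req-0",
        "Hello",
        SamplingParams(),
        prompt_adapter_request=PromptAdapterRequest(...),  # removed
    )

    # After this change the keyword no longer exists, so passing it
    # raises a TypeError; requests are simply:
    engine.add_request("req-0", "Hello", SamplingParams())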
@@ -44,7 +44,6 @@ from vllm.multimodal.processing import EncDecMultiModalProcessor
 from vllm.outputs import (PoolingRequestOutput, RequestOutput,
                           RequestOutputFactory)
 from vllm.pooling_params import PoolingParams
-from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import RequestOutputKind, SamplingParams
 from vllm.sequence import (ExecuteModelRequest, ParallelSampleSequenceGroup,
                            PoolingSequenceGroupOutput, Sequence, SequenceGroup,
@@ -223,7 +222,6 @@ class LLMEngine:
         self.load_config = vllm_config.load_config
         self.decoding_config = vllm_config.decoding_config or DecodingConfig(  # noqa
         )
-        self.prompt_adapter_config = vllm_config.prompt_adapter_config  # noqa
         self.observability_config = vllm_config.observability_config or ObservabilityConfig(  # noqa
         )
 
@@ -294,8 +292,6 @@ class LLMEngine:
                     # Feature flags
                     "enable_lora":
                     bool(self.lora_config),
-                    "enable_prompt_adapter":
-                    bool(self.prompt_adapter_config),
                     "enable_prefix_caching":
                     self.cache_config.enable_prefix_caching,
                     "enforce_eager":
@@ -542,9 +538,6 @@ class LLMEngine:
             self.lora_config.verify_with_model_config(self.model_config)
             self.lora_config.verify_with_scheduler_config(
                 self.scheduler_config)
-        if self.prompt_adapter_config:
-            self.prompt_adapter_config.verify_with_model_config(
-                self.model_config)
 
     def _add_processed_request(
         self,
@@ -553,7 +546,6 @@ class LLMEngine:
         params: Union[SamplingParams, PoolingParams],
         arrival_time: float,
         lora_request: Optional[LoRARequest],
-        prompt_adapter_request: Optional[PromptAdapterRequest],
         trace_headers: Optional[Mapping[str, str]] = None,
         priority: int = 0,
     ) -> Optional[SequenceGroup]:
@@ -569,7 +561,6 @@ class LLMEngine:
                 arrival_time=arrival_time,
                 lora_request=lora_request,
                 trace_headers=trace_headers,
-                prompt_adapter_request=prompt_adapter_request,
                 priority=priority,
             )
             return None
@@ -583,11 +574,10 @@ class LLMEngine:
         encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
 
         seq = Sequence(seq_id, decoder_inputs, block_size, eos_token_id,
-                       lora_request, prompt_adapter_request)
+                       lora_request)
 
         encoder_seq = (None if encoder_inputs is None else Sequence(
-            seq_id, encoder_inputs, block_size, eos_token_id, lora_request,
-            prompt_adapter_request))
+            seq_id, encoder_inputs, block_size, eos_token_id, lora_request))
 
         # Create a SequenceGroup based on SamplingParams or PoolingParams
         if isinstance(params, SamplingParams):
@@ -598,7 +588,6 @@ class LLMEngine:
                 arrival_time=arrival_time,
                 lora_request=lora_request,
                 trace_headers=trace_headers,
-                prompt_adapter_request=prompt_adapter_request,
                 encoder_seq=encoder_seq,
                 priority=priority)
         elif isinstance(params, PoolingParams):
@@ -608,7 +597,6 @@ class LLMEngine:
                 params,
                 arrival_time=arrival_time,
                 lora_request=lora_request,
-                prompt_adapter_request=prompt_adapter_request,
                 encoder_seq=encoder_seq,
                 priority=priority)
         else:
@@ -637,7 +625,6 @@ class LLMEngine:
         lora_request: Optional[LoRARequest] = None,
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
     ) -> None:
         """Add a request to the engine's request pool.
@@ -658,7 +645,6 @@ class LLMEngine:
                 the current monotonic time.
             lora_request: The LoRA request to add.
             trace_headers: OpenTelemetry trace headers.
-            prompt_adapter_request: The prompt adapter request to add.
             priority: The priority of the request.
                 Only applicable with priority scheduling.
 
@@ -719,7 +705,6 @@ class LLMEngine:
             prompt,
             tokenization_kwargs=tokenization_kwargs,
             lora_request=lora_request,
-            prompt_adapter_request=prompt_adapter_request,
         )
 
         self._add_processed_request(
@@ -728,7 +713,6 @@ class LLMEngine:
             params=params,
             arrival_time=arrival_time,
            lora_request=lora_request,
-            prompt_adapter_request=prompt_adapter_request,
             trace_headers=trace_headers,
             priority=priority,
         )
@@ -741,7 +725,6 @@ class LLMEngine:
         arrival_time: float,
         lora_request: Optional[LoRARequest],
         trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         encoder_seq: Optional[Sequence] = None,
         priority: int = 0,
     ) -> SequenceGroup:
@@ -769,17 +752,15 @@ class LLMEngine:
         if self.vllm_config.speculative_config is not None:
             draft_size = \
                 self.vllm_config.speculative_config.num_speculative_tokens + 1
-        seq_group = SequenceGroup(
-            request_id=request_id,
-            seqs=[seq],
-            arrival_time=arrival_time,
-            sampling_params=sampling_params,
-            lora_request=lora_request,
-            trace_headers=trace_headers,
-            prompt_adapter_request=prompt_adapter_request,
-            encoder_seq=encoder_seq,
-            priority=priority,
-            draft_size=draft_size)
+        seq_group = SequenceGroup(request_id=request_id,
+                                  seqs=[seq],
+                                  arrival_time=arrival_time,
+                                  sampling_params=sampling_params,
+                                  lora_request=lora_request,
+                                  trace_headers=trace_headers,
+                                  encoder_seq=encoder_seq,
+                                  priority=priority,
+                                  draft_size=draft_size)
 
         return seq_group
 
@@ -790,7 +771,6 @@ class LLMEngine:
         pooling_params: PoolingParams,
         arrival_time: float,
         lora_request: Optional[LoRARequest],
-        prompt_adapter_request: Optional[PromptAdapterRequest],
         encoder_seq: Optional[Sequence] = None,
         priority: int = 0,
     ) -> SequenceGroup:
@@ -798,15 +778,13 @@ class LLMEngine:
         # Defensive copy of PoolingParams, which are used by the pooler
         pooling_params = pooling_params.clone()
         # Create the sequence group.
-        seq_group = SequenceGroup(
-            request_id=request_id,
-            seqs=[seq],
-            arrival_time=arrival_time,
-            lora_request=lora_request,
-            pooling_params=pooling_params,
-            prompt_adapter_request=prompt_adapter_request,
-            encoder_seq=encoder_seq,
-            priority=priority)
+        seq_group = SequenceGroup(request_id=request_id,
+                                  seqs=[seq],
+                                  arrival_time=arrival_time,
+                                  lora_request=lora_request,
+                                  pooling_params=pooling_params,
+                                  encoder_seq=encoder_seq,
+                                  priority=priority)
         return seq_group
 
     def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
@@ -1834,16 +1812,6 @@ class LLMEngine:
     def pin_lora(self, lora_id: int) -> bool:
         return self.model_executor.pin_lora(lora_id)
 
-    def add_prompt_adapter(
-            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
-        return self.model_executor.add_prompt_adapter(prompt_adapter_request)
-
-    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-        return self.model_executor.remove_prompt_adapter(prompt_adapter_id)
-
-    def list_prompt_adapters(self) -> List[int]:
-        return self.model_executor.list_prompt_adapters()
-
     def start_profile(self) -> None:
         self.model_executor.start_profile()
 