[V0 Deprecation] Remove Prompt Adapters (#20588)

Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
Michael Goin
2025-07-23 19:36:48 -04:00
committed by GitHub
parent 78c13e30e1
commit 82ec66f514
60 changed files with 126 additions and 1727 deletions

View File

@@ -19,7 +19,6 @@ from vllm.inputs import SingletonInputs
from vllm.lora.request import LoRARequest
from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict
from vllm.pooling_params import PoolingParams
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import RequestOutputKind, SamplingParams
VLLM_TOKEN_ID_ARRAY_TYPE = "l"
@@ -458,7 +457,6 @@ class Sequence:
block size used by the block manager and cache engine.
eos_token_id: The end-of-sequence (EOS) token id recognized by this LLM.
lora_request: LoRA request.
prompt_adapter_request: Prompt Adapter request.
"""
def __init__(
@@ -468,14 +466,12 @@ class Sequence:
block_size: int,
eos_token_id: Optional[int] = None,
lora_request: Optional[LoRARequest] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> None:
self.seq_id = seq_id
self.inputs = inputs
self.block_size = block_size
self.eos_token_id = eos_token_id
self.lora_request = lora_request
self.prompt_adapter_request = prompt_adapter_request
self.data = SequenceData.from_seqs(
self.prompt_token_ids,
@@ -537,11 +533,6 @@ class Sequence:
def lora_int_id(self) -> int:
return self.lora_request.lora_int_id if self.lora_request else 0
@property
def prompt_adapter_id(self) -> int:
return self.prompt_adapter_request.prompt_adapter_id \
if self.prompt_adapter_request else 0
def get_output_text_to_return(self, buffer_length: int,
delta: bool) -> str:
"""If delta is True, only new text since the last call to
@@ -601,12 +592,12 @@ class Sequence:
designed for prefix caching mode. The final sequence hash is determined
by applying token_ids from the sequence's blocks.
"""
if self.prompt_adapter_id == 0 and self.lora_int_id == 0:
if self.lora_int_id == 0:
return None
# NOTE: If there are additional factors influencing the block aside from
# token_ids, include them as input parameters to the hash.
return hash((self.prompt_adapter_id, self.lora_int_id))
return hash(self.lora_int_id)
def num_hashed_tokens_of_block(self, logical_idx: int):
return logical_idx * self.block_size + self.block_size
@@ -707,7 +698,6 @@ class SequenceGroup:
encoder_seq: Optional, the single encoder sequence. Should be None
unless you are working with an encoder/decoder model.
trace_headers: OpenTelemetry trace headers.
prompt_adapter_request: Prompt Adapter request.
priority: User-defined priority of the request.
draft_size: The number of speculative tokens plus one from the target
model; equal to max number of tokens a step can generate
@@ -725,7 +715,6 @@ class SequenceGroup:
pooled_data: Optional[torch.Tensor] = None,
encoder_seq: Optional[Sequence] = None,
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
draft_size: int = 1) -> None:
self.request_id = request_id
@@ -747,7 +736,6 @@ class SequenceGroup:
self.state = SequenceGroupState()
self.pooling_params = pooling_params
self.pooled_data = pooled_data
self.prompt_adapter_request = prompt_adapter_request
self.encoder_seq = encoder_seq
self.trace_headers = trace_headers
self.priority = priority
@@ -802,16 +790,6 @@ class SequenceGroup:
def lora_int_id(self) -> int:
return self.lora_request.lora_int_id if self.lora_request else 0
@property
def prompt_adapter_id(self) -> int:
return self.prompt_adapter_request.prompt_adapter_id \
if self.prompt_adapter_request else 0
@property
def prompt_adapter_num_virtual_tokens(self) -> int:
return self.prompt_adapter_request.prompt_adapter_num_virtual_tokens\
if self.prompt_adapter_request else 0
def init_multi_step(self, num_steps: int) -> None:
self.state.num_steps = num_steps
self.state.current_step = 0
@@ -1011,7 +989,6 @@ class SequenceGroupMetadata(
(SequenceGroup.encoder_seq). Should be None
unless you are working with an encoder/decoder
model.
prompt_adapter_request: Prompt Adapter request.
"""
request_id: str
@@ -1030,7 +1007,6 @@ class SequenceGroupMetadata(
multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None
encoder_seq_data: Optional[SequenceData] = None
cross_block_table: Optional[list[int]] = None
prompt_adapter_request: Optional[PromptAdapterRequest] = None
token_chunk_size: Optional[int] = None
### Stateful fields that are lazily defined. ###
@@ -1052,16 +1028,6 @@ class SequenceGroupMetadata(
def lora_int_id(self) -> int:
return self.lora_request.lora_int_id if self.lora_request else 0
@property
def prompt_adapter_id(self) -> int:
return self.prompt_adapter_request.prompt_adapter_id \
if self.prompt_adapter_request else 0
@property
def prompt_adapter_num_virtual_tokens(self) -> int:
return self.prompt_adapter_request.prompt_adapter_num_virtual_tokens \
if self.prompt_adapter_request else 0
# Multi-Step Chunked-Prefill property
@property
def is_single_step_prompt(self) -> bool:
@@ -1525,7 +1491,6 @@ class ParallelSampleSequenceGroup(SequenceGroupBase):
pooled_data=seq_group.pooled_data,
encoder_seq=seq_group.encoder_seq,
trace_headers=seq_group.trace_headers,
prompt_adapter_request=seq_group.prompt_adapter_request,
priority=seq_group.priority,
)