[V0 Deprecation] Remove Prompt Adapters (#20588)
Signed-off-by: mgoin <mgoin64@gmail.com>
vllm/sequence.py
@@ -19,7 +19,6 @@ from vllm.inputs import SingletonInputs
 from vllm.lora.request import LoRARequest
 from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict
 from vllm.pooling_params import PoolingParams
-from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import RequestOutputKind, SamplingParams
 
 VLLM_TOKEN_ID_ARRAY_TYPE = "l"
@@ -458,7 +457,6 @@ class Sequence:
             block size used by the block manager and cache engine.
         eos_token_id: The end-of-sequence (EOS) token id recognized by this LLM.
         lora_request: LoRA request.
-        prompt_adapter_request: Prompt Adapter request.
     """
 
     def __init__(
@@ -468,14 +466,12 @@ class Sequence:
         block_size: int,
         eos_token_id: Optional[int] = None,
         lora_request: Optional[LoRARequest] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> None:
         self.seq_id = seq_id
         self.inputs = inputs
         self.block_size = block_size
         self.eos_token_id = eos_token_id
         self.lora_request = lora_request
-        self.prompt_adapter_request = prompt_adapter_request
 
         self.data = SequenceData.from_seqs(
             self.prompt_token_ids,
@@ -537,11 +533,6 @@ class Sequence:
     def lora_int_id(self) -> int:
         return self.lora_request.lora_int_id if self.lora_request else 0
 
-    @property
-    def prompt_adapter_id(self) -> int:
-        return self.prompt_adapter_request.prompt_adapter_id \
-            if self.prompt_adapter_request else 0
-
     def get_output_text_to_return(self, buffer_length: int,
                                   delta: bool) -> str:
         """If delta is True, only new text since the last call to
@@ -601,12 +592,12 @@ class Sequence:
         designed for prefix caching mode. The final sequence hash is determined
         by applying token_ids from the sequence's blocks.
         """
-        if self.prompt_adapter_id == 0 and self.lora_int_id == 0:
+        if self.lora_int_id == 0:
             return None
 
         # NOTE: If there are additional factors influencing the block aside from
         # token_ids, include them as input parameters to the hash.
-        return hash((self.prompt_adapter_id, self.lora_int_id))
+        return hash(self.lora_int_id)
 
     def num_hashed_tokens_of_block(self, logical_idx: int):
         return logical_idx * self.block_size + self.block_size
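
The hunk above narrows the extra hash that feeds vLLM's prefix-caching block hash: it previously combined the prompt adapter id with the LoRA id, and now only the LoRA id remains. A minimal self-contained sketch of the post-change behavior (FakeSequence is a hypothetical stand-in written for illustration, not vLLM's Sequence class):

    # Hypothetical stand-in mirroring extra_hash() after this commit.
    from typing import Optional

    class FakeSequence:
        def __init__(self, lora_int_id: int = 0) -> None:
            self.lora_int_id = lora_int_id

        def extra_hash(self) -> Optional[int]:
            # Only LoRA influences the block hash now; a sequence without
            # a LoRA adapter contributes no extra factor to prefix caching.
            if self.lora_int_id == 0:
                return None
            return hash(self.lora_int_id)

    assert FakeSequence().extra_hash() is None
    assert FakeSequence(lora_int_id=3).extra_hash() == hash(3)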
@@ -707,7 +698,6 @@ class SequenceGroup:
         encoder_seq: Optional, the single encoder sequence. Should be None
                      unless you are working with an encoder/decoder model.
         trace_headers: OpenTelemetry trace headers.
-        prompt_adapter_request: Prompt Adapter request.
         priority: User-defined priority of the request.
         draft_size: The number of speculative tokens plus one from the target
                     model; equal to max number of tokens a step can generate
@@ -725,7 +715,6 @@ class SequenceGroup:
                  pooled_data: Optional[torch.Tensor] = None,
                  encoder_seq: Optional[Sequence] = None,
                  trace_headers: Optional[Mapping[str, str]] = None,
-                 prompt_adapter_request: Optional[PromptAdapterRequest] = None,
                  priority: int = 0,
                  draft_size: int = 1) -> None:
         self.request_id = request_id
@@ -747,7 +736,6 @@ class SequenceGroup:
         self.state = SequenceGroupState()
         self.pooling_params = pooling_params
         self.pooled_data = pooled_data
-        self.prompt_adapter_request = prompt_adapter_request
         self.encoder_seq = encoder_seq
         self.trace_headers = trace_headers
         self.priority = priority
@@ -802,16 +790,6 @@ class SequenceGroup:
     def lora_int_id(self) -> int:
         return self.lora_request.lora_int_id if self.lora_request else 0
 
-    @property
-    def prompt_adapter_id(self) -> int:
-        return self.prompt_adapter_request.prompt_adapter_id \
-            if self.prompt_adapter_request else 0
-
-    @property
-    def prompt_adapter_num_virtual_tokens(self) -> int:
-        return self.prompt_adapter_request.prompt_adapter_num_virtual_tokens\
-            if self.prompt_adapter_request else 0
-
     def init_multi_step(self, num_steps: int) -> None:
         self.state.num_steps = num_steps
         self.state.current_step = 0
@@ -1011,7 +989,6 @@ class SequenceGroupMetadata(
             (SequenceGroup.encoder_seq). Should be None
             unless you are working with an encoder/decoder
             model.
-        prompt_adapter_request: Prompt Adapter request.
     """
 
     request_id: str
@@ -1030,7 +1007,6 @@ class SequenceGroupMetadata(
     multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None
     encoder_seq_data: Optional[SequenceData] = None
     cross_block_table: Optional[list[int]] = None
-    prompt_adapter_request: Optional[PromptAdapterRequest] = None
     token_chunk_size: Optional[int] = None
 
     ### Stateful fields that are lazily defined. ###
@@ -1052,16 +1028,6 @@ class SequenceGroupMetadata(
     def lora_int_id(self) -> int:
         return self.lora_request.lora_int_id if self.lora_request else 0
 
-    @property
-    def prompt_adapter_id(self) -> int:
-        return self.prompt_adapter_request.prompt_adapter_id \
-            if self.prompt_adapter_request else 0
-
-    @property
-    def prompt_adapter_num_virtual_tokens(self) -> int:
-        return self.prompt_adapter_request.prompt_adapter_num_virtual_tokens \
-            if self.prompt_adapter_request else 0
-
     # Multi-Step Chunked-Prefill property
     @property
     def is_single_step_prompt(self) -> bool:
@@ -1525,7 +1491,6 @@ class ParallelSampleSequenceGroup(SequenceGroupBase):
             pooled_data=seq_group.pooled_data,
             encoder_seq=seq_group.encoder_seq,
             trace_headers=seq_group.trace_headers,
-            prompt_adapter_request=seq_group.prompt_adapter_request,
             priority=seq_group.priority,
         )
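
Taken together, the hunks above drop prompt_adapter_request from the Sequence and SequenceGroup constructors and from SequenceGroupMetadata. A hedged sketch of a Sequence call site after this commit; the argument values are placeholders and `inputs` is assumed to be a SingletonInputs built elsewhere, so this is illustrative rather than runnable on its own:

    # Illustrative post-removal call site; values are placeholders.
    from vllm.sequence import Sequence

    seq = Sequence(
        seq_id=0,
        inputs=inputs,      # assumed: a SingletonInputs built earlier
        block_size=16,
        eos_token_id=2,
        lora_request=None,  # LoRA support stays; prompt adapters are gone
    )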