[Core] feat: Implement Priority Scheduling in V1 Engine (#19057)

Signed-off-by: amit <amit.man@gmail.com>
Co-authored-by: Roger Wang <Rogerw0108@gmail.com>
Author: amit
Date: 2025-06-23 06:18:08 +03:00
Committed by: GitHub
Parent: c4cf260677
Commit: 4a0f7888a3
7 changed files with 896 additions and 30 deletions

View File

@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import enum
import time
from typing import TYPE_CHECKING, Any, Optional, Union
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
@@ -30,18 +31,23 @@ class Request:
pooling_params: Optional[PoolingParams],
eos_token_id: Optional[int],
client_index: int = 0,
arrival_time: Optional[float] = None,
lora_request: Optional["LoRARequest"] = None,
structured_output_request: Optional["StructuredOutputRequest"] = None,
cache_salt: Optional[str] = None,
priority: int = 0,
) -> None:
self.request_id = request_id
self.client_index = client_index
self.priority = priority
self.sampling_params = sampling_params
self.pooling_params = pooling_params
# Because of LoRA, the eos token id can be different for each request.
self.eos_token_id = eos_token_id
self.lora_request = lora_request
self.structured_output_request = structured_output_request
self.arrival_time = arrival_time if arrival_time is not None else \
time.time()
self.status = RequestStatus.WAITING
if sampling_params and sampling_params.guided_decoding is not None:
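
The hunk above threads two new fields through the `Request` constructor: a client-supplied `priority` (defaulting to 0) and an `arrival_time` that falls back to `time.time()` when the caller does not provide one. A plausible reading of the feature, sketched below under the assumption that a lower `priority` value is served first and that ties fall back to arrival order, is that waiting requests are ordered by the pair `(priority, arrival_time)`. The `PendingRequest` type and function names here are illustrative stand-ins, not the project's actual classes.

```python
import time
from dataclasses import dataclass, field


@dataclass
class PendingRequest:
    # Illustrative stand-in for the fields added to Request in this commit.
    request_id: str
    priority: int = 0
    arrival_time: float = field(default_factory=time.time)


def schedule_order(reqs: list[PendingRequest]) -> list[PendingRequest]:
    # Lower priority value wins; equal priorities fall back to FCFS by arrival time.
    return sorted(reqs, key=lambda r: (r.priority, r.arrival_time))


reqs = [
    PendingRequest("a", priority=1, arrival_time=100.0),
    PendingRequest("b", priority=0, arrival_time=101.0),
    PendingRequest("c", priority=1, arrival_time=99.0),
]
print([r.request_id for r in schedule_order(reqs)])  # ['b', 'c', 'a']
```
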
@@ -118,11 +124,13 @@ class Request:
sampling_params=request.sampling_params,
pooling_params=request.pooling_params,
eos_token_id=request.eos_token_id,
arrival_time=request.arrival_time,
lora_request=request.lora_request,
structured_output_request=StructuredOutputRequest(
sampling_params=request.sampling_params) \
if request.sampling_params else None,
cache_salt=request.cache_salt,
priority=request.priority,
)
def append_output_token_ids(
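
On the scheduler side of the change (not shown in this excerpt), a priority policy of this kind is typically implemented with a heap keyed on the same pair, so that popping the waiting queue always yields the highest-priority, earliest-arriving request. The sketch below is a minimal, self-contained illustration of that idea using `heapq`; the class and method names are assumptions for illustration, not the V1 scheduler's actual interface.

```python
import heapq
import itertools


class PriorityWaitingQueue:
    """Minimal sketch of a priority-ordered waiting queue.

    Entries are ordered by (priority, arrival_time, seq); the monotonically
    increasing seq keeps heap comparisons stable even if two requests share
    both priority and arrival time.
    """

    def __init__(self) -> None:
        self._heap: list[tuple[int, float, int, str]] = []
        self._counter = itertools.count()

    def push(self, request_id: str, priority: int, arrival_time: float) -> None:
        heapq.heappush(
            self._heap, (priority, arrival_time, next(self._counter), request_id)
        )

    def pop(self) -> str:
        # Returns the request that should be scheduled next.
        return heapq.heappop(self._heap)[-1]

    def __len__(self) -> int:
        return len(self._heap)


q = PriorityWaitingQueue()
q.push("low", priority=5, arrival_time=1.0)
q.push("high", priority=0, arrival_time=2.0)
q.push("low-early", priority=5, arrival_time=0.5)
while q:
    print(q.pop())  # high, low-early, low
```
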