diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 094602a8b..aafa2f0a9 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -171,19 +171,23 @@ class Scheduler(SchedulerInterface): assert num_new_tokens > 0 # Schedule encoder inputs. - encoder_inputs_to_schedule, num_new_tokens, new_encoder_budget = ( - self._try_schedule_encoder_inputs(request, - request.num_computed_tokens, - num_new_tokens, - encoder_budget)) - if num_new_tokens == 0: - # The request cannot be scheduled because the encoder budget - # or the encoder cache is exhausted. - # NOTE(woosuk): Here, by doing `continue` instead of `break`, - # we do not strictly follow the FCFS scheduling policy and - # allow the lower-priority requests to be scheduled. - req_index += 1 - continue + if request.has_encoder_inputs: + (encoder_inputs_to_schedule, num_new_tokens, + new_encoder_budget) = self._try_schedule_encoder_inputs( + request, request.num_computed_tokens, num_new_tokens, + encoder_budget) + if num_new_tokens == 0: + # The request cannot be scheduled because the encoder budget + # or the encoder cache is exhausted. + # NOTE(woosuk): By using `continue` instead of `break` here, + # we intentionally relax the strict FCFS scheduling policy + # to allow lower-priority requests to be scheduled when a + # higher-priority request is blocked by encoder constraints. + req_index += 1 + continue + else: + encoder_inputs_to_schedule = None + new_encoder_budget = encoder_budget while True: new_blocks = self.kv_cache_manager.allocate_slots( @@ -318,13 +322,17 @@ class Scheduler(SchedulerInterface): assert num_new_tokens > 0 # Schedule encoder inputs. - (encoder_inputs_to_schedule, num_new_tokens, - new_encoder_budget) = self._try_schedule_encoder_inputs( - request, num_computed_tokens, num_new_tokens, - encoder_budget) - if num_new_tokens == 0: - # The request cannot be scheduled. - break + if request.has_encoder_inputs: + (encoder_inputs_to_schedule, num_new_tokens, + new_encoder_budget) = self._try_schedule_encoder_inputs( + request, num_computed_tokens, num_new_tokens, + encoder_budget) + if num_new_tokens == 0: + # The request cannot be scheduled. + break + else: + encoder_inputs_to_schedule = None + new_encoder_budget = encoder_budget new_blocks = self.kv_cache_manager.allocate_slots( request, num_new_tokens, computed_blocks) @@ -506,9 +514,6 @@ class Scheduler(SchedulerInterface): limitations, the method adjusts `num_new_tokens` to schedule only the decoder tokens up to just before the unschedulable encoder input. """ - if not request.has_encoder_inputs(): - return [], num_new_tokens, encoder_budget - encoder_inputs_to_schedule: list[int] = [] mm_positions = request.mm_positions assert mm_positions is not None diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 48e513267..490fe4e83 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -59,6 +59,8 @@ class Request: self.mm_positions = multi_modal_placeholders or [] self.mm_inputs = multi_modal_inputs or [] self.mm_hashes: list[str] = multi_modal_hashes or [] + self.num_encoder_inputs = len(self.mm_inputs) + self.has_encoder_inputs = self.num_encoder_inputs > 0 # Sanity check assert len(self.mm_inputs) == len(self.mm_positions) @@ -117,13 +119,6 @@ class Request: def get_finished_reason(self) -> Union[FinishReason, None]: return RequestStatus.get_finished_reason(self.status) - def has_encoder_inputs(self) -> bool: - return len(self.mm_inputs) > 0 - - @property - def num_encoder_inputs(self) -> int: - return len(self.mm_positions) - def get_num_encoder_tokens(self, input_id: int) -> int: assert input_id < len(self.mm_positions) num_tokens = self.mm_positions[input_id]["length"]