diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 2fe452421..e4b77f24b 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -3652,7 +3652,7 @@ def test_prepend_skipped_requests_order(): assert waiting_reqs == expected_waiting_reqs -def test_remote_kv_promotion_keeps_fcfs_with_fsm_prefix(): +def test_remote_kv_promotion_keeps_fcfs_with_grammar_prefix(): scheduler = create_scheduler(max_num_seqs=1) scheduler.connector = Mock() scheduler.connector.get_num_new_matched_tokens.return_value = (0, False) @@ -3661,19 +3661,20 @@ def test_remote_kv_promotion_keeps_fcfs_with_fsm_prefix(): for request in requests: scheduler.add_request(request) - req_fsm_1, req_fsm_2, req_remote, req_tail = list(scheduler.waiting) + req_grammar_1, req_grammar_2, req_remote, req_tail = list(scheduler.waiting) - # simulate two FSM requests at the waiting head that become ready now. - req_fsm_1.status = RequestStatus.WAITING_FOR_FSM - req_fsm_1.structured_output_request = Mock(grammar=object()) - req_fsm_2.status = RequestStatus.WAITING_FOR_FSM - req_fsm_2.structured_output_request = Mock(grammar=object()) + # simulate two structured-output grammar requests at the waiting head + # that become ready now. + req_grammar_1.status = RequestStatus.WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR + req_grammar_1.structured_output_request = Mock(grammar=object()) + req_grammar_2.status = RequestStatus.WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR + req_grammar_2.structured_output_request = Mock(grammar=object()) # simulate a remote-KV request that is ready to be promoted now. req_remote.status = RequestStatus.WAITING_FOR_REMOTE_KVS - scheduler.waiting.remove_requests([req_fsm_1, req_fsm_2, req_remote]) - scheduler.skipped_waiting.add_request(req_fsm_1) - scheduler.skipped_waiting.add_request(req_fsm_2) + scheduler.waiting.remove_requests([req_grammar_1, req_grammar_2, req_remote]) + scheduler.skipped_waiting.add_request(req_grammar_1) + scheduler.skipped_waiting.add_request(req_grammar_2) scheduler.skipped_waiting.add_request(req_remote) scheduler.finished_recving_kv_req_ids.add(req_remote.request_id) scheduler._update_waiting_for_remote_kv = Mock() @@ -3681,13 +3682,13 @@ def test_remote_kv_promotion_keeps_fcfs_with_fsm_prefix(): output = scheduler.schedule() assert output.scheduled_new_reqs - assert output.scheduled_new_reqs[0].req_id == req_fsm_1.request_id + assert output.scheduled_new_reqs[0].req_id == req_grammar_1.request_id waiting_req_ids = [ req.request_id for req in list(scheduler.skipped_waiting) + list(scheduler.waiting) ] assert waiting_req_ids == [ - req_fsm_2.request_id, + req_grammar_2.request_id, req_remote.request_id, req_tail.request_id, ] @@ -3700,28 +3701,32 @@ def test_fcfs_mixed_skipped_waiting_types_keep_order(): mk_req = lambda req_id, num_tokens=1: create_requests( # noqa: E731 num_requests=1, num_tokens=num_tokens, req_ids=[req_id] )[0] - req_fsm, req_remote, req_stream = mk_req("fsm"), mk_req("remote"), mk_req("stream") + req_grammar, req_remote, req_stream = ( + mk_req("grammar"), + mk_req("remote"), + mk_req("stream"), + ) req_regular, req_tail = mk_req("regular", 20), mk_req("tail") - req_fsm.status = RequestStatus.WAITING_FOR_FSM - req_fsm.structured_output_request = Mock(grammar=None) + req_grammar.status = RequestStatus.WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR + req_grammar.structured_output_request = Mock(grammar=None) req_remote.status = RequestStatus.WAITING_FOR_REMOTE_KVS req_stream.status = RequestStatus.WAITING_FOR_STREAMING_REQ - for req in (req_fsm, req_remote, req_stream, req_regular, req_tail): + for req in (req_grammar, req_remote, req_stream, req_regular, req_tail): scheduler.add_request(req) scheduler.schedule() - assert list(scheduler.skipped_waiting) == [req_fsm, req_remote, req_stream] + assert list(scheduler.skipped_waiting) == [req_grammar, req_remote, req_stream] scheduler.finish_requests(req_regular.request_id, RequestStatus.FINISHED_ABORTED) assert not scheduler.running - req_fsm.structured_output_request = Mock(grammar=object()) + req_grammar.structured_output_request = Mock(grammar=object()) scheduler.finished_recving_kv_req_ids.add(req_remote.request_id) req_stream.status = RequestStatus.WAITING second_output = scheduler.schedule() expected_order = [ - req_fsm.request_id, + req_grammar.request_id, req_remote.request_id, req_stream.request_id, req_tail.request_id, diff --git a/tests/v1/test_request.py b/tests/v1/test_request.py index e22809d2e..66e46faa7 100644 --- a/tests/v1/test_request.py +++ b/tests/v1/test_request.py @@ -6,7 +6,10 @@ from vllm.v1.request import RequestStatus def test_request_status_fmt_str(): """Test that the string representation of RequestStatus is correct.""" assert f"{RequestStatus.WAITING}" == "WAITING" - assert f"{RequestStatus.WAITING_FOR_FSM}" == "WAITING_FOR_FSM" + assert ( + f"{RequestStatus.WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR}" + == "WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR" + ) assert f"{RequestStatus.WAITING_FOR_REMOTE_KVS}" == "WAITING_FOR_REMOTE_KVS" assert f"{RequestStatus.WAITING_FOR_STREAMING_REQ}" == "WAITING_FOR_STREAMING_REQ" assert f"{RequestStatus.RUNNING}" == "RUNNING" diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index a4c154a39..f576bd872 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1550,7 +1550,7 @@ class Scheduler(SchedulerInterface): @staticmethod def _is_blocked_waiting_status(status: RequestStatus) -> bool: return status in ( - RequestStatus.WAITING_FOR_FSM, + RequestStatus.WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR, RequestStatus.WAITING_FOR_REMOTE_KVS, RequestStatus.WAITING_FOR_STREAMING_REQ, ) @@ -2084,7 +2084,7 @@ class Scheduler(SchedulerInterface): request.status = RequestStatus.WAITING return True - if request.status == RequestStatus.WAITING_FOR_FSM: + if request.status == RequestStatus.WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR: structured_output_req = request.structured_output_request if not (structured_output_req and structured_output_req.grammar): return False diff --git a/vllm/v1/request.py b/vllm/v1/request.py index f2ee33b49..946e71c15 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -102,7 +102,7 @@ class Request: assert sampling_params.max_tokens is not None self.max_tokens = sampling_params.max_tokens if self.structured_output_request is not None: - self.status = RequestStatus.WAITING_FOR_FSM + self.status = RequestStatus.WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR if sampling_params.extra_args is not None: self.kv_transfer_params = sampling_params.extra_args.get( @@ -296,7 +296,7 @@ class RequestStatus(enum.IntEnum): """Status of a request.""" WAITING = enum.auto() - WAITING_FOR_FSM = enum.auto() + WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR = enum.auto() WAITING_FOR_REMOTE_KVS = enum.auto() WAITING_FOR_STREAMING_REQ = enum.auto() RUNNING = enum.auto() diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index cd17a21d9..213b49f28 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -42,7 +42,8 @@ class StructuredOutputManager: # When in external_launcher mode, async grammar compilation causes deadlocks # due to external_launcher mode having a scheduler for each TP rank. - # Async grammar compilation causes the WAITING_FOR_FSM → WAITING transition to + # Async grammar compilation causes the + # WAITING_FOR_STRUCTURED_OUTPUT_GRAMMAR → WAITING transition to # happen at different times on different TP ranks, # breaking the determinism assumption that external_launcher relies on. self._use_async_grammar_compilation = (