[Bugfix][Nixl] Fix full prefix cache hit bug (#18632)

Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-06-04 22:07:32 -04:00
parent 78dcf56cb3
commit c56ed8bb0e
4 changed files with 97 additions and 81 deletions
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -424,11 +424,11 @@ class Scheduler(SchedulerInterface):
                    # The request cannot be scheduled.
                    break

-                # KVConnector: update internal state after allocation.
+                # KVTransfer: the connector uses this info to determine
+                # if a load is needed.
                # This information is used to determine if a load is
                # needed for this request.
-                if num_external_computed_tokens:
-                    assert self.connector is not None
+                if self.connector is not None:
                    self.connector.update_state_after_alloc(
                        request,
                        new_computed_blocks + new_blocks,
@@ -841,7 +841,7 @@ class Scheduler(SchedulerInterface):
        }

        finished_req_ids = self.finished_req_ids_dict
-        if finished_req_ids is not None:
+        if finished_req_ids:
            # Include ids of requests that finished since last outputs
            # were sent.
            for client_index, finished_set in finished_req_ids.items():