[Model Runner V2] Don't use UVA buffer for prefill_len (#29713)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-11-28 20:27:16 -08:00
parent 4b17ce6815
commit 4a80ad0a25
2 changed files with 6 additions and 1 deletion
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -410,6 +410,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                cu_num_new_blocks[i].append(x + len(block_ids))
                new_block_ids[i].extend(block_ids)
            overwrite.append(True)
+        if scheduler_output.scheduled_new_reqs:
+            self.req_states.prefill_len.copy_to_gpu()

        # Add new blocks for the existing requests.
        cached_reqs = scheduler_output.scheduled_cached_reqs