[Model Runner V2] Don't use UVA buffer for prefill_len (#29713)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
Woosuk Kwon
2025-11-28 20:27:16 -08:00
committed by GitHub
parent 4b17ce6815
commit 4a80ad0a25
2 changed files with 6 additions and 1 deletion

View File

@@ -410,6 +410,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
cu_num_new_blocks[i].append(x + len(block_ids))
new_block_ids[i].extend(block_ids)
overwrite.append(True)
if scheduler_output.scheduled_new_reqs:
self.req_states.prefill_len.copy_to_gpu()
# Add new blocks for the existing requests.
cached_reqs = scheduler_output.scheduled_cached_reqs

View File

@@ -117,7 +117,10 @@ class RequestState:
self.prefill_token_ids = UvaBuffer(
    self.max_num_reqs, self.max_model_len, dtype=torch.int32
)
-self.prefill_len = UvaBuffer(self.max_num_reqs, dtype=torch.int32)
+# NOTE(woosuk): We don't use UVA for prefill_len because its GPU view
+# can be used outside of update_states and prepare_inputs.
+# Without async barrier, using UVA can cause race conditions.
+self.prefill_len = self._make_buffer(self.max_num_reqs, dtype=torch.int32)
# Number of computed tokens.
self.num_computed_prefill_tokens = np.zeros(self.max_num_reqs, dtype=np.int32)
self.num_computed_tokens = torch.zeros(