From 24dc30f7ff4b304088297ffa2b34ab9aba07bea8 Mon Sep 17 00:00:00 2001
From: Nick Hill
Date: Wed, 21 Jan 2026 13:17:43 -0800
Subject: [PATCH] [ModelRunner V2] Don't pin reused flashinfer tensors (#32799)

Signed-off-by: Nick Hill
---
 vllm/v1/attention/backends/flashinfer.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 4743e2321..69d24deb2 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -603,7 +603,12 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
                 "earlier GPUs."
             )
         # Preparing persistent buffers
-        self.pin_memory = is_pin_memory_available()
+        # Since we do not have explicit synchronization in ModelRunnerV2, we do not pin
+        # reused CPU buffers to avoid a race condition between step N async copies to
+        # GPU and step N+1 buffer updates.
+        self.pin_memory = (
+            not envs.VLLM_USE_V2_MODEL_RUNNER and is_pin_memory_available()
+        )
         self.paged_kv_indptr = self._make_buffer(max_num_reqs + 1)
         self.paged_kv_indptr_cpu_buffer = torch.zeros_like(
             self.paged_kv_indptr.cpu, pin_memory=self.pin_memory