From a116f969301acfdb6ea9fa917815566d434fdc95 Mon Sep 17 00:00:00 2001
From: sbeurnier <sbeurnier@together.ai>
Date: Sat, 14 Mar 2026 02:37:32 +0100
Subject: [PATCH] [V1] Remove pin_memory() in async_copy_to_gpu to fix sporadic
 stalls (#37006)

Signed-off-by: Sebastien Beurnier <sbeurnier@together.ai>
---
 vllm/v1/worker/gpu/buffer_utils.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/vllm/v1/worker/gpu/buffer_utils.py b/vllm/v1/worker/gpu/buffer_utils.py
index 75cf6bdb7..a653c2625 100644
--- a/vllm/v1/worker/gpu/buffer_utils.py
+++ b/vllm/v1/worker/gpu/buffer_utils.py
@@ -27,12 +27,10 @@ def async_copy_to_gpu(
         assert device is not None
         out = torch.empty_like(x, device=device)
 
-    # CPU-to-CPU copy
-    tmp = x.pin_memory()
-    assert tmp is not x
-
-    # CPU-to-GPU copy
-    return out.copy_(tmp, non_blocking=True)
+    # Copy directly from x to the GPU. An explicit x.pin_memory() staging copy
+    # caused sporadic stalls under high concurrency (CUDA driver contention);
+    # the driver stages pageable host memory itself, so no manual pinning.
+    return out.copy_(x, non_blocking=True)
 
 
 class UvaBuffer: