From a116f969301acfdb6ea9fa917815566d434fdc95 Mon Sep 17 00:00:00 2001 From: sbeurnier Date: Sat, 14 Mar 2026 02:37:32 +0100 Subject: [PATCH] [V1] Remove pin_memory() in async_copy_to_gpu to fix sporadic stalls (#37006) Signed-off-by: Sebastien Beurnier --- vllm/v1/worker/gpu/buffer_utils.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/vllm/v1/worker/gpu/buffer_utils.py b/vllm/v1/worker/gpu/buffer_utils.py index 75cf6bdb7..a653c2625 100644 --- a/vllm/v1/worker/gpu/buffer_utils.py +++ b/vllm/v1/worker/gpu/buffer_utils.py @@ -27,12 +27,10 @@ def async_copy_to_gpu( assert device is not None out = torch.empty_like(x, device=device) - # CPU-to-CPU copy - tmp = x.pin_memory() - assert tmp is not x - - # CPU-to-GPU copy - return out.copy_(tmp, non_blocking=True) + # Copy directly to GPU — explicit pin_memory() causes sporadic stalls + # under high concurrency due to CUDA driver contention. The driver + # handles the transfer efficiently without manual pinning. + return out.copy_(x, non_blocking=True) class UvaBuffer: