From c583bcb4fc60772b8c4c58cf5d8f01cf5c293706 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Fri, 10 Apr 2026 02:45:05 +0000 Subject: [PATCH] Fix cudaMemPrefetchAsync for CUDA 13: use cudaMemLocation + flags=0 (no stream param) --- vllm/managed_alloc.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/managed_alloc.cu b/vllm/managed_alloc.cu index 446aaa3..acdf0c0 100644 --- a/vllm/managed_alloc.cu +++ b/vllm/managed_alloc.cu @@ -62,11 +62,11 @@ void* managed_malloc(size_t size, int device, cudaStream_t stream) { // Without this, the first write to each page faults into system RAM, // which causes OOM when the OS only has ~102 GiB after EGM carveout. // - // The prefetch is asynchronous on the given stream, so it won't block - // the calling thread. Subsequent operations on the same stream will - // wait for the prefetch to complete. + // CUDA 13+ signature: cudaMemPrefetchAsync(ptr, size, cudaMemLocation, flags, stream = 0). + // Note: the trailing stream argument is omitted here, so the prefetch is enqueued on the default stream, not the caller's stream. + // flags is reserved and must be 0. if (size > 0) { - err = cudaMemPrefetchAsync(ptr, size, device, stream); + err = cudaMemPrefetchAsync(ptr, size, gpu_loc, 0); if (err != cudaSuccess) { // Non-fatal: prefetch failure shouldn't prevent allocation. // Pages will still be migrated on demand.