From c583bcb4fc60772b8c4c58cf5d8f01cf5c293706 Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Fri, 10 Apr 2026 02:45:05 +0000
Subject: [PATCH] Fix cudaMemPrefetchAsync for CUDA 13: use cudaMemLocation +
 flags=0 (stream argument omitted, so it defaults to stream 0)

---
 vllm/managed_alloc.cu | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vllm/managed_alloc.cu b/vllm/managed_alloc.cu
index 446aaa3..acdf0c0 100644
--- a/vllm/managed_alloc.cu
+++ b/vllm/managed_alloc.cu
@@ -62,11 +62,11 @@ void* managed_malloc(size_t size, int device, cudaStream_t stream) {
   // Without this, the first write to each page faults into system RAM,
   // which causes OOM when the OS only has ~102 GiB after EGM carveout.
   //
-  // The prefetch is asynchronous on the given stream, so it won't block
-  // the calling thread. Subsequent operations on the same stream will
-  // wait for the prefetch to complete.
+  // CUDA 13+ signature: cudaMemPrefetchAsync(ptr, size, cudaMemLocation, flags, stream = 0)
+  // Note: the trailing stream argument is optional and is omitted below, so the
+  // prefetch is enqueued on the default stream, not `stream`. flags is reserved; pass 0.
   if (size > 0) {
-    err = cudaMemPrefetchAsync(ptr, size, device, stream);
+    err = cudaMemPrefetchAsync(ptr, size, gpu_loc, 0);
     if (err != cudaSuccess) {
       // Non-fatal: prefetch failure shouldn't prevent allocation.
       // Pages will still be migrated on demand.