Fix cudaMemPrefetchAsync for CUDA 13: use cudaMemLocation + flags=0 (no stream param)

2026-04-10 02:45:05 +00:00
parent 6053e6d0ea
commit c583bcb4fc
1 changed files with 4 additions and 4 deletions
--- a/vllm/managed_alloc.cu
+++ b/vllm/managed_alloc.cu
@@ -62,11 +62,11 @@ void* managed_malloc(size_t size, int device, cudaStream_t stream) {
  // Without this, the first write to each page faults into system RAM,
  // which causes OOM when the OS only has ~102 GiB after EGM carveout.
  //
-  // The prefetch is asynchronous on the given stream, so it won't block
-  // the calling thread. Subsequent operations on the same stream will
-  // wait for the prefetch to complete.
+  // CUDA 13+ signature: cudaMemPrefetchAsync(ptr, size, cudaMemLocation, flags, stream = 0)
+  // NOTE(review): the call below omits the trailing stream argument, so the prefetch
+  // is enqueued on the default stream, not on `stream`. flags is reserved and must be 0.
  if (size > 0) {
-    err = cudaMemPrefetchAsync(ptr, size, device, stream);
+    err = cudaMemPrefetchAsync(ptr, size, gpu_loc, 0);
    if (err != cudaSuccess) {
      // Non-fatal: prefetch failure shouldn't prevent allocation.
      // Pages will still be migrated on demand.