diff --git a/vllm/managed_alloc.cu b/vllm/managed_alloc.cu index dd1230f..446aaa3 100644 --- a/vllm/managed_alloc.cu +++ b/vllm/managed_alloc.cu @@ -66,7 +66,7 @@ void* managed_malloc(size_t size, int device, cudaStream_t stream) { // the calling thread. Subsequent operations on the same stream will // wait for the prefetch to complete. if (size > 0) { - err = cudaMemPrefetchAsync(ptr, size, gpu_loc, stream); + err = cudaMemPrefetchAsync(ptr, size, device, stream); if (err != cudaSuccess) { // Non-fatal: prefetch failure shouldn't prevent allocation. // Pages will still be migrated on demand.