Fix cudaMemPrefetchAsync for CUDA 13: use cudaMemLocation + flags=0 (stream argument omitted, defaults to stream 0)

This commit is contained in:
2026-04-10 02:45:05 +00:00
parent 6053e6d0ea
commit c583bcb4fc

View File

@@ -62,11 +62,11 @@ void* managed_malloc(size_t size, int device, cudaStream_t stream) {
// Without this, the first write to each page faults into system RAM,
// which causes OOM when the OS only has ~102 GiB after EGM carveout.
//
// The prefetch is asynchronous on the given stream, so it won't block
// the calling thread. Subsequent operations on the same stream will
// wait for the prefetch to complete.
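// CUDA 13+ signature:
//   cudaMemPrefetchAsync(ptr, size, cudaMemLocation, flags, stream = 0)
// Note: the stream parameter still exists but defaults to 0. The call below
// omits it, so the prefetch is enqueued on the default stream (stream 0)
// rather than on the caller's `stream`.
// flags is reserved for future use and must be 0.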
if (size > 0) {
err = cudaMemPrefetchAsync(ptr, size, device, stream);
err = cudaMemPrefetchAsync(ptr, size, gpu_loc, 0);
if (err != cudaSuccess) {
// Non-fatal: prefetch failure shouldn't prevent allocation.
// Pages will still be migrated on demand.