Fix cudaMemPrefetchAsync for CUDA 13: use cudaMemLocation + flags=0 (no stream param)
This commit is contained in:
@@ -62,11 +62,11 @@ void* managed_malloc(size_t size, int device, cudaStream_t stream) {
     // Without this, the first write to each page faults into system RAM,
     // which causes OOM when the OS only has ~102 GiB after EGM carveout.
     //
-    // The prefetch is asynchronous on the given stream, so it won't block
-    // the calling thread. Subsequent operations on the same stream will
-    // wait for the prefetch to complete.
+    // CUDA 13+ signature: cudaMemPrefetchAsync(ptr, size, cudaMemLocation, flags)
+    // Note: no stream parameter in the cudaMemLocation overload.
+    // flags=0 means no special flags.
     if (size > 0) {
-        err = cudaMemPrefetchAsync(ptr, size, device, stream);
+        err = cudaMemPrefetchAsync(ptr, size, gpu_loc, 0);
         if (err != cudaSuccess) {
             // Non-fatal: prefetch failure shouldn't prevent allocation.
             // Pages will still be migrated on demand.
||||
Reference in New Issue
Block a user