diff --git a/managed_alloc.cu b/managed_alloc.cu
index 2ae0ef595..53e11858e 100644
--- a/managed_alloc.cu
+++ b/managed_alloc.cu
@@ -7,10 +7,11 @@
 // 2. cudaMemAdviseSetPreferredLocation(GPU) → driver prefers keeping pages on GPU
 // 3. cudaMemAdviseSetAccessedBy(CPU) → CPU can access over C2C NVLink without
 //    triggering page migration back to system RAM (critical: prevents OOM)
-// 4. NO prefetching — pages migrate on-demand via hardware page faults.
-//    Eager prefetching fills HBM+EGM and causes subsequent allocations
-//    to fail. On-demand migration is the correct behavior for unified
-//    memory with HBM + LPDDR EGM.
+// 4. Selective prefetching — small allocations (model weights, <2 GiB)
+//    are prefetched to GPU so cuBLAS/cuDNN kernels can access them
+//    directly from HBM. Large allocations (KV cache blocks) stay in
+//    managed memory and page-fault on demand, since they're too large
+//    to fit in HBM and attention ops can tolerate page faults.
 
 #include <cuda_runtime.h>
 #include <cstdio>
@@ -57,19 +58,25 @@ void* managed_malloc(size_t size, int device, cudaStream_t stream) {
     cpu_loc.id = cudaCpuDeviceId;
     cudaMemAdvise(ptr, size, cudaMemAdviseSetAccessedBy, cpu_loc);
 
-    // REMOVED: cudaMemPrefetchAsync — was causing allocation failures after
-    // model loading. Prefetching eagerly migrates ALL pages to GPU, filling
-    // up HBM+EGM. Once physical memory is consumed by prefetched pages, the
-    // next cudaMallocManaged call fails because the driver can't guarantee
-    // page-fault resolution for new allocations.
-    //
-    // On GH200 with EGM, the hardware handles page faults naturally via C2C
-    // NVLink. The cudaMemAdviseSetPreferredLocation(GPU) hint above tells
-    // the driver to prefer GPU placement, but allows fallback to LPDDR when
-    // HBM is full. That's exactly what we want — don't force it.
-    //
-    // Pages will migrate on-demand as they're accessed, which is the correct
-    // behavior for a unified memory system with 96 GiB HBM + 128+ GiB EGM.
+    // Selective prefetch: migrate pages to GPU for small allocations only.
+    // Model weights (individual tensors) are typically <2 GiB and benefit
+    // from being resident in HBM so cuBLAS GEMM kernels read them at full
+    // bandwidth instead of stalling on on-demand page migration mid-kernel.
+    // KV cache blocks are large and numerous; prefetching them all fills
+    // HBM and causes subsequent allocations to fail.
+    // The 2 GiB threshold separates "compute data" from "cache data".
+    const size_t PREFETCH_THRESHOLD = 2ULL * 1024 * 1024 * 1024; // 2 GiB
+
+    if (size > 0 && size < PREFETCH_THRESHOLD) {
+        err = cudaMemPrefetchAsync(ptr, size, gpu_loc, /*flags=*/0, stream);
+        if (err != cudaSuccess) {
+            // Non-fatal: prefetch failure shouldn't prevent allocation.
+            // Pages will still be migrated on demand.
+            fprintf(stderr, "[managed_alloc] cudaMemPrefetchAsync warning: %s "
+                    "(size=%.2f GiB, will use on-demand migration)\n",
+                    cudaGetErrorString(err), (double)size / (1024.0*1024.0*1024.0));
+        }
+    }
 
     return ptr;
 }