CMM: Fix OOM and subprocess crashes for GH200 EGM
Key changes:

- managed_alloc.cu: Add cudaMemPrefetchAsync to migrate pages to the GPU immediately (prevents OOM from system-RAM pinning on EGM systems where only ~102 GiB of RAM remains). Add cudaMemAdviseSetAccessedBy for the CPU so reads go over C2C NVLink without page migration.
- vllm_managed_mem.py: Rewrite with idempotent patches, a proper MemorySnapshot.measure() override, and torch.cuda tracking stubs for CUDAPluggableAllocator compatibility.
- sitecustomize.py: Auto-loaded by Python in ALL subprocesses (including the vLLM EngineCore). Applies the allocator swap, torch patches, MemorySnapshot override, and request_memory override before any CUDA operations run in spawned processes.
- Dockerfile: Install sitecustomize.py into the Python dist-packages directory.
- README.md: Full rewrite with the EGM problem statement, memory layout, architecture diagram, and build-pipeline documentation.
This commit is contained in:
@@ -1,6 +1,15 @@
|
||||
// managed_alloc.cu - cudaMallocManaged allocator for PyTorch
|
||||
// Compile: nvcc -shared -o libmanaged_alloc.so managed_alloc.cu -Xcompiler -fPIC
|
||||
// Compatible with CUDA 13+ (uses cudaMemLocation API)
|
||||
//
|
||||
// Key design decisions for GH200 EGM:
|
||||
// 1. cudaMallocManaged → allocations can page-fault across HBM + EGM
|
||||
// 2. cudaMemAdviseSetPreferredLocation(GPU) → driver prefers keeping pages on GPU
|
||||
// 3. cudaMemAdviseSetAccessedBy(CPU) → CPU can access over C2C NVLink without
|
||||
// triggering page migration back to system RAM (critical: prevents OOM)
|
||||
// 4. cudaMemPrefetchAsync(GPU) → actively migrates pages to GPU immediately,
|
||||
// so subsequent writes go to HBM/EGM, not system RAM (prevents OOM on
|
||||
// systems where EGM carved out most of system memory)
|
||||
#include <cuda_runtime.h>
|
||||
#include <stdio.h>
|
||||
|
||||
@@ -28,13 +37,44 @@ void* managed_malloc(size_t size, int device, cudaStream_t stream) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Advise the driver to prefer GPU placement initially.
|
||||
// On GH200 with EGM, the hardware will migrate pages as needed.
|
||||
// CUDA 13+ uses cudaMemLocation struct instead of int for device
|
||||
cudaMemLocation location;
|
||||
location.type = cudaMemLocationTypeDevice;
|
||||
location.id = device;
|
||||
cudaMemAdvise(ptr, size, cudaMemAdviseSetPreferredLocation, location);
|
||||
cudaMemLocation gpu_loc;
|
||||
gpu_loc.type = cudaMemLocationTypeDevice;
|
||||
gpu_loc.id = device;
|
||||
|
||||
// Advise: prefer GPU placement. On GH200 with EGM, the hardware will
|
||||
// migrate pages as needed, but the driver tries to keep them on GPU.
|
||||
cudaMemAdvise(ptr, size, cudaMemAdviseSetPreferredLocation, gpu_loc);
|
||||
|
||||
// Advise: CPU will access this memory too. On GH200, this sets up
|
||||
// remote mapping over C2C NVLink so CPU can read/write without
|
||||
// triggering page migration back to system RAM. This is CRITICAL
|
||||
// to prevent OOM on EGM systems where most system RAM was carved
|
||||
// out for the GPU.
|
||||
cudaMemLocation cpu_loc;
|
||||
cpu_loc.type = cudaMemLocationTypeHost;
|
||||
cpu_loc.id = cudaCpuDeviceId;
|
||||
cudaMemAdvise(ptr, size, cudaMemAdviseSetAccessedBy, cpu_loc);
|
||||
|
||||
// Prefetch to GPU immediately. This actively migrates the virtual
|
||||
// pages to the GPU side so that subsequent writes (e.g., model weight
|
||||
// loading) go directly to HBM/EGM instead of pinning system RAM.
|
||||
// Without this, the first write to each page faults into system RAM,
|
||||
// which causes OOM when the OS only has ~102 GiB after EGM carveout.
|
||||
//
|
||||
// The prefetch is asynchronous on the given stream, so it won't block
|
||||
// the calling thread. Subsequent operations on the same stream will
|
||||
// wait for the prefetch to complete.
|
||||
if (size > 0) {
|
||||
err = cudaMemPrefetchAsync(ptr, size, gpu_loc, stream);
|
||||
if (err != cudaSuccess) {
|
||||
// Non-fatal: prefetch failure shouldn't prevent allocation.
|
||||
// Pages will still be migrated on demand.
|
||||
fprintf(stderr, "[managed_alloc] cudaMemPrefetchAsync warning: %s "
|
||||
"(size=%.2f GiB, will use on-demand migration)\n",
|
||||
cudaGetErrorString(err), (double)size / (1024.0*1024.0*1024.0));
|
||||
}
|
||||
}
|
||||
|
||||
return ptr;
|
||||
}
|
||||
@@ -42,7 +82,8 @@ void* managed_malloc(size_t size, int device, cudaStream_t stream) {
|
||||
// PyTorch pluggable allocator signature: void(void*, size_t, int, cudaStream_t)
|
||||
void managed_free(void* ptr, size_t size, int device, cudaStream_t stream) {
|
||||
if (ptr != nullptr) {
|
||||
// Sync the stream before freeing to avoid use-after-free
|
||||
// Sync the stream before freeing to avoid use-after-free with
|
||||
// managed memory (in-flight page faults can race with deallocation).
|
||||
if (stream != nullptr) {
|
||||
cudaStreamSynchronize(stream);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user