CMM: Fix OOM and subprocess crashes for GH200 EGM
Key changes:

- managed_alloc.cu: Add cudaMemPrefetchAsync to migrate pages to the GPU immediately (prevents OOM from system-RAM pinning on EGM systems where only ~102 GiB of RAM remains). Add cudaMemAdviseSetAccessedBy for the CPU so reads go over C2C NVLink without page migration.
- vllm_managed_mem.py: Rewrite with idempotent patches, a proper MemorySnapshot.measure() override, and torch.cuda tracking stubs for CUDAPluggableAllocator compatibility.
- sitecustomize.py: Auto-loaded by Python in ALL subprocesses (including the vLLM EngineCore). Applies the allocator swap, torch patches, MemorySnapshot override, and request_memory override before any CUDA operations run in spawned processes.
- Dockerfile: Install sitecustomize.py into the Python dist-packages directory.
- README.md: Full rewrite with the EGM problem statement, memory layout, architecture diagram, and build-pipeline documentation.
This commit is contained in:
@@ -1,6 +1,15 @@
|
||||
// managed_alloc.cu - cudaMallocManaged allocator for PyTorch
|
||||
// Compile: nvcc -shared -o libmanaged_alloc.so managed_alloc.cu -Xcompiler -fPIC
|
||||
// Compatible with CUDA 13+ (uses cudaMemLocation API)
|
||||
//
|
||||
// Key design decisions for GH200 EGM:
|
||||
// 1. cudaMallocManaged → allocations can page-fault across HBM + EGM
|
||||
// 2. cudaMemAdviseSetPreferredLocation(GPU) → driver prefers keeping pages on GPU
|
||||
// 3. cudaMemAdviseSetAccessedBy(CPU) → CPU can access over C2C NVLink without
|
||||
// triggering page migration back to system RAM (critical: prevents OOM)
|
||||
// 4. cudaMemPrefetchAsync(GPU) → actively migrates pages to GPU immediately,
|
||||
// so subsequent writes go to HBM/EGM, not system RAM (prevents OOM on
|
||||
// systems where EGM carved out most of system memory)
|
||||
#include <cuda_runtime.h>
|
||||
#include <stdio.h>
|
||||
|
||||
@@ -28,13 +37,44 @@ void* managed_malloc(size_t size, int device, cudaStream_t stream) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Advise the driver to prefer GPU placement initially.
|
||||
// On GH200 with EGM, the hardware will migrate pages as needed.
|
||||
// CUDA 13+ uses cudaMemLocation struct instead of int for device
|
||||
cudaMemLocation location;
|
||||
location.type = cudaMemLocationTypeDevice;
|
||||
location.id = device;
|
||||
cudaMemAdvise(ptr, size, cudaMemAdviseSetPreferredLocation, location);
|
||||
cudaMemLocation gpu_loc;
|
||||
gpu_loc.type = cudaMemLocationTypeDevice;
|
||||
gpu_loc.id = device;
|
||||
|
||||
// Advise: prefer GPU placement. On GH200 with EGM, the hardware will
|
||||
// migrate pages as needed, but the driver tries to keep them on GPU.
|
||||
cudaMemAdvise(ptr, size, cudaMemAdviseSetPreferredLocation, gpu_loc);
|
||||
|
||||
// Advise: CPU will access this memory too. On GH200, this sets up
|
||||
// remote mapping over C2C NVLink so CPU can read/write without
|
||||
// triggering page migration back to system RAM. This is CRITICAL
|
||||
// to prevent OOM on EGM systems where most system RAM was carved
|
||||
// out for the GPU.
|
||||
cudaMemLocation cpu_loc;
|
||||
cpu_loc.type = cudaMemLocationTypeHost;
|
||||
cpu_loc.id = cudaCpuDeviceId;
|
||||
cudaMemAdvise(ptr, size, cudaMemAdviseSetAccessedBy, cpu_loc);
|
||||
|
||||
// Prefetch to GPU immediately. This actively migrates the virtual
|
||||
// pages to the GPU side so that subsequent writes (e.g., model weight
|
||||
// loading) go directly to HBM/EGM instead of pinning system RAM.
|
||||
// Without this, the first write to each page faults into system RAM,
|
||||
// which causes OOM when the OS only has ~102 GiB after EGM carveout.
|
||||
//
|
||||
// The prefetch is asynchronous on the given stream, so it won't block
|
||||
// the calling thread. Subsequent operations on the same stream will
|
||||
// wait for the prefetch to complete.
|
||||
if (size > 0) {
|
||||
err = cudaMemPrefetchAsync(ptr, size, gpu_loc, stream);
|
||||
if (err != cudaSuccess) {
|
||||
// Non-fatal: prefetch failure shouldn't prevent allocation.
|
||||
// Pages will still be migrated on demand.
|
||||
fprintf(stderr, "[managed_alloc] cudaMemPrefetchAsync warning: %s "
|
||||
"(size=%.2f GiB, will use on-demand migration)\n",
|
||||
cudaGetErrorString(err), (double)size / (1024.0*1024.0*1024.0));
|
||||
}
|
||||
}
|
||||
|
||||
return ptr;
|
||||
}
|
||||
@@ -42,7 +82,8 @@ void* managed_malloc(size_t size, int device, cudaStream_t stream) {
|
||||
// PyTorch pluggable allocator signature: void(void*, size_t, int, cudaStream_t)
|
||||
void managed_free(void* ptr, size_t size, int device, cudaStream_t stream) {
|
||||
if (ptr != nullptr) {
|
||||
// Sync the stream before freeing to avoid use-after-free
|
||||
// Sync the stream before freeing to avoid use-after-free with
|
||||
// managed memory (in-flight page faults can race with deallocation).
|
||||
if (stream != nullptr) {
|
||||
cudaStreamSynchronize(stream);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user