Add cudaMallocManaged allocator for GH200 EGM support
- managed_alloc.cu: PyTorch pluggable allocator using cudaMallocManaged - vllm_managed_mem.py: Launcher that patches vLLM for managed memory - Dockerfile: Build and install managed memory components This enables vLLM to use cudaMallocManaged for transparent page-fault access to both HBM (~96 GiB) and LPDDR (EGM, up to 480 GiB additional) on GH200 systems with Extended GPU Memory enabled. Experimental branch: v0.19.0-cmm
This commit is contained in:
48
vllm/managed_alloc.cu
Normal file
48
vllm/managed_alloc.cu
Normal file
@@ -0,0 +1,48 @@
|
||||
// managed_alloc.cu - cudaMallocManaged allocator for PyTorch
|
||||
// Compile: nvcc -shared -o libmanaged_alloc.so managed_alloc.cu -Xcompiler -fPIC
|
||||
#include <cuda_runtime.h>
|
||||
#include <stdio.h>
|
||||
|
||||
extern "C" {
|
||||
|
||||
// PyTorch pluggable allocator signature: void*(size_t, int, cudaStream_t)
|
||||
void* managed_malloc(size_t size, int device, cudaStream_t stream) {
|
||||
void* ptr = nullptr;
|
||||
|
||||
// Set the device before allocating
|
||||
cudaError_t err = cudaSetDevice(device);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "[managed_alloc] cudaSetDevice(%d) failed: %s\n",
|
||||
device, cudaGetErrorString(err));
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Use cudaMallocManaged - this is the key: allocations can page-fault
|
||||
// across HBM and LPDDR on GH200 with EGM enabled
|
||||
err = cudaMallocManaged(&ptr, size, cudaMemAttachGlobal);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "[managed_alloc] cudaMallocManaged failed: %s "
|
||||
"(size=%zu bytes / %.2f GiB)\n",
|
||||
cudaGetErrorString(err), size, (double)size / (1024.0*1024.0*1024.0));
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Advise the driver to prefer GPU placement initially.
|
||||
// On GH200 with EGM, the hardware will migrate pages as needed.
|
||||
cudaMemAdvise(ptr, size, cudaMemAdviseSetPreferredLocation, device);
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// PyTorch pluggable allocator signature: void(void*, size_t, int, cudaStream_t)
|
||||
void managed_free(void* ptr, size_t size, int device, cudaStream_t stream) {
|
||||
if (ptr != nullptr) {
|
||||
// Sync the stream before freeing to avoid use-after-free
|
||||
if (stream != nullptr) {
|
||||
cudaStreamSynchronize(stream);
|
||||
}
|
||||
cudaFree(ptr);
|
||||
}
|
||||
}
|
||||
|
||||
} // extern "C"
|
||||
Reference in New Issue
Block a user