Remove global allocator swap, use targeted KV cache managed allocation

sitecustomize.py: No longer swaps CUDAPluggableAllocator globally.
Sets VLLM_KV_CACHE_USE_MANAGED_MEMORY=1 instead.
vllm_managed_mem.py: No global allocator swap, no torch.cuda patches.
This commit is contained in:
2026-04-11 02:15:09 +00:00
parent 07468031db
commit bcc872c2c3
2 changed files with 25 additions and 72 deletions

View File

@@ -216,8 +216,13 @@ def patch_vllm_memory_check():
 def main():
-    # Step 1: Swap allocator BEFORE any CUDA ops
-    swap_allocator()
+    # Step 1: NO global allocator swap — model weights stay in HBM.
+    # KV cache uses cudaMallocManaged directly via
+    # VLLM_KV_CACHE_USE_MANAGED_MEMORY env var (set by sitecustomize.py).
+    # The global allocator swap broke cuBLAS GEMM operations because
+    # intermediate compute tensors ended up in managed memory.
+    print(f"[managed_mem] Using targeted KV cache managed allocation "
+          f"(no global allocator swap)", file=sys.stderr)

     # Step 2: Calculate total managed memory and export it
     total_managed_gb = get_total_managed_memory_gb()
@@ -231,8 +236,8 @@ def main():
     print(f"[managed_mem] MANAGED_MEMORY_TOTAL_GB={total_managed_gb:.0f}",
           file=sys.stderr)

-    # Step 3: Patch PyTorch memory tracking (pluggable allocator doesn't support all ops)
-    patch_torch_memory_tracking()
+    # Step 3: No torch.cuda memory tracking patches needed —
+    # we're not using CUDAPluggableAllocator anymore.

     # Step 4: Patch MemorySnapshot.measure() to report full managed memory
     # This is critical - without it, all downstream code only sees HBM