Remove global allocator swap, use targeted KV cache managed allocation

sitecustomize.py: No longer swaps CUDAPluggableAllocator globally.
Sets VLLM_KV_CACHE_USE_MANAGED_MEMORY=1 instead.
vllm_managed_mem.py: No global allocator swap, no torch.cuda patches.
This commit is contained in:
2026-04-11 02:15:09 +00:00
parent 07468031db
commit bcc872c2c3
2 changed files with 25 additions and 72 deletions

View File

@@ -216,8 +216,13 @@ def patch_vllm_memory_check():
 def main():
-    # Step 1: Swap allocator BEFORE any CUDA ops
-    swap_allocator()
+    # Step 1: NO global allocator swap — model weights stay in HBM.
+    # KV cache uses cudaMallocManaged directly via
+    # VLLM_KV_CACHE_USE_MANAGED_MEMORY env var (set by sitecustomize.py).
+    # The global allocator swap broke cuBLAS GEMM operations because
+    # intermediate compute tensors ended up in managed memory.
+    print(f"[managed_mem] Using targeted KV cache managed allocation "
+          f"(no global allocator swap)", file=sys.stderr)

     # Step 2: Calculate total managed memory and export it
     total_managed_gb = get_total_managed_memory_gb()
@@ -231,8 +236,8 @@ def main():
     print(f"[managed_mem] MANAGED_MEMORY_TOTAL_GB={total_managed_gb:.0f}",
           file=sys.stderr)

-    # Step 3: Patch PyTorch memory tracking (pluggable allocator doesn't support all ops)
-    patch_torch_memory_tracking()
+    # Step 3: No torch.cuda memory tracking patches needed —
+    # we're not using CUDAPluggableAllocator anymore.

     # Step 4: Patch MemorySnapshot.measure() to report full managed memory
     # This is critical - without it, all downstream code only sees HBM