Remove global allocator swap, use targeted KV cache managed allocation
sitecustomize.py: No longer swaps CUDAPluggableAllocator globally. Sets VLLM_KV_CACHE_USE_MANAGED_MEMORY=1 instead. vllm_managed_mem.py: No global allocator swap, no torch.cuda patches.
This commit is contained in:
@@ -216,8 +216,13 @@ def patch_vllm_memory_check():


 def main():
-    # Step 1: Swap allocator BEFORE any CUDA ops
-    swap_allocator()
+    # Step 1: NO global allocator swap — model weights stay in HBM.
+    # KV cache uses cudaMallocManaged directly via
+    # VLLM_KV_CACHE_USE_MANAGED_MEMORY env var (set by sitecustomize.py).
+    # The global allocator swap broke cuBLAS GEMM operations because
+    # intermediate compute tensors ended up in managed memory.
+    print(f"[managed_mem] Using targeted KV cache managed allocation "
+          f"(no global allocator swap)", file=sys.stderr)

     # Step 2: Calculate total managed memory and export it
     total_managed_gb = get_total_managed_memory_gb()
@@ -231,8 +236,8 @@ def main():
     print(f"[managed_mem] MANAGED_MEMORY_TOTAL_GB={total_managed_gb:.0f}",
           file=sys.stderr)

-    # Step 3: Patch PyTorch memory tracking (pluggable allocator doesn't support all ops)
-    patch_torch_memory_tracking()
+    # Step 3: No torch.cuda memory tracking patches needed —
+    # we're not using CUDAPluggableAllocator anymore.

     # Step 4: Patch MemorySnapshot.measure() to report full managed memory
     # This is critical - without it, all downstream code only sees HBM
Reference in New Issue
Block a user