[BUGFIX][Mamba][Qwen3.5] Zero freed SSM cache blocks on GPU (#35219)

Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
2026-03-10 14:32:20 +04:00
parent 507ddbe992
commit 4ff8c3c8f9
10 changed files with 287 additions and 8 deletions
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -556,6 +556,14 @@ class Worker(WorkerBase):
        else:
            self.model_runner.initialize_kv_cache(kv_cache_config)

+        # Build KV-zero metadata outside the CuMem pool so the bookkeeping
+        # GPU tensors (seg_addrs, block-id buffers) use the standard PyTorch
+        # allocator and are not discarded during sleep/wake cycles.
+        if kv_cache_config.needs_kv_cache_zeroing and hasattr(
+            self.model_runner, "_init_kv_zero_meta"
+        ):
+            self.model_runner._init_kv_zero_meta()
+
    @instrument(span_name="Warmup (GPU)")
    def compile_or_warm_up_model(self) -> float:
        warmup_sizes: list[int] = []