[BUGFIX][Mamba][Qwen3.5] Zero freed SSM cache blocks on GPU (#35219)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
This commit is contained in:
@@ -556,6 +556,14 @@ class Worker(WorkerBase):
|
||||
else:
|
||||
self.model_runner.initialize_kv_cache(kv_cache_config)
|
||||
|
||||
# Build KV-zero metadata outside the CuMem pool so the bookkeeping
|
||||
# GPU tensors (seg_addrs, block-id buffers) use the standard PyTorch
|
||||
# allocator and are not discarded during sleep/wake cycles.
|
||||
if kv_cache_config.needs_kv_cache_zeroing and hasattr(
|
||||
self.model_runner, "_init_kv_zero_meta"
|
||||
):
|
||||
self.model_runner._init_kv_zero_meta()
|
||||
|
||||
@instrument(span_name="Warmup (GPU)")
|
||||
def compile_or_warm_up_model(self) -> float:
|
||||
warmup_sizes: list[int] = []
|
||||
|
||||
Reference in New Issue
Block a user