[Hybrid] Enable mamba prefix cache "align" mode with async scheduling (#33997)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2026-02-14 22:15:56 +01:00
parent 73391a1baa
commit d5fe3f702c
4 changed files with 77 additions and 32 deletions
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -648,11 +648,6 @@ class VllmConfig:
                    "`external_launcher` distributed executor backend, but you chose "
                    f"`{executor_backend}`."
                )
-            if self.cache_config.mamba_cache_mode != "none":
-                raise ValueError(
-                    "Currently, async scheduling is not compatible with "
-                    "prefix caching for Mamba models."
-                )
        elif self.scheduler_config.async_scheduling is None:
            # Enable async scheduling unless there is an incompatible option.
            if (
@@ -685,13 +680,6 @@ class VllmConfig:
                    scope="local",
                )
                self.scheduler_config.async_scheduling = False
-            elif self.cache_config.mamba_cache_mode != "none":
-                logger.warning_once(
-                    "Async scheduling is not compatible with "
-                    "prefix caching for Mamba models and will be disabled.",
-                    scope="local",
-                )
-                self.scheduler_config.async_scheduling = False
            else:
                self.scheduler_config.async_scheduling = True