[V1] [Hybrid] Disable prefix caching by default for hybrid or mamba-based models (#23716)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-27 14:51:54 +02:00
parent a403d0fa41
commit 704432af3c
2 changed files with 11 additions and 8 deletions
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -292,12 +292,13 @@ class MambaModelConfig(VerifyAndUpdateConfig):
            return

        model_config = vllm_config.model_config
+        cache_config = vllm_config.cache_config
        compilation_config = vllm_config.compilation_config

-        model_cls, _ = ModelRegistry.resolve_model_cls(
-            model_config.architecture,
-            model_config=model_config,
-        )
+        # TODO(tdoublep): remove once hybrid/mamba prefix caching is supported
+        cache_config.enable_prefix_caching = False
+        logger.info("Hybrid or mamba-based model detected: disabling prefix "
+                    "caching since it is not yet supported.")

        # TODO(tdoublep): remove as full cuda graph support is added
        FCG_NOT_SUPPORTED_MODELS = [