[V1] [Hybrid] Disable prefix caching by default for hybrid or mamba-based models (#23716)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
This commit is contained in:
Thomas Parnell
2025-08-27 14:51:54 +02:00
committed by GitHub
parent a403d0fa41
commit 704432af3c
2 changed files with 11 additions and 8 deletions

View File

@@ -292,12 +292,13 @@ class MambaModelConfig(VerifyAndUpdateConfig):
return
model_config = vllm_config.model_config
cache_config = vllm_config.cache_config
compilation_config = vllm_config.compilation_config
model_cls, _ = ModelRegistry.resolve_model_cls(
model_config.architecture,
model_config=model_config,
)
# TODO(tdoublep): remove once prefix caching is enabled
cache_config.enable_prefix_caching = False
logger.info("Hybrid or mamba-based model detected: disabling prefix "
"caching since it is not yet supported.")
# TODO(tdoublep): remove as full cuda graph support is added
FCG_NOT_SUPPORTED_MODELS = [