[V1] [Hybrid] Disable prefix caching by default for hybrid or mamba-based models (#23716)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
This commit is contained in:
@@ -292,12 +292,13 @@ class MambaModelConfig(VerifyAndUpdateConfig):
|
||||
return
|
||||
|
||||
model_config = vllm_config.model_config
|
||||
cache_config = vllm_config.cache_config
|
||||
compilation_config = vllm_config.compilation_config
|
||||
|
||||
model_cls, _ = ModelRegistry.resolve_model_cls(
|
||||
model_config.architecture,
|
||||
model_config=model_config,
|
||||
)
|
||||
# TODO(tdoublep): remove once prefix caching is enabled
|
||||
cache_config.enable_prefix_caching = False
|
||||
logger.info("Hybrid or mamba-based model detected: disabling prefix "
|
||||
"caching since it is not yet supported.")
|
||||
|
||||
# TODO(tdoublep): remove as full cuda graph support is added
|
||||
FCG_NOT_SUPPORTED_MODELS = [
|
||||
|
||||
Reference in New Issue
Block a user