[V1] [Hybrid] Refactor mamba state shape calculation; enable V1 via cli (#20840)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
This commit is contained in:
@@ -1630,6 +1630,9 @@ class CacheConfig:
|
||||
checkpoint if available. Otherwise, the scales will default to 1.0."""
|
||||
cpu_kvcache_space_bytes: Optional[int] = None
|
||||
"""(CPU backend only) CPU key-value cache space."""
|
||||
mamba_page_size_padded: Optional[int] = None
|
||||
""" Optional override for mamba page size; used by hybrid mamba/attention
|
||||
models to ensure exact alignment with attention page size."""
|
||||
|
||||
# Will be set after profiling.
|
||||
num_gpu_blocks: Optional[int] = field(default=None, init=False)
|
||||
@@ -4882,11 +4885,15 @@ class VllmConfig:
|
||||
if architecture is None:
|
||||
return
|
||||
|
||||
from vllm.model_executor.models.config import MODELS_CONFIG_MAP
|
||||
from vllm.model_executor.models.config import (
|
||||
MODELS_CONFIG_MAP, HybridAttentionMambaModelConfig)
|
||||
cls = MODELS_CONFIG_MAP.get(architecture, None)
|
||||
if cls is not None:
|
||||
cls.verify_and_update_config(self)
|
||||
|
||||
if self.model_config.is_hybrid:
|
||||
HybridAttentionMambaModelConfig.verify_and_update_config(self)
|
||||
|
||||
if self.model_config.task == "classify":
|
||||
# Maybe convert ForCausalLM into ForSequenceClassification model.
|
||||
from vllm.model_executor.models.adapters import (
|
||||
|
||||
Reference in New Issue
Block a user