[V1] [Hybrid] Refactor mamba state shape calculation; enable V1 via cli (#20840)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
This commit is contained in:
Thomas Parnell
2025-07-15 13:04:35 +02:00
committed by GitHub
parent c586b55667
commit 3534c39a20
14 changed files with 441 additions and 353 deletions

View File

@@ -1630,6 +1630,9 @@ class CacheConfig:
checkpoint if available. Otherwise, the scales will default to 1.0."""
cpu_kvcache_space_bytes: Optional[int] = None
"""(CPU backend only) CPU key-value cache space."""
mamba_page_size_padded: Optional[int] = None
""" Optional override for mamba page size; used by hybrid mamba/attention
models to ensure exact alignment with attention page size."""
# Will be set after profiling.
num_gpu_blocks: Optional[int] = field(default=None, init=False)
@@ -4882,11 +4885,15 @@ class VllmConfig:
if architecture is None:
return
from vllm.model_executor.models.config import MODELS_CONFIG_MAP
from vllm.model_executor.models.config import (
MODELS_CONFIG_MAP, HybridAttentionMambaModelConfig)
cls = MODELS_CONFIG_MAP.get(architecture, None)
if cls is not None:
cls.verify_and_update_config(self)
if self.model_config.is_hybrid:
HybridAttentionMambaModelConfig.verify_and_update_config(self)
if self.model_config.task == "classify":
# Maybe convert ForCausalLM into ForSequenceClassification model.
from vllm.model_executor.models.adapters import (