[V1] [Hybrid] Mamba1 Automatic Prefix Caching (#26377)
Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com>
parent 73444b7b56
commit 00b31a36a2
@@ -299,7 +299,7 @@ class MambaModelConfig(VerifyAndUpdateConfig):
         if model_config.supports_mamba_prefix_caching:
             logger.info(
                 "Warning: Prefix caching is currently enabled. "
-                "Its support for Mamba2 layers is experimental. "
+                "Its support for Mamba layers is experimental. "
                 "Please report any issues you may observe."
             )
         else:
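The config-level check above is what gates the feature at startup. For context, a minimal usage sketch of what this PR enables, turning on vLLM's prefix caching for a Mamba1 checkpoint (the model id below is illustrative, not taken from this PR):

```python
from vllm import LLM, SamplingParams

# Prefix caching is opt-in; with this PR, Mamba1 models that implement
# SupportsMambaPrefixCaching accept it instead of asserting at init time.
llm = LLM(
    model="state-spaces/mamba-130m-hf",  # illustrative Mamba1 checkpoint
    enable_prefix_caching=True,
)

# Requests sharing a long common prefix can reuse the cached prefix state
# instead of recomputing it from scratch.
shared_prefix = "You are a helpful assistant. " * 50
outputs = llm.generate(
    [
        shared_prefix + "Summarize what prefix caching does.",
        shared_prefix + "Explain the Mamba state-space model briefly.",
    ],
    SamplingParams(max_tokens=64),
)
for out in outputs:
    print(out.outputs[0].text)
```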
@@ -38,7 +38,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.llama import LlamaMLP as JambaMLP
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import HasInnerState, IsHybrid, SupportsLoRA, SupportsPP
+from .interfaces import (
+    HasInnerState,
+    IsHybrid,
+    SupportsLoRA,
+    SupportsMambaPrefixCaching,
+    SupportsPP,
+)
 from .utils import (
     AutoWeightsLoader,
     WeightsMapper,
@@ -454,7 +460,14 @@ class JambaModel(nn.Module):
         return loaded_params
 
 
-class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, IsHybrid):
+class JambaForCausalLM(
+    nn.Module,
+    HasInnerState,
+    SupportsLoRA,
+    SupportsPP,
+    IsHybrid,
+    SupportsMambaPrefixCaching,
+):
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_substr={".self_attn.": ".", ".A_log": ".A"},
     )
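The hunks above only show the model opting in; the interface itself lives in vllm/model_executor/models/interfaces.py and its definition is not part of this diff. vLLM's capability interfaces generally pair a marker class with a class-level flag, so a plausible sketch of the pattern looks like the following (the flag name and helper function are assumptions, not the PR's actual code):

```python
from typing import ClassVar, Literal, Protocol, runtime_checkable


@runtime_checkable
class SupportsMambaPrefixCaching(Protocol):
    """Marker protocol: the model's Mamba layers can serve prefix-cache hits.

    Sketch only -- the real definition in interfaces.py may differ.
    """

    # Assumed flag, mirroring the model_config.supports_mamba_prefix_caching
    # check seen in the first hunk.
    supports_mamba_prefix_caching: ClassVar[Literal[True]] = True


def supports_mamba_prefix_caching(model_cls: type) -> bool:
    # Hypothetical helper showing how the flag could be queried.
    return getattr(model_cls, "supports_mamba_prefix_caching", False)
```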
@@ -477,12 +490,8 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, IsHyb
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         lora_config = vllm_config.lora_config
         scheduler_config = vllm_config.scheduler_config
-        assert not cache_config.enable_prefix_caching, (
-            "Jamba currently does not support prefix caching"
-        )
-
         super().__init__()
         self.config = config
@@ -29,6 +29,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.interfaces import (
     HasInnerState,
     IsAttentionFree,
+    SupportsMambaPrefixCaching,
     SupportsPP,
 )
 from vllm.sequence import IntermediateTensors
@@ -193,15 +194,13 @@ class MambaModel(nn.Module):
         return loaded_params
 
 
-class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP):
+class MambaForCausalLM(
+    nn.Module, HasInnerState, IsAttentionFree, SupportsPP, SupportsMambaPrefixCaching
+):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         lora_config = vllm_config.lora_config
         self.scheduler_config = vllm_config.scheduler_config
-        assert not cache_config.enable_prefix_caching, (
-            "Mamba does not support prefix caching"
-        )
-
         super().__init__()
         self.config = config
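With both model classes now inheriting the interface, the per-model `assert not cache_config.enable_prefix_caching` guards removed above become redundant: the decision is made once, at config time, via the `supports_mamba_prefix_caching` check in the first hunk. A quick, hedged way to confirm the opt-in from Python (the attribute name queried here is an assumption mirroring that config flag):

```python
from vllm.model_executor.models.jamba import JambaForCausalLM
from vllm.model_executor.models.mamba import MambaForCausalLM

# Both classes in this diff now list SupportsMambaPrefixCaching as a base;
# the flag name is an assumption about the interface's shape.
for cls in (JambaForCausalLM, MambaForCausalLM):
    flag = getattr(cls, "supports_mamba_prefix_caching", False)
    print(f"{cls.__name__}: supports_mamba_prefix_caching = {flag}")
```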