[V1][Hybrid] Mamba Prefix Caching with align mode (#30877)
Signed-off-by: huanghaoyan.hhy <huanghaoyan.hhy@alibaba-inc.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
This commit is contained in:
@@ -35,6 +35,8 @@ from vllm.model_executor.layers.linear import (
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.mamba.linear_attn import MiniMaxText01LinearAttention
|
||||
from vllm.model_executor.layers.mamba.mamba_utils import (
|
||||
MambaStateCopyFunc,
|
||||
MambaStateCopyFuncCalculator,
|
||||
MambaStateDtypeCalculator,
|
||||
MambaStateShapeCalculator,
|
||||
)
|
||||
@@ -1006,3 +1008,7 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid):
|
||||
tp_size=parallel_config.tensor_parallel_size,
|
||||
head_dim=hf_config.head_dim,
|
||||
)
|
||||
|
||||
@classmethod
def get_mamba_state_copy_func(cls) -> tuple[MambaStateCopyFunc]:
    """Return the state-copy functions used for this model's Mamba state.

    Delegates entirely to
    ``MambaStateCopyFuncCalculator.linear_attention_state_copy_func()`` and
    returns its result unchanged.
    """
    # NOTE(review): presumably these functions are what the prefix-caching
    # path uses to copy linear-attention state -- confirm in mamba_utils.
    copy_funcs = MambaStateCopyFuncCalculator.linear_attention_state_copy_func()
    return copy_funcs
|
||||
|
||||
Reference in New Issue
Block a user