[Attention] Refactor AttentionMetadata Preparation for Encoder-only Models (#23154)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
@@ -203,6 +203,14 @@ class MambaSpec(KVCacheSpec):
         return self.page_size_bytes
 
 
+@dataclass(frozen=True)
+class EncoderOnlyAttentionSpec(AttentionSpec):
+
+    def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
+        # Encoder-only layers do not need KV cache
+        return 0
+
+
 @dataclass
 class KVCacheTensor:
     """
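Note: the sketch below illustrates why an encoder-only spec can report zero memory: encoder-only attention has no autoregressive decode step, so nothing is ever written to or read back from a KV cache, and the cache planner can skip it entirely. This is a minimal, self-contained sketch, not vLLM's actual classes; VllmConfig, AttentionSpec, plan_kv_cache, the page size, and the block size here are simplified stand-ins.

from dataclasses import dataclass


@dataclass(frozen=True)
class VllmConfig:
    # Hypothetical stand-in for the real config; only the field we need.
    max_model_len: int = 8192


@dataclass(frozen=True)
class AttentionSpec:
    # Hypothetical 2 MiB KV-cache page per block of tokens.
    page_size_bytes: int = 2 * 1024 * 1024

    def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
        # Decoder layers: worst case is one page per 16-token block
        # of the maximum context length (ceil division).
        blocks = -(-vllm_config.max_model_len // 16)
        return blocks * self.page_size_bytes


@dataclass(frozen=True)
class EncoderOnlyAttentionSpec(AttentionSpec):
    def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
        # Encoder-only layers do not need KV cache.
        return 0


def plan_kv_cache(specs: list[AttentionSpec], cfg: VllmConfig) -> int:
    """Sum the worst-case KV-cache demand across all attention layers."""
    return sum(spec.max_memory_usage_bytes(cfg) for spec in specs)


if __name__ == "__main__":
    cfg = VllmConfig()
    layers = [AttentionSpec(), EncoderOnlyAttentionSpec(), AttentionSpec()]
    # The encoder-only layer contributes 0 bytes to the allocation plan.
    print(plan_kv_cache(layers, cfg))

A usage note: because max_memory_usage_bytes is the only hook the planner consults in this sketch, overriding it to return 0 is sufficient to exclude encoder-only layers from allocation without special-casing them elsewhere.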