[Misc] Refactor get_kv_cache_spec into AttentionLayerBase (#26587)
Signed-off-by: NickLucche <nlucches@redhat.com>
This commit is contained in:
@@ -481,7 +481,7 @@ class DeepseekV32IndexerCache(torch.nn.Module, AttentionLayerBase):
|
||||
raise ValueError(f"Duplicate layer name: {prefix}")
|
||||
compilation_config.static_forward_context[prefix] = self
|
||||
|
||||
def get_kv_cache_spec(self) -> KVCacheSpec:
|
||||
def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec:
|
||||
return MLAAttentionSpec( # Only has one vector instead of K + V
|
||||
block_size=self.cache_config.block_size,
|
||||
num_kv_heads=1,
|
||||
|
||||
Reference in New Issue
Block a user