[Attention] Make local attention backend agnostic (#21093)

Lucas Wilkinson
2025-07-18 00:10:42 -04:00
committed by GitHub
parent b9a21e9173
commit 89cab4d01f
8 changed files with 94 additions and 242 deletions

@@ -125,6 +125,21 @@ class FullAttentionSpec(AttentionSpec):
        return merged_spec

@dataclass
class ChunkedLocalAttentionSpec(AttentionSpec):
    attention_chunk_size: int

    def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
        max_model_len = vllm_config.model_config.max_model_len
        return cdiv(max_model_len, self.block_size) * self.page_size_bytes

    @property
    def type_id(self) -> str:
        return (
            f"local_attention_{self.attention_chunk_size}_{self.block_size}_{self.page_size_bytes}"
        )  # noqa

@dataclass
class SlidingWindowSpec(AttentionSpec):
    sliding_window: int
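
For context, a minimal sketch (not part of the diff) of what the new spec's memory bound computes. The `cdiv` helper is vLLM's ceiling-division utility; the concrete block and page sizes below are hypothetical, chosen only for illustration:

    def cdiv(a: int, b: int) -> int:
        # Ceiling division, matching the helper used by max_memory_usage_bytes.
        return -(a // -b)

    # Hypothetical sizes: an 8192-token max model length with 16-token blocks
    # and 2 KiB pages needs ceil(8192 / 16) = 512 blocks, so the KV-cache
    # bound is 512 * 2048 bytes = 1 MiB.
    max_model_len, block_size, page_size_bytes = 8192, 16, 2048
    assert cdiv(max_model_len, block_size) * page_size_bytes == 1 * 1024 * 1024

Note that, as written in the diff, the bound is taken over the full max_model_len rather than attention_chunk_size, so chunked local attention is sized as conservatively as full attention; the chunk size instead distinguishes caches via the `type_id` string (e.g. a chunk of 128 with the sizes above would yield "local_attention_128_16_2048").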