[Attention][UX][1/N] Add AttentionConfig and change attention env vars to CLI arguments (#26315)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
This commit is contained in:
Matthew Bonanni
2025-12-05 12:48:43 -05:00
committed by GitHub
parent dff0a2b394
commit 66e674cdd5
22 changed files with 367 additions and 325 deletions

View File

@@ -210,9 +210,6 @@ class TritonAttentionImpl(AttentionImpl):
def fused_output_quant_supported(self, quant_key: QuantKey):
return quant_key == kFp8StaticTensorSym
def supports_quant_query_input(self) -> bool:
return current_platform.is_cuda()
def __init__(
self,
num_heads: int,
@@ -262,6 +259,8 @@ class TritonAttentionImpl(AttentionImpl):
f"num_heads: {num_heads}."
)
self.supports_quant_query_input = current_platform.is_cuda()
def forward(
self,
layer: torch.nn.Module,