[Perf] Refactor cudagraph_support to enable full CUDA graphs for spec decoding with FlashInfer (#28479)

Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
This commit is contained in:
Benjamin Chislett
2025-11-12 11:56:40 -05:00
committed by GitHub
parent a742134cc5
commit 304419576a
18 changed files with 71 additions and 41 deletions

View File

@@ -67,7 +67,7 @@ class TritonAttentionMetadata:
class TritonAttentionMetadataBuilder(AttentionMetadataBuilder[TritonAttentionMetadata]):
- cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.ALWAYS
+ _cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.ALWAYS
def __init__(
self,