[Spec Decode] Enable FlashInfer Spec Decoding (#25196)

Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
Co-authored-by: lhsjohn <huashuoli@tencent.com>
This commit is contained in:
Benjamin Chislett
2025-09-23 22:29:58 -04:00
committed by GitHub
parent 77d906995c
commit c30b405b8f
12 changed files with 250 additions and 49 deletions

View File

@@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
-from typing import ClassVar
import torch
@@ -35,7 +34,7 @@ class LinearAttentionMetadata:
class LinearAttentionMetadataBuilder(
AttentionMetadataBuilder[LinearAttentionMetadata]):
-    reorder_batch_threshold: ClassVar[int] = 1
+    reorder_batch_threshold: int = 1
def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
vllm_config: VllmConfig, device: torch.device):