[Spec Decode] Enable FlashInfer Spec Decoding (#25196)

Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
Co-authored-by: lhsjohn <huashuoli@tencent.com>
This commit is contained in:
Benjamin Chislett
2025-09-23 22:29:58 -04:00
committed by GitHub
parent 77d906995c
commit c30b405b8f
12 changed files with 250 additions and 49 deletions

View File

@@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
-from typing import ClassVar
import torch
@@ -35,7 +34,7 @@ class LinearAttentionMetadata:
class LinearAttentionMetadataBuilder(
AttentionMetadataBuilder[LinearAttentionMetadata]):
-    reorder_batch_threshold: ClassVar[int] = 1
+    reorder_batch_threshold: int = 1
def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
vllm_config: VllmConfig, device: torch.device):