[Spec Decode] Enable FlashInfer Spec Decoding (#25196)
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
Co-authored-by: lhsjohn <huashuoli@tencent.com>
commit c30b405b8f
parent 77d906995c
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
-from typing import ClassVar
 
 import torch
 
@@ -35,7 +34,7 @@ class LinearAttentionMetadata:
 class LinearAttentionMetadataBuilder(
         AttentionMetadataBuilder[LinearAttentionMetadata]):
 
-    reorder_batch_threshold: ClassVar[int] = 1
+    reorder_batch_threshold: int = 1
 
     def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
                  vllm_config: VllmConfig, device: torch.device):
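A minimal sketch (not from this PR) of why the diff drops ClassVar: a ClassVar annotation tells type checkers the attribute lives on the class only, so a per-instance override such as `self.reorder_batch_threshold = n` is rejected. A plain annotation keeps the same class-level default while letting each builder instance adjust it, which is what spec decoding needs when a threshold must account for extra speculative tokens. The class names, `threshold`, and `num_spec_tokens` below are hypothetical.

from typing import ClassVar


class ClassLevelOnly:
    # ClassVar: type checkers flag any `self.threshold = ...` assignment.
    threshold: ClassVar[int] = 1


class PerInstanceDefault:
    # Plain annotation: acts as a class default, overridable per instance.
    threshold: int = 1

    def __init__(self, num_spec_tokens: int) -> None:
        # e.g. widen the threshold when speculating extra tokens per request
        self.threshold = 1 + num_spec_tokens


builder = PerInstanceDefault(num_spec_tokens=3)
assert builder.threshold == 4
assert PerInstanceDefault.threshold == 1  # class default is untouched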
||||