[Core] NGram GPU Implementation compatible with Async Scheduler (#29184)

This commit is contained in:
PatchyTIS
2026-03-08 05:51:37 +08:00
committed by GitHub
parent ee54f9cdb9
commit a6be75dbd2
9 changed files with 940 additions and 12 deletions

View File

@@ -47,6 +47,7 @@ MTPModelTypes = Literal[
"step3p5_mtp",
]
# Literal of "eagle", "eagle3", "extract_hidden_states" plus every member of
# MTPModelTypes (nested Literal types are flattened by `typing`).
EagleModelTypes = Literal["eagle", "eagle3", "extract_hidden_states", MTPModelTypes]
# Literal whose only member is "ngram_gpu". Kept as its own alias so that
# membership can be tested with `get_args(NgramGPUTypes)`.
NgramGPUTypes = Literal["ngram_gpu"]
SpeculativeMethod = Literal[
"ngram",
"medusa",
@@ -54,6 +55,7 @@ SpeculativeMethod = Literal[
"draft_model",
"suffix",
EagleModelTypes,
NgramGPUTypes,
]
@@ -364,6 +366,8 @@ class SpeculativeConfig:
self.quantization = self.target_model_config.quantization
elif self.method in ("ngram", "[ngram]"):
self.model = "ngram"
elif self.method == "ngram_gpu":
self.model = "ngram_gpu"
elif self.method == "suffix":
self.model = "suffix"
elif self.method == "extract_hidden_states":
@@ -374,8 +378,9 @@ class SpeculativeConfig:
)
if self.method in ("ngram", "[ngram]"):
# Unified to "ngram" internally
self.method = "ngram"
if self.method in ("ngram", "ngram_gpu"):
# Set default values if not provided
if self.prompt_lookup_min is None and self.prompt_lookup_max is None:
# TODO(woosuk): Tune these values. They are arbitrarily chosen.
@@ -832,6 +837,9 @@ class SpeculativeConfig:
def uses_extract_hidden_states(self) -> bool:
    """Report whether ``self.method`` equals ``"extract_hidden_states"``."""
    is_extract_hidden_states = self.method == "extract_hidden_states"
    return is_extract_hidden_states
def use_ngram_gpu(self) -> bool:
    """Report whether ``self.method`` equals ``"ngram_gpu"``."""
    is_ngram_gpu = self.method == "ngram_gpu"
    return is_ngram_gpu
def __repr__(self) -> str:
method = self.method
model = (

View File

@@ -41,7 +41,7 @@ from .offload import OffloadConfig
from .parallel import ParallelConfig
from .profiler import ProfilerConfig
from .scheduler import SchedulerConfig
from .speculative import EagleModelTypes, SpeculativeConfig
from .speculative import EagleModelTypes, NgramGPUTypes, SpeculativeConfig
from .structured_outputs import StructuredOutputsConfig
from .utils import SupportsHash, config, replace
from .weight_transfer import WeightTransferConfig
@@ -696,11 +696,13 @@ class VllmConfig:
if self.speculative_config is not None:
if (
self.speculative_config.method not in get_args(EagleModelTypes)
and self.speculative_config.method not in get_args(NgramGPUTypes)
and self.speculative_config.method != "draft_model"
):
raise ValueError(
"Currently, async scheduling is only supported "
"with EAGLE/MTP/Draft Model kind of speculative decoding."
"with EAGLE/MTP/Draft Model/NGram GPU kind of "
"speculative decoding"
)
if self.speculative_config.disable_padded_drafter_batch:
raise ValueError(
@@ -718,6 +720,7 @@ class VllmConfig:
if (
self.speculative_config is not None
and self.speculative_config.method not in get_args(EagleModelTypes)
and self.speculative_config.method not in get_args(NgramGPUTypes)
):
logger.warning_once(
"Async scheduling not supported with %s-based "