[Core] NGram GPU Implementation compatible with Async Scheduler (#29184)

This commit is contained in:
PatchyTIS
2026-03-08 05:51:37 +08:00
committed by GitHub
parent ee54f9cdb9
commit a6be75dbd2
9 changed files with 940 additions and 12 deletions

View File

@@ -41,7 +41,7 @@ from .offload import OffloadConfig
from .parallel import ParallelConfig
from .profiler import ProfilerConfig
from .scheduler import SchedulerConfig
-from .speculative import EagleModelTypes, SpeculativeConfig
+from .speculative import EagleModelTypes, NgramGPUTypes, SpeculativeConfig
from .structured_outputs import StructuredOutputsConfig
from .utils import SupportsHash, config, replace
from .weight_transfer import WeightTransferConfig
@@ -696,11 +696,13 @@ class VllmConfig:
if self.speculative_config is not None:
if (
self.speculative_config.method not in get_args(EagleModelTypes)
+and self.speculative_config.method not in get_args(NgramGPUTypes)
and self.speculative_config.method != "draft_model"
):
raise ValueError(
"Currently, async scheduling is only supported "
-"with EAGLE/MTP/Draft Model kind of speculative decoding."
+"with EAGLE/MTP/Draft Model/NGram GPU kind of "
+"speculative decoding"
)
if self.speculative_config.disable_padded_drafter_batch:
raise ValueError(
@@ -718,6 +720,7 @@ class VllmConfig:
if (
self.speculative_config is not None
and self.speculative_config.method not in get_args(EagleModelTypes)
+and self.speculative_config.method not in get_args(NgramGPUTypes)
):
logger.warning_once(
"Async scheduling not supported with %s-based "