[Core] NGram GPU Implementation compatible with Async Scheduler (#29184)

2026-03-08 05:51:37 +08:00
parent ee54f9cdb9
commit a6be75dbd2
9 changed files with 940 additions and 12 deletions
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -41,7 +41,7 @@ from .offload import OffloadConfig
 from .parallel import ParallelConfig
 from .profiler import ProfilerConfig
 from .scheduler import SchedulerConfig
-from .speculative import EagleModelTypes, SpeculativeConfig
+from .speculative import EagleModelTypes, NgramGPUTypes, SpeculativeConfig
 from .structured_outputs import StructuredOutputsConfig
 from .utils import SupportsHash, config, replace
 from .weight_transfer import WeightTransferConfig
@@ -696,11 +696,13 @@ class VllmConfig:
            if self.speculative_config is not None:
                if (
                    self.speculative_config.method not in get_args(EagleModelTypes)
+                    and self.speculative_config.method not in get_args(NgramGPUTypes)
                    and self.speculative_config.method != "draft_model"
                ):
                    raise ValueError(
                        "Currently, async scheduling is only supported "
-                        "with EAGLE/MTP/Draft Model kind of speculative decoding."
+                        "with EAGLE/MTP/Draft Model/NGram GPU kind of "
+                        "speculative decoding"
                    )
                if self.speculative_config.disable_padded_drafter_batch:
                    raise ValueError(
@@ -718,6 +720,7 @@ class VllmConfig:
            if (
                self.speculative_config is not None
                and self.speculative_config.method not in get_args(EagleModelTypes)
+                and self.speculative_config.method not in get_args(NgramGPUTypes)
            ):
                logger.warning_once(
                    "Async scheduling not supported with %s-based "