[Core] NGram GPU Implementation compatible with Async Scheduler (#29184)

2026-03-08 05:51:37 +08:00
parent ee54f9cdb9
commit a6be75dbd2
9 changed files with 940 additions and 12 deletions
--- a/vllm/config/speculative.py
+++ b/vllm/config/speculative.py
@@ -47,6 +47,7 @@ MTPModelTypes = Literal[
    "step3p5_mtp",
 ]
 EagleModelTypes = Literal["eagle", "eagle3", "extract_hidden_states", MTPModelTypes]
+NgramGPUTypes = Literal["ngram_gpu"]
 SpeculativeMethod = Literal[
    "ngram",
    "medusa",
@@ -54,6 +55,7 @@ SpeculativeMethod = Literal[
    "draft_model",
    "suffix",
    EagleModelTypes,
+    NgramGPUTypes,
 ]


@@ -364,6 +366,8 @@ class SpeculativeConfig:
                    self.quantization = self.target_model_config.quantization
            elif self.method in ("ngram", "[ngram]"):
                self.model = "ngram"
+            elif self.method == "ngram_gpu":
+                self.model = "ngram_gpu"
            elif self.method == "suffix":
                self.model = "suffix"
            elif self.method == "extract_hidden_states":
@@ -374,8 +378,9 @@ class SpeculativeConfig:
                )

        if self.method in ("ngram", "[ngram]"):
-            # Unified to "ngram" internally
            self.method = "ngram"
+
+        if self.method in ("ngram", "ngram_gpu"):
            # Set default values if not provided
            if self.prompt_lookup_min is None and self.prompt_lookup_max is None:
                # TODO(woosuk): Tune these values. They are arbitrarily chosen.
@@ -832,6 +837,9 @@ class SpeculativeConfig:
    def uses_extract_hidden_states(self) -> bool:
        return self.method == "extract_hidden_states"

+    def use_ngram_gpu(self) -> bool:
+        return self.method == "ngram_gpu"
+
    def __repr__(self) -> str:
        method = self.method
        model = (
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -41,7 +41,7 @@ from .offload import OffloadConfig
 from .parallel import ParallelConfig
 from .profiler import ProfilerConfig
 from .scheduler import SchedulerConfig
-from .speculative import EagleModelTypes, SpeculativeConfig
+from .speculative import EagleModelTypes, NgramGPUTypes, SpeculativeConfig
 from .structured_outputs import StructuredOutputsConfig
 from .utils import SupportsHash, config, replace
 from .weight_transfer import WeightTransferConfig
@@ -696,11 +696,13 @@ class VllmConfig:
            if self.speculative_config is not None:
                if (
                    self.speculative_config.method not in get_args(EagleModelTypes)
+                    and self.speculative_config.method not in get_args(NgramGPUTypes)
                    and self.speculative_config.method != "draft_model"
                ):
                    raise ValueError(
                        "Currently, async scheduling is only supported "
-                        "with EAGLE/MTP/Draft Model kind of speculative decoding."
+                        "with EAGLE/MTP/Draft Model/NGram GPU kind of "
+                        "speculative decoding"
                    )
                if self.speculative_config.disable_padded_drafter_batch:
                    raise ValueError(
@@ -718,6 +720,7 @@ class VllmConfig:
            if (
                self.speculative_config is not None
                and self.speculative_config.method not in get_args(EagleModelTypes)
+                and self.speculative_config.method not in get_args(NgramGPUTypes)
            ):
                logger.warning_once(
                    "Async scheduling not supported with %s-based "