[Core] NGram GPU Implementation compatible with Async Scheduler (#29184)
This commit is contained in:
@@ -47,6 +47,7 @@ MTPModelTypes = Literal[
|
||||
"step3p5_mtp",
|
||||
]
|
||||
EagleModelTypes = Literal["eagle", "eagle3", "extract_hidden_states", MTPModelTypes]
|
||||
NgramGPUTypes = Literal["ngram_gpu"]
|
||||
SpeculativeMethod = Literal[
|
||||
"ngram",
|
||||
"medusa",
|
||||
@@ -54,6 +55,7 @@ SpeculativeMethod = Literal[
|
||||
"draft_model",
|
||||
"suffix",
|
||||
EagleModelTypes,
|
||||
NgramGPUTypes,
|
||||
]
|
||||
|
||||
|
||||
@@ -364,6 +366,8 @@ class SpeculativeConfig:
|
||||
self.quantization = self.target_model_config.quantization
|
||||
elif self.method in ("ngram", "[ngram]"):
|
||||
self.model = "ngram"
|
||||
elif self.method == "ngram_gpu":
|
||||
self.model = "ngram_gpu"
|
||||
elif self.method == "suffix":
|
||||
self.model = "suffix"
|
||||
elif self.method == "extract_hidden_states":
|
||||
@@ -374,8 +378,9 @@ class SpeculativeConfig:
|
||||
)
|
||||
|
||||
if self.method in ("ngram", "[ngram]"):
|
||||
# Unified to "ngram" internally
|
||||
self.method = "ngram"
|
||||
|
||||
if self.method in ("ngram", "ngram_gpu"):
|
||||
# Set default values if not provided
|
||||
if self.prompt_lookup_min is None and self.prompt_lookup_max is None:
|
||||
# TODO(woosuk): Tune these values. They are arbitrarily chosen.
|
||||
@@ -832,6 +837,9 @@ class SpeculativeConfig:
|
||||
def uses_extract_hidden_states(self) -> bool:
|
||||
return self.method == "extract_hidden_states"
|
||||
|
||||
def use_ngram_gpu(self) -> bool:
|
||||
return self.method == "ngram_gpu"
|
||||
|
||||
def __repr__(self) -> str:
|
||||
method = self.method
|
||||
model = (
|
||||
|
||||
@@ -41,7 +41,7 @@ from .offload import OffloadConfig
|
||||
from .parallel import ParallelConfig
|
||||
from .profiler import ProfilerConfig
|
||||
from .scheduler import SchedulerConfig
|
||||
from .speculative import EagleModelTypes, SpeculativeConfig
|
||||
from .speculative import EagleModelTypes, NgramGPUTypes, SpeculativeConfig
|
||||
from .structured_outputs import StructuredOutputsConfig
|
||||
from .utils import SupportsHash, config, replace
|
||||
from .weight_transfer import WeightTransferConfig
|
||||
@@ -696,11 +696,13 @@ class VllmConfig:
|
||||
if self.speculative_config is not None:
|
||||
if (
|
||||
self.speculative_config.method not in get_args(EagleModelTypes)
|
||||
and self.speculative_config.method not in get_args(NgramGPUTypes)
|
||||
and self.speculative_config.method != "draft_model"
|
||||
):
|
||||
raise ValueError(
|
||||
"Currently, async scheduling is only supported "
|
||||
"with EAGLE/MTP/Draft Model kind of speculative decoding."
|
||||
"with EAGLE/MTP/Draft Model/NGram GPU kind of "
|
||||
"speculative decoding"
|
||||
)
|
||||
if self.speculative_config.disable_padded_drafter_batch:
|
||||
raise ValueError(
|
||||
@@ -718,6 +720,7 @@ class VllmConfig:
|
||||
if (
|
||||
self.speculative_config is not None
|
||||
and self.speculative_config.method not in get_args(EagleModelTypes)
|
||||
and self.speculative_config.method not in get_args(NgramGPUTypes)
|
||||
):
|
||||
logger.warning_once(
|
||||
"Async scheduling not supported with %s-based "
|
||||
|
||||
Reference in New Issue
Block a user