[Bugfix] Lazy import NgramProposer in GPU model runner (#32821)

Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
This commit is contained in:
22quinn
2026-01-27 21:07:16 -08:00
committed by GitHub
parent 35fb0b8613
commit a2b877df6c

View File

@@ -150,7 +150,6 @@ from vllm.v1.spec_decode.draft_model import DraftModelProposer
from vllm.v1.spec_decode.eagle import EagleProposer
from vllm.v1.spec_decode.medusa import MedusaProposer
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
from vllm.v1.spec_decode.suffix_decoding import SuffixDecodingProposer
from vllm.v1.structured_output.utils import apply_grammar_bitmask
from vllm.v1.utils import CpuGpuBuffer, record_function_or_nullcontext
@@ -185,6 +184,7 @@ from .utils import (
if TYPE_CHECKING:
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
logger = init_logger(__name__)
@@ -439,13 +439,15 @@ class GPUModelRunner(
# layers in the draft model.
if self.speculative_config and get_pp_group().is_last_rank:
self.drafter: (
NgramProposer
NgramProposer # noqa: F823
| SuffixDecodingProposer
| EagleProposer
| DraftModelProposer
| MedusaProposer
)
if self.speculative_config.method == "ngram":
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
self.drafter = NgramProposer(self.vllm_config)
elif self.speculative_config.uses_draft_model():
self.drafter = DraftModelProposer(
@@ -3848,6 +3850,8 @@ class GPUModelRunner(
spec_config = self.speculative_config
assert spec_config is not None
if spec_config.method == "ngram":
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
assert isinstance(sampled_token_ids, list)
assert isinstance(self.drafter, NgramProposer)
draft_token_ids = self.drafter.propose(