From d0b402974ffa2c26090ab0d816288b4bcd09f761 Mon Sep 17 00:00:00 2001 From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Date: Fri, 13 Mar 2026 16:33:19 -0400 Subject: [PATCH] [Bugfix][Spec Decode] Avoid double call of Ngram CPU (#36952) Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> --- vllm/v1/worker/gpu_model_runner.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b53bd71a1..f092a47fe 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -4247,15 +4247,6 @@ class GPUModelRunner( self.input_batch.token_ids_cpu, slot_mappings=slot_mappings, ) - if isinstance(self.drafter, NgramProposer): - assert isinstance(sampled_token_ids, list), ( - "sampled_token_ids should be a python list when ngram is used." - ) - draft_token_ids = self.drafter.propose( - sampled_token_ids, - self.input_batch.num_tokens_no_spec, - self.input_batch.token_ids_cpu, - ) elif spec_config.use_ngram_gpu(): assert isinstance(self.drafter, NgramProposerGPU) (