diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index bef67486d..4c919b392 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1355,10 +1355,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): cumulative_index += 1 + num_spec_tokens grammar_bitmask = sorted_bitmask - # If the grammar bitmask and the logits have the same shape + # If the number of out indices equals the first dimension of the logits # we don't need to pass indices to the kernel, # since the bitmask is already aligned with the logits. - skip_out_indices = grammar_bitmask.shape[0] == logits.shape[0] + skip_out_indices = len(out_indices) == logits.shape[0] # Serialization of np.ndarray is much more efficient than a tensor, # so we receive it in that format.