From e9c83cdc51f7d9fc642599057ec490e291ea7be3 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 19 Jan 2026 22:20:19 -0800 Subject: [PATCH] [Model Runner V2] Skip kernel launch for penalties & logit_bias (#32634) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu/sample/logit_bias.py | 19 ++++++++++++++++++- vllm/v1/worker/gpu/sample/penalties.py | 17 +++++++++++++++-- vllm/v1/worker/gpu/sample/sampler.py | 4 ++-- 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/vllm/v1/worker/gpu/sample/logit_bias.py b/vllm/v1/worker/gpu/sample/logit_bias.py index f959b36e4..6cd55a7e5 100644 --- a/vllm/v1/worker/gpu/sample/logit_bias.py +++ b/vllm/v1/worker/gpu/sample/logit_bias.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import numpy as np import torch from vllm.sampling_params import SamplingParams @@ -49,12 +50,18 @@ class LogitBiasState: device=device, ) + # Using any of the above. + self.use_logit_bias = np.zeros(max_num_reqs, dtype=bool) + def add_request( self, req_idx: int, prompt_len: int, sampling_params: SamplingParams, ) -> None: + # Using any logit bias. + use_logit_bias = False + # Allowed token IDs. allowed_token_ids = sampling_params.allowed_token_ids if allowed_token_ids: @@ -66,6 +73,7 @@ class LogitBiasState: ) self.num_allowed_token_ids.np[req_idx] = num_allowed_token_ids self.allowed_token_ids.stage_write(req_idx, 0, allowed_token_ids) + use_logit_bias = True else: self.num_allowed_token_ids.np[req_idx] = 0 @@ -81,6 +89,7 @@ class LogitBiasState: self.num_logit_bias.np[req_idx] = num_logit_bias self.logit_bias_token_ids.stage_write(req_idx, 0, logit_bias.keys()) self.logit_bias.stage_write(req_idx, 0, logit_bias.values()) + use_logit_bias = True else: self.num_logit_bias.np[req_idx] = 0 @@ -89,7 +98,7 @@ class LogitBiasState: min_len = prompt_len + min_tokens self.min_lens.np[req_idx] = min_len stop_token_ids = sampling_params.all_stop_token_ids - if stop_token_ids: + if min_tokens > 0 and stop_token_ids: num_stop_token_ids = len(stop_token_ids) if num_stop_token_ids > MAX_NUM_STOP_TOKEN_IDS: raise ValueError( @@ -98,9 +107,12 @@ class LogitBiasState: ) self.num_stop_token_ids.np[req_idx] = num_stop_token_ids self.stop_token_ids.stage_write(req_idx, 0, stop_token_ids) + use_logit_bias = True else: self.num_stop_token_ids.np[req_idx] = 0 + self.use_logit_bias[req_idx] = use_logit_bias + def apply_staged_writes(self) -> None: self.num_allowed_token_ids.copy_to_uva() self.allowed_token_ids.apply_write() @@ -117,8 +129,13 @@ class LogitBiasState: self, logits: torch.Tensor, idx_mapping: torch.Tensor, + idx_mapping_np: np.ndarray, pos: torch.Tensor, ) -> None: + if not np.any(self.use_logit_bias[idx_mapping_np]): + # No request uses logit bias. Skip the kernel launch. + return + apply_logit_bias( logits, idx_mapping, diff --git a/vllm/v1/worker/gpu/sample/penalties.py b/vllm/v1/worker/gpu/sample/penalties.py index 4f0ce905f..2e6194df5 100644 --- a/vllm/v1/worker/gpu/sample/penalties.py +++ b/vllm/v1/worker/gpu/sample/penalties.py @@ -18,6 +18,7 @@ class PenaltiesState: self.repetition_penalty = UvaBackedTensor(max_num_reqs, dtype=torch.float32) self.frequency_penalty = UvaBackedTensor(max_num_reqs, dtype=torch.float32) self.presence_penalty = UvaBackedTensor(max_num_reqs, dtype=torch.float32) + self.use_penalty = np.zeros(max_num_reqs, dtype=bool) # Initialize repetition penalty manually because 0 is an invalid value for it. self.repetition_penalty.np.fill(1.0) @@ -42,7 +43,10 @@ class PenaltiesState: self.repetition_penalty.np[req_idx] = sampling_params.repetition_penalty self.frequency_penalty.np[req_idx] = sampling_params.frequency_penalty self.presence_penalty.np[req_idx] = sampling_params.presence_penalty - if use_penalty(sampling_params): + + do_penalty = use_penalty(sampling_params) + self.use_penalty[req_idx] = do_penalty + if do_penalty: self._penalties_reqs.append(req_idx) def apply_staged_writes( @@ -66,7 +70,16 @@ class PenaltiesState: self.frequency_penalty.copy_to_uva() self.presence_penalty.copy_to_uva() - def apply_penalties(self, logits: torch.Tensor, idx_mapping: torch.Tensor) -> None: + def apply_penalties( + self, + logits: torch.Tensor, + idx_mapping: torch.Tensor, + idx_mapping_np: np.ndarray, + ) -> None: + if not np.any(self.use_penalty[idx_mapping_np]): + # No request uses penalties. Skip the kernel launch. + return + apply_penalties( logits, idx_mapping, diff --git a/vllm/v1/worker/gpu/sample/sampler.py b/vllm/v1/worker/gpu/sample/sampler.py index b8d2f0c5a..c2944fac1 100644 --- a/vllm/v1/worker/gpu/sample/sampler.py +++ b/vllm/v1/worker/gpu/sample/sampler.py @@ -104,10 +104,10 @@ class Sampler: logits = torch.empty_like(logits, dtype=torch.float32).copy_(logits) # Apply logit bias (e.g., allowed_token_ids, min_tokens) in place. - self.logit_bias_state.apply_logit_bias(logits, idx_mapping, pos) + self.logit_bias_state.apply_logit_bias(logits, idx_mapping, idx_mapping_np, pos) # Apply penalties in place. - self.penalties_state.apply_penalties(logits, idx_mapping) + self.penalties_state.apply_penalties(logits, idx_mapping, idx_mapping_np) # Apply temperature in place. apply_temperature(logits, idx_mapping, self.sampling_states.temperature.gpu)