[V1] Refactor num_computed_tokens logic (#15307)

Signed-off-by: Cody Yu <hao.yu.cody@gmail.com> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-26 21:54:36 -07:00
parent fb22be5817
commit 54aa619459
5 changed files with 106 additions and 57 deletions
--- a/vllm/v1/sample/rejection_sampler.py
+++ b/vllm/v1/sample/rejection_sampler.py
@@ -107,14 +107,33 @@ class RejectionSampler(nn.Module):
    @staticmethod
    def parse_output(
        output_token_ids: torch.Tensor,
+        ignored_req_idxs: list[int],
        vocab_size: int,
    ) -> list[list[int]]:
+        """Parse the output of the rejection sampler.
+
+        Args:
+            output_token_ids: The sampled token IDs in shape
+                [batch_size, max_spec_len + 1]. The rejected tokens are
+                replaced with `PLACEHOLDER_TOKEN_ID` by the rejection sampler
+                and will be filtered out in this function.
+            ignored_req_idxs: The indices of the requests that should not be
+                sampled. This is usually because the request is still in the
+                prefill phase.
+            vocab_size: The size of the vocabulary.
+
+        Returns:
+            Per-request lists of valid token IDs ([] for ignored requests).
+        """
        output_token_ids_np = output_token_ids.cpu().numpy()
        # Create mask for valid tokens.
        valid_mask = ((output_token_ids_np != PLACEHOLDER_TOKEN_ID) &
                      (output_token_ids_np < vocab_size))
+
+        ignored_req_idx_set = set(ignored_req_idxs)
        outputs = [
            row[valid_mask[i]].tolist()
+            if i not in ignored_req_idx_set else []
            for i, row in enumerate(output_token_ids_np)
        ]
        return outputs