[Redo] #26368 (#28771)

Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-11-15 14:47:41 +08:00
parent 6965ef436f
commit 98b4d389ed
15 changed files with 122 additions and 91 deletions
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -1010,8 +1010,8 @@ class Scheduler(SchedulerInterface):
                continue

            req_index = model_runner_output.req_id_to_index[req_id]
-            generated_token_ids = (
-                sampled_token_ids[req_index] if sampled_token_ids else []
+            generated_token_ids: list[int] = (
+                sampled_token_ids[req_index].tolist() if sampled_token_ids else []
            )

            scheduled_spec_token_ids = (
--- a/vllm/v1/outputs.py
+++ b/vllm/v1/outputs.py
@@ -158,7 +158,7 @@ class ModelRunnerOutput:
    # num_generated_tokens is the number of tokens
    # generated in the current step. It can be different for
    # each request due to speculative/jump decoding.
-    sampled_token_ids: list[list[int]]
+    sampled_token_ids: list[np.ndarray]

    # [num_reqs, max_num_logprobs + 1]
    # [num_reqs, max_num_logprobs + 1]
@@ -220,7 +220,7 @@ def make_empty_encoder_model_runner_output(
    req_id_to_index: dict[str, int] = {rid: idx for idx, rid in enumerate(req_ids)}

    # No tokens generated yet ⇒ one empty list per request
-    sampled_token_ids: list[list[int]] = [[0] for _ in req_ids]
+    sampled_token_ids: list[list[int]] = [np.array([0]) for _ in req_ids]

    # Pooler outputs are not available yet ⇒ use None placeholders
    pooler_output: list[torch.Tensor | None] = [None for _ in req_ids]
--- a/vllm/v1/sample/rejection_sampler.py
+++ b/vllm/v1/sample/rejection_sampler.py
@@ -3,6 +3,7 @@

 from dataclasses import replace

+import numpy as np
 import torch
 import torch.nn as nn

@@ -204,7 +205,7 @@ class RejectionSampler(nn.Module):
    def parse_output(
        output_token_ids: torch.Tensor,
        vocab_size: int,
-    ) -> list[list[int]]:
+    ) -> list[np.ndarray]:
        """Parse the output of the rejection sampler.
        Args:
            output_token_ids: The sampled token IDs in shape
@@ -220,10 +221,7 @@ class RejectionSampler(nn.Module):
        valid_mask = (output_token_ids_np != PLACEHOLDER_TOKEN_ID) & (
            output_token_ids_np < vocab_size
        )
-        outputs = [
-            row[valid_mask[i]].tolist() for i, row in enumerate(output_token_ids_np)
-        ]
-        return outputs
+        return [row[valid_mask[i]] for i, row in enumerate(output_token_ids_np)]

    def apply_logits_processors(
        self,
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -484,7 +484,7 @@ class EagleProposer:

    def prepare_next_token_ids_cpu(
        self,
-        sampled_token_ids: list[list[int]],
+        sampled_token_ids: list[np.ndarray],
        requests: dict[str, CachedRequestState],
        gpu_input_batch: InputBatch,
        num_scheduled_tokens: dict[str, int],
@@ -499,7 +499,7 @@ class EagleProposer:
        req_ids = gpu_input_batch.req_ids
        next_token_ids: list[int] = []
        for i, token_ids in enumerate(sampled_token_ids):
-            if token_ids:
+            if token_ids.shape[0] > 0:
                # Common case.
                next_token_id = token_ids[-1]
            else:
@@ -510,10 +510,9 @@ class EagleProposer:
                seq_len = req_state.num_computed_tokens + num_scheduled_tokens[req_id]
                next_token_id = req_state.get_token_id(seq_len)
            next_token_ids.append(next_token_id)
-        next_token_ids = torch.tensor(
+        return torch.tensor(
            next_token_ids, dtype=torch.int32, device=self.input_ids.device
        )
-        return next_token_ids

    def prepare_next_token_ids_padded(
        self,
--- a/vllm/v1/spec_decode/ngram_proposer.py
+++ b/vllm/v1/spec_decode/ngram_proposer.py
@@ -54,7 +54,7 @@ class NgramProposer:
        # Trigger Numba JIT compilation for N-gram proposer.
        # This usually takes less than 1 second.
        self.propose(
-            [[]] * 1024,
+            [np.array([])] * 1024,
            [""] * 1024,
            np.zeros(1024, dtype=np.int32),
            np.zeros((1024, self.max_model_len), dtype=np.int32),
@@ -131,7 +131,7 @@ class NgramProposer:

    def propose(
        self,
-        sampled_token_ids: list[list[int]],
+        sampled_token_ids: list[np.ndarray],
        req_ids: list[str],
        num_tokens_no_spec: np.ndarray,
        token_ids_cpu: np.ndarray,
@@ -140,7 +140,7 @@ class NgramProposer:
        # find which requests need ngram proposals
        valid_ngram_requests = []
        for i, sampled_ids in enumerate(sampled_token_ids):
-            num_sampled_ids = len(sampled_ids)
+            num_sampled_ids = sampled_ids.shape[0]
            if not num_sampled_ids:
                # Skip speculative decoding.
                continue
--- a/vllm/v1/spec_decode/suffix_decoding.py
+++ b/vllm/v1/spec_decode/suffix_decoding.py
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import numpy as np
+
 from vllm.config import VllmConfig
 from vllm.v1.worker.gpu_input_batch import InputBatch

@@ -32,16 +34,16 @@ class SuffixDecodingProposer:
    def propose(
        self,
        input_batch: InputBatch,
-        sampled_token_ids: list[list[int]],
+        sampled_token_ids: list[np.ndarray],
    ) -> list[list[int]]:
        """
        Propose speculative tokens for each request in the input batch. Suffix Decoding
        will speculate a dynamic number of tokens for each request every decoding step,
        so each entry in the returned list may have different lengths.
        """
-        draft_token_ids: list[list[int]] = []
+        draft_token_ids: list[np.ndarray] = []
        for i, sampled_ids in enumerate(sampled_token_ids):
-            if not sampled_ids:
+            if sampled_ids.shape[0] == 0:
                # Skip speculative decoding for partial prefills.
                draft_token_ids.append([])
                continue
@@ -70,7 +72,7 @@ class SuffixDecodingProposer:
                self.suffix_cache.start_request(req_id, prompt_token_ids)

            # Append the newly sampled ids to the suffix cache for this request.
-            self.suffix_cache.add_active_response(req_id, sampled_ids)
+            self.suffix_cache.add_active_response(req_id, sampled_ids.tolist())

            # Suffix decoding only uses the most recent tokens up to max_tree_depth, so
            # we extract the pattern from the end of the input.
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -216,9 +216,11 @@ class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput):
        del self._logprobs_tensors
        del self._sampled_token_ids

-        valid_sampled_token_ids = self.sampled_token_ids_cpu.tolist()
+        valid_sampled_token_ids: list[np.ndarray] = [
+            row for row in self.sampled_token_ids_cpu.numpy()
+        ]
        for i in self._invalid_req_indices:
-            valid_sampled_token_ids[i].clear()
+            valid_sampled_token_ids[i] = np.array([])

        output = self._model_runner_output
        output.sampled_token_ids = valid_sampled_token_ids
@@ -2339,7 +2341,7 @@ class GPUModelRunner(
    ) -> tuple[
        dict[str, int],
        LogprobsLists | None,
-        list[list[int]],
+        list[np.ndarray],
        dict[str, LogprobsTensors | None],
        list[str],
        dict[str, int],
@@ -2365,6 +2367,7 @@ class GPUModelRunner(
        num_sampled_tokens = sampler_output.sampled_token_ids.shape[0]
        sampled_token_ids = sampler_output.sampled_token_ids
        invalid_req_indices = []
+        valid_sampled_token_ids: list[np.ndarray]
        if not self.use_async_scheduling:
            # Get the valid generated tokens.
            max_gen_len = sampled_token_ids.shape[-1]
@@ -2379,7 +2382,7 @@ class GPUModelRunner(
                )
            # Mask out the sampled tokens that should not be sampled.
            for i in discard_sampled_tokens_req_indices:
-                valid_sampled_token_ids[int(i)].clear()
+                valid_sampled_token_ids[int(i)] = np.array([])
        else:
            valid_sampled_token_ids = []
            invalid_req_indices = discard_sampled_tokens_req_indices.tolist()
@@ -2407,19 +2410,24 @@ class GPUModelRunner(
            [0] if spec_decode_metadata and logprobs_tensors else None
        )
        for req_idx in range(num_sampled_tokens):
+            sampled_ids: np.ndarray | None
            if self.use_async_scheduling:
-                sampled_ids = [-1] if req_idx not in invalid_req_indices_set else None
+                sampled_ids = (
+                    np.array([-1]) if req_idx not in invalid_req_indices_set else None
+                )
            else:
                sampled_ids = valid_sampled_token_ids[req_idx]

-            num_sampled_ids: int = len(sampled_ids) if sampled_ids else 0
+            num_sampled_ids: int = (
+                sampled_ids.shape[0] if sampled_ids is not None else 0
+            )

            if cu_num_accepted_tokens is not None:
                cu_num_accepted_tokens.append(
                    cu_num_accepted_tokens[-1] + num_sampled_ids
                )

-            if not sampled_ids:
+            if sampled_ids is None or num_sampled_ids == 0:
                continue

            start_idx = self.input_batch.num_tokens_no_spec[req_idx]
@@ -2761,7 +2769,9 @@ class GPUModelRunner(
        with record_function_or_nullcontext("gpu_model_runner: sample"):
            sampler_output = self._sample(logits, spec_decode_metadata)

-        def propose_draft_token_ids(sampled_token_ids):
+        def propose_draft_token_ids(
+            sampled_token_ids: torch.Tensor | list[np.ndarray],
+        ) -> None:
            assert spec_decode_common_attn_metadata is not None
            with record_function_or_nullcontext("gpu_model_runner: draft"):
                self._draft_token_ids = self.propose_draft_token_ids(
@@ -2883,14 +2893,14 @@ class GPUModelRunner(
    def propose_draft_token_ids(
        self,
        scheduler_output: "SchedulerOutput",
-        sampled_token_ids: torch.Tensor | list[list[int]],
+        sampled_token_ids: torch.Tensor | list[np.ndarray],
        sampling_metadata: SamplingMetadata,
        hidden_states: torch.Tensor,
        sample_hidden_states: torch.Tensor,
        aux_hidden_states: list[torch.Tensor] | None,
        spec_decode_metadata: SpecDecodeMetadata | None,
        common_attn_metadata: CommonAttentionMetadata,
-    ) -> list[list[int]] | torch.Tensor:
+    ) -> torch.Tensor | list[list[int]]:
        num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
        if self.speculative_config.method == "ngram":
            assert isinstance(sampled_token_ids, list)
@@ -2922,7 +2932,7 @@ class GPUModelRunner(
                for num_draft, tokens in zip(
                    spec_decode_metadata.num_draft_tokens, sampled_token_ids
                ):
-                    indices.append(offset + len(tokens) - 1)
+                    indices.append(offset + tokens.shape[0] - 1)
                    offset += num_draft + 1
                indices = torch.tensor(indices, device=self.device)
                hidden_states = sample_hidden_states[indices]
@@ -4862,7 +4872,7 @@ class GPUModelRunner(

        return kv_cache_spec

-    def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]:
+    def _to_list(self, sampled_token_ids: torch.Tensor) -> list[np.ndarray]:
        # This is a short term mitigation for issue mentioned in
        # https://github.com/vllm-project/vllm/issues/22754.
        # `tolist` would trigger a cuda wise stream sync, which
@@ -4875,4 +4885,4 @@ class GPUModelRunner(
        pinned.copy_(sampled_token_ids, non_blocking=True)
        self.transfer_event.record()
        self.transfer_event.synchronize()
-        return pinned.tolist()
+        return [row for row in pinned.numpy()]
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -1254,13 +1254,15 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):

        max_gen_len = selected_token_ids.shape[-1]
        if max_gen_len == 1:
-            valid_sampled_token_ids = selected_token_ids.tolist()
+            valid_sampled_token_ids: list[np.ndarray] = [
+                row for row in selected_token_ids.numpy()
+            ]

            # Mask out the sampled tokens that should not be sampled.
            # TODO: Keep in sync with gpu_model_runner.py, in particular
            #       the "else" case here
            for i in discard_sampled_tokens_req_indices:
-                valid_sampled_token_ids[i].clear()
+                valid_sampled_token_ids[i] = np.array([])

            # Append sampled tokens
            for i, req_state, seq_len in request_seq_lens:
@@ -1273,7 +1275,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
            valid_mask = selected_token_ids != INVALID_TOKEN_ID
            gen_lens = valid_mask.sum(dim=1).tolist()
            valid_sampled_token_ids = [
-                seq.tolist() for seq in selected_token_ids[valid_mask].split(gen_lens)
+                seq.numpy() for seq in selected_token_ids[valid_mask].split(gen_lens)
            ]
            self.input_batch.num_tokens[:num_reqs] += gen_lens
            for i, req_state, seq_len in request_seq_lens: