[Model Runner V2] Support num NaNs in logits (#30187)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-12-09 10:00:49 -08:00
parent 9e6562a3f6
commit d471b2aff0
7 changed files with 89 additions and 26 deletions
--- a/vllm/v1/worker/gpu/async_utils.py
+++ b/vllm/v1/worker/gpu/async_utils.py
@@ -2,14 +2,15 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from contextlib import contextmanager

+import numpy as np
 import torch

 from vllm.v1.outputs import (
    AsyncModelRunnerOutput,
    LogprobsTensors,
    ModelRunnerOutput,
-    SamplerOutput,
 )
+from vllm.v1.worker.gpu.sample.output import SamplerOutput


 class AsyncOutput(AsyncModelRunnerOutput):
@@ -34,29 +35,18 @@ class AsyncOutput(AsyncModelRunnerOutput):
        with torch.cuda.stream(self.copy_stream):
            self.copy_stream.wait_stream(default_stream)

-            # NOTE(woosuk): We must ensure that CPU tensors are not freed
-            # before the device-to-host copy is fully completed. For instance,
-            # operations like
-            # self.sampled_token_np = ...to("cpu", non_blocking=True).numpy()
-            # are unsafe because the underlying CPU tensor can be prematurely freed and
-            # reused by other tensors before the asynchronous copy finishes, potentially
-            # causing race conditions. To prevent this, we delay freeing by holding
-            # references until the copy event signals completion.
-            # Likewise, we also need to keep the reference to the GPU tensors.
-            # This is done by keeping the reference to sampler_output and
-            # model_runner_output.
-            self.sampled_token_ids = sampler_output.sampled_token_ids.to(
-                "cpu", non_blocking=True
-            )
+            self.sampled_token_ids = async_copy_to_np(sampler_output.sampled_token_ids)
            if sampler_output.logprobs_tensors is not None:
                self.logprobs_tensors: LogprobsTensors | None = (
                    sampler_output.logprobs_tensors.to_cpu_nonblocking()
                )
            else:
                self.logprobs_tensors = None
-            self.num_sampled_tokens_cpu = num_sampled_tokens.to(
-                "cpu", non_blocking=True
-            )
+            if sampler_output.num_nans is not None:
+                self.num_nans = async_copy_to_np(sampler_output.num_nans)
+            else:
+                self.num_nans = None
+            self.num_sampled_tokens_np = async_copy_to_np(num_sampled_tokens)
            self.prompt_logprobs_dict: dict[str, LogprobsTensors | None] = {}
            if self.model_runner_output.prompt_logprobs_dict:
                for k, v in self.model_runner_output.prompt_logprobs_dict.items():
@@ -68,7 +58,6 @@ class AsyncOutput(AsyncModelRunnerOutput):

    def get_output(self) -> ModelRunnerOutput:
        self.copy_event.synchronize()
-        num_sampled_tokens_np = self.num_sampled_tokens_cpu.numpy()

        # NOTE(woosuk): The following code is to ensure compatibility with
        # the existing model runner.
@@ -76,10 +65,18 @@ class AsyncOutput(AsyncModelRunnerOutput):
        # rather than Python lists.
        sampled_token_ids: list[list[int]] = self.sampled_token_ids.tolist()
        num_reqs = len(sampled_token_ids)
+        num_sampled_tokens = self.num_sampled_tokens_np.tolist()
        for i in range(num_reqs):
-            del sampled_token_ids[i][num_sampled_tokens_np[i] :]
+            del sampled_token_ids[i][num_sampled_tokens[i] :]
        self.model_runner_output.sampled_token_ids = sampled_token_ids

+        if self.num_nans is not None:
+            num_nans = self.num_nans.tolist()
+            self.model_runner_output.num_nans_in_logits = {
+                req_id: num_nans[i]
+                for i, req_id in enumerate(self.model_runner_output.req_ids)
+            }
+
        if self.logprobs_tensors is not None:
            self.model_runner_output.logprobs = self.logprobs_tensors.tolists()
        self.model_runner_output.prompt_logprobs_dict = self.prompt_logprobs_dict
@@ -95,3 +92,7 @@ def async_barrier(event: torch.cuda.Event | None):
    finally:
        if event is not None:
            event.record()
+
+
+def async_copy_to_np(x: torch.Tensor) -> np.ndarray:
+    return x.to("cpu", non_blocking=True).numpy()