From 40b2f1c3d9c1dbcec185e8b6911fd273524f5b88 Mon Sep 17 00:00:00 2001
From: Nick Hill
Date: Thu, 19 Feb 2026 16:05:37 -0800
Subject: [PATCH] [Model Runner V2] Minor CPU optimizations (#34856)

Signed-off-by: Nick Hill
---
 .../device_communicators/shm_broadcast.py |  4 ++--
 vllm/v1/worker/gpu/async_utils.py         | 19 ++++++++++++++++---
 vllm/v1/worker/gpu/buffer_utils.py        |  8 +++-----
 vllm/v1/worker/gpu/model_runner.py        |  7 +++++++
 4 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py
index ef5f74c1e..ac46a5667 100644
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -513,8 +513,8 @@ class MessageQueue:
         assert self._is_local_reader, "Only readers can acquire read"
         start_time = time.monotonic()
         n_warning = 1
-        while True:
-            with self.buffer.get_metadata(self.current_idx) as metadata_buffer:
+        with self.buffer.get_metadata(self.current_idx) as metadata_buffer:
+            while True:
                 # Memory fence ensures we see the latest writes from the writer.
                 # Without this, we may read stale flags from our CPU cache
                 # and spin indefinitely even though writer has updated them.
diff --git a/vllm/v1/worker/gpu/async_utils.py b/vllm/v1/worker/gpu/async_utils.py
index afcfa8dfb..e628e38bd 100644
--- a/vllm/v1/worker/gpu/async_utils.py
+++ b/vllm/v1/worker/gpu/async_utils.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import contextlib
 
 import numpy as np
 import torch
@@ -14,6 +15,7 @@ class AsyncOutput(AsyncModelRunnerOutput):
         model_runner_output: ModelRunnerOutput,
         sampler_output: SamplerOutput,
         num_sampled_tokens: torch.Tensor,
+        main_stream: torch.cuda.Stream,
         copy_stream: torch.cuda.Stream,
         copy_event: torch.cuda.Event,
     ):
@@ -25,9 +27,8 @@ class AsyncOutput(AsyncModelRunnerOutput):
         self.num_sampled_tokens = num_sampled_tokens
         self.copy_event = copy_event
 
-        default_stream = torch.cuda.current_stream()
-        with torch.cuda.stream(copy_stream):
-            copy_stream.wait_stream(default_stream)
+        with stream(copy_stream, main_stream):
+            copy_stream.wait_stream(main_stream)
             self.sampled_token_ids = async_copy_to_np(sampler_output.sampled_token_ids)
 
             self.logprobs_tensors: LogprobsTensors | None = None
@@ -71,3 +72,15 @@
 
 def async_copy_to_np(x: torch.Tensor) -> np.ndarray:
     return x.to("cpu", non_blocking=True).numpy()
+
+
+@contextlib.contextmanager
+def stream(to_stream: torch.cuda.Stream, from_stream: torch.cuda.Stream):
+    """Lightweight version of torch.cuda.stream() context manager which
+    avoids current_stream and device lookups.
+    """
+    try:
+        torch.cuda.set_stream(to_stream)
+        yield
+    finally:
+        torch.cuda.set_stream(from_stream)
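
Illustration (not part of the patch): the async_utils.py change works because
the caller now passes in both streams. torch.cuda.stream() must call
torch.cuda.current_stream() (and resolve the device) on entry so it can
restore them on exit, whereas the new stream() helper is handed both streams
and reduces to two set_stream() calls. A minimal standalone sketch of the two
patterns, assuming a CUDA device is available:

    import torch

    main = torch.cuda.current_stream()  # known up front by the caller
    copy = torch.cuda.Stream()

    # Stock context manager: __enter__ looks up the current stream (and
    # device) so that __exit__ can restore them.
    with torch.cuda.stream(copy):
        pass  # work enqueued here is issued on `copy`

    # Patched pattern: no lookups, just two set_stream() calls, with
    # try/finally guaranteeing the main stream is restored.
    torch.cuda.set_stream(copy)
    try:
        pass  # work enqueued here is issued on `copy`
    finally:
        torch.cuda.set_stream(main)
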
+ """ + try: + torch.cuda.set_stream(to_stream) + yield + finally: + torch.cuda.set_stream(from_stream) diff --git a/vllm/v1/worker/gpu/buffer_utils.py b/vllm/v1/worker/gpu/buffer_utils.py index d2cb20186..ad910933a 100644 --- a/vllm/v1/worker/gpu/buffer_utils.py +++ b/vllm/v1/worker/gpu/buffer_utils.py @@ -22,7 +22,6 @@ def async_copy_to_gpu( if isinstance(x, np.ndarray): x = torch.from_numpy(x) assert x.is_cpu - assert not x.is_pinned() if out is None: assert device is not None @@ -30,6 +29,8 @@ def async_copy_to_gpu( # CPU-to-CPU copy tmp = x.pin_memory() + assert tmp is not x + # CPU-to-GPU copy return out.copy_(tmp, non_blocking=True) @@ -75,11 +76,8 @@ class UvaBufferPool: out: torch.Tensor | None = None, ) -> torch.Tensor: uva = self.copy_to_uva(x) - if out is None: - # CPU-to-GPU copy - return uva.clone() # CPU-to-GPU copy - return out.copy_(uva, non_blocking=True) + return uva.clone() if out is None else out.copy_(uva, non_blocking=True) class UvaBackedTensor: diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py index cbae001c2..57d258229 100644 --- a/vllm/v1/worker/gpu/model_runner.py +++ b/vllm/v1/worker/gpu/model_runner.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import functools import gc import time from copy import deepcopy @@ -239,6 +240,11 @@ class GPUModelRunner(LoRAModelRunnerMixin): def get_model(self) -> nn.Module: return self.model + @functools.cached_property + def main_stream(self) -> torch.cuda.Stream: + # Cache the default CUDA stream to avoid lookup overhead. + return torch.cuda.current_stream(self.device) + def get_kv_cache_spec(self): return get_kv_cache_spec(self.vllm_config) @@ -1065,6 +1071,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): model_runner_output=model_runner_output, sampler_output=sampler_output, num_sampled_tokens=num_sampled, + main_stream=self.main_stream, copy_stream=self.output_copy_stream, copy_event=self.output_copy_event, )