[Model Runner V2] Use pinned memory for write_contents (#34222)

Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-02-10 08:55:22 -08:00
parent f84a2a8f31
commit a2443de5fa
1 changed files with 9 additions and 20 deletions
--- a/vllm/v1/worker/gpu/buffer_utils.py
+++ b/vllm/v1/worker/gpu/buffer_utils.py
@@ -7,9 +7,11 @@ import numpy as np
 import torch

 from vllm.triton_utils import tl, triton
-from vllm.utils.math_utils import next_power_of_2
 from vllm.utils.platform_utils import is_uva_available
-from vllm.utils.torch_utils import get_accelerator_view_from_cpu_tensor
+from vllm.utils.torch_utils import (
+    async_tensor_h2d,
+    get_accelerator_view_from_cpu_tensor,
+)


 def async_copy_to_gpu(
@@ -117,6 +119,7 @@ class StagedWriteTensor:
            )
        self.num_rows = size if isinstance(size, int) else size[0]
        self.dtype = dtype
+        self.device = device
        self.max_concurrency = max_concurrency

        if not uva_instead_of_gpu:
@@ -137,8 +140,6 @@ class StagedWriteTensor:

        self.write_indices = new_buffer(self.num_rows, dtype=torch.int32)
        self.write_starts = new_buffer(self.num_rows, dtype=torch.int32)
-        init_size = next_power_of_2(self.num_rows)
-        self.write_contents = new_buffer(init_size, dtype=dtype)
        self.write_cu_lens = new_buffer(self.num_rows, dtype=torch.int32)

    def stage_write(
@@ -170,21 +171,9 @@ class StagedWriteTensor:
        cu_lens_uva = self.write_cu_lens.copy_to_uva(self._staged_write_cu_lens)

        # Special handling for write_contents
-        diff_len = len(self._staged_write_contents)
-        assert isinstance(self.write_contents.size, int)
-        if diff_len > self.write_contents.size:
-            # Re-allocate a larger buffer for the write_contents
-            new_size = next_power_of_2(diff_len)
-            self.write_contents = UvaBufferPool(
-                new_size, dtype=self.dtype, max_concurrency=self.max_concurrency
-            )
-            # NOTE(woosuk): Since the previous write_contents buffer is released,
-            # we perform a synchronization here to ensure that all data transfers
-            # involving the old buffer have finished before allocating a new one.
-            # This prevents potential race conditions. The slight overhead is
-            # negligible because the reallocations are infrequent in practice.
-            torch.cuda.synchronize()
-        contents_uva = self.write_contents.copy_to_uva(self._staged_write_contents)
+        write_contents = async_tensor_h2d(
+            self._staged_write_contents, self.dtype, self.device, pin_memory=True
+        )

        # Write diffs to the GPU buffer
        _apply_write_kernel[(n,)](
@@ -192,7 +181,7 @@ class StagedWriteTensor:
            self.gpu.stride(0),
            indices_uva,
            starts_uva,
-            contents_uva,
+            write_contents,
            cu_lens_uva,
            BLOCK_SIZE=1024,
        )