[Model Runner V2] Use pinned memory for write_contents (#34222)

Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
This commit is contained in:
Woosuk Kwon
2026-02-10 08:55:22 -08:00
committed by GitHub
parent f84a2a8f31
commit a2443de5fa

View File

@@ -7,9 +7,11 @@ import numpy as np
import torch
from vllm.triton_utils import tl, triton
from vllm.utils.math_utils import next_power_of_2
from vllm.utils.platform_utils import is_uva_available
from vllm.utils.torch_utils import get_accelerator_view_from_cpu_tensor
from vllm.utils.torch_utils import (
async_tensor_h2d,
get_accelerator_view_from_cpu_tensor,
)
def async_copy_to_gpu(
@@ -117,6 +119,7 @@ class StagedWriteTensor:
)
self.num_rows = size if isinstance(size, int) else size[0]
self.dtype = dtype
self.device = device
self.max_concurrency = max_concurrency
if not uva_instead_of_gpu:
@@ -137,8 +140,6 @@ class StagedWriteTensor:
self.write_indices = new_buffer(self.num_rows, dtype=torch.int32)
self.write_starts = new_buffer(self.num_rows, dtype=torch.int32)
init_size = next_power_of_2(self.num_rows)
self.write_contents = new_buffer(init_size, dtype=dtype)
self.write_cu_lens = new_buffer(self.num_rows, dtype=torch.int32)
def stage_write(
@@ -170,21 +171,9 @@ class StagedWriteTensor:
cu_lens_uva = self.write_cu_lens.copy_to_uva(self._staged_write_cu_lens)
# Special handling for write_contents
diff_len = len(self._staged_write_contents)
assert isinstance(self.write_contents.size, int)
if diff_len > self.write_contents.size:
# Re-allocate a larger buffer for the write_contents
new_size = next_power_of_2(diff_len)
self.write_contents = UvaBufferPool(
new_size, dtype=self.dtype, max_concurrency=self.max_concurrency
)
# NOTE(woosuk): Replacing write_contents releases the previous buffer, so
# we synchronize right after the reallocation to ensure that all data
# transfers involving the old buffer have finished before the new buffer
# is used. This prevents potential race conditions. The slight overhead is
# negligible because reallocations are infrequent in practice.
torch.cuda.synchronize()
contents_uva = self.write_contents.copy_to_uva(self._staged_write_contents)
write_contents = async_tensor_h2d(
self._staged_write_contents, self.dtype, self.device, pin_memory=True
)
# Write diffs to the GPU buffer
_apply_write_kernel[(n,)](
@@ -192,7 +181,7 @@ class StagedWriteTensor:
self.gpu.stride(0),
indices_uva,
starts_uva,
contents_uva,
write_contents,
cu_lens_uva,
BLOCK_SIZE=1024,
)