[V1][Sampler] Avoid an operation during temperature application (#13587)

2025-02-20 22:05:56 -08:00
parent a30c093502
commit 31aa045c11
4 changed files with 18 additions and 10 deletions
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -191,11 +191,13 @@ def bind_kv_cache(


 def copy_slice(from_tensor: torch.Tensor, to_tensor: torch.Tensor,
-               length: int) -> None:
+               length: int) -> torch.Tensor:
    """
    Copy the first length elements of a tensor into another tensor in a
    non-blocking manner.

    Used to copy pinned CPU tensor data to pre-allocated GPU tensors.
+
+    Returns to_tensor[:length], the slice that received the copied data.
    """
-    to_tensor[:length].copy_(from_tensor[:length], non_blocking=True)
+    return to_tensor[:length].copy_(from_tensor[:length], non_blocking=True)