[V1][Sampler] Avoid an operation during temperature application (#13587)

This commit is contained in:
Nick Hill
2025-02-20 22:05:56 -08:00
committed by GitHub
parent a30c093502
commit 31aa045c11
4 changed files with 18 additions and 10 deletions

View File

@@ -191,11 +191,13 @@ def bind_kv_cache(
def copy_slice(from_tensor: torch.Tensor, to_tensor: torch.Tensor,
               length: int) -> torch.Tensor:
    """
    Copy the first length elements of a tensor into another tensor in a
    non-blocking manner.

    Used to copy pinned CPU tensor data to pre-allocated GPU tensors.

    Args:
        from_tensor: Source tensor; its first ``length`` entries along
            dimension 0 are read.
        to_tensor: Destination tensor; its first ``length`` entries along
            dimension 0 are overwritten in place.
        length: Number of leading entries to copy.

    Returns:
        The sliced target tensor ``to_tensor[:length]``. It is a view that
        shares storage with ``to_tensor``, so callers can keep operating on
        the freshly copied region without slicing again.

    NOTE(review): because the copy is issued with ``non_blocking=True``, it
    may still be in flight when this function returns (e.g. for a pinned
    CPU -> GPU transfer) -- confirm callers only consume the result in a
    stream-ordered way.
    """
    # Tensor.copy_ returns the tensor it was called on, so a single call both
    # performs the copy and yields the destination slice.
    return to_tensor[:length].copy_(from_tensor[:length], non_blocking=True)