fix: revert cast to cpu in MsgpackEncoder._encode_tensor to avoid hidden performance regressions (#25738)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
@@ -278,6 +278,11 @@ class InputPreprocessor:
             raise ValueError(
                 "prompt_embeds must be of shape (seq_len, hidden_size).")
 
+        # Tensors must be on CPU for serialization between processes
+        # in the MsgpackEncoder. Casting to CPU here ensures that there is no
+        # hidden device transfer in the critical path of generation.
+        prompt_embeds = prompt_embeds.cpu()
+
         return embeds_inputs(prompt_embeds=prompt_embeds,
                              cache_salt=parsed_content.get("cache_salt"))
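The rationale in the added comment can be illustrated with a minimal, stdlib-only sketch. The `FakeTensor` and `encode_for_ipc` names below are simplified stand-ins invented for this illustration, not vLLM's actual `torch.Tensor` or `MsgpackEncoder._encode_tensor`; the point is only that the device-to-host copy happens once during input preprocessing rather than hidden inside the encoder:

```python
import pickle
from dataclasses import dataclass


@dataclass
class FakeTensor:
    """Simplified stand-in for a tensor that tracks which device holds it."""
    data: list
    device: str = "cuda"

    def cpu(self) -> "FakeTensor":
        # In real torch, .cpu() copies device memory to host memory.
        return FakeTensor(self.data, device="cpu")


def encode_for_ipc(tensor: FakeTensor) -> bytes:
    """Stand-in for the serialization step: only host (CPU) tensors
    can be encoded for inter-process transport."""
    if tensor.device != "cpu":
        raise ValueError("tensor must be on CPU before serialization")
    return pickle.dumps(tensor.data)


# Cast once at preprocessing time, so the encoder never performs a
# hidden device transfer on the generation hot path.
prompt_embeds = FakeTensor([0.1, 0.2, 0.3]).cpu()
payload = encode_for_ipc(prompt_embeds)
```

Keeping the cast in the preprocessor (rather than inside the encoder, as before this change) makes the transfer explicit and keeps it out of the per-request serialization path.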