[V1] Support LLM.apply_model (#18465)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-20 15:14:35 +08:00
parent be874c0201
commit 3d9a1d2de5
17 changed files with 194 additions and 169 deletions
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -522,9 +522,14 @@ class LLM:
        """
        Run a function directly on the model inside each worker,
        returning the result for each of them.
+
+        !!! warning
+            To reduce the overhead of data transfer, avoid returning large
+            arrays or tensors from this method. If you must return them,
+            make sure you move them to CPU first to avoid taking up additional
+            VRAM!
        """
-        executor = self.llm_engine.model_executor
-        return executor.apply_model(func)
+        return self.llm_engine.apply_model(func)

    def _get_beam_search_lora_requests(
        self,