Optimize model execution with CUDA graph (#1926)

Co-authored-by: Chen Shen <scv119@gmail.com>
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
2023-12-16 21:12:08 -08:00
parent eed74a558f
commit 37ca558103
34 changed files with 557 additions and 254 deletions
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -17,7 +17,7 @@ from vllm.sequence import (SamplerOutput, Sequence, SequenceGroup,
                           SequenceOutput, SequenceStatus)
 from vllm.transformers_utils.tokenizer import (detokenize_incrementally,
                                               get_tokenizer)
-from vllm.utils import Counter
+from vllm.utils import Counter, get_open_port

 if ray:
    from ray.air.util.torch_dist import init_torch_dist_process_group
@@ -84,6 +84,7 @@ class LLMEngine:
            f"load_format={model_config.load_format}, "
            f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
            f"quantization={model_config.quantization}, "
+            f"enforce_eager={model_config.enforce_eager}, "
            f"seed={model_config.seed})")
        # TODO(woosuk): Print more configs in debug mode.

@@ -189,6 +190,7 @@ class LLMEngine:
                          ))
        self._run_workers(
            "init_model",
+            cupy_port=get_open_port(),
            get_all_outputs=True,
        )
        self._run_workers(
@@ -232,6 +234,9 @@ class LLMEngine:

        # Initialize the cache.
        self._run_workers("init_cache_engine", cache_config=self.cache_config)
+        # Warm up the model. This includes capturing the model into CUDA graph
+        # if enforce_eager is False.
+        self._run_workers("warm_up_model")

    @classmethod
    def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine":