Optimize model execution with CUDA graph (#1926)

Co-authored-by: Chen Shen <scv119@gmail.com>
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
This commit is contained in:
Woosuk Kwon
2023-12-16 21:12:08 -08:00
committed by GitHub
parent eed74a558f
commit 37ca558103
34 changed files with 557 additions and 254 deletions

View File

@@ -17,7 +17,7 @@ from vllm.sequence import (SamplerOutput, Sequence, SequenceGroup,
SequenceOutput, SequenceStatus)
from vllm.transformers_utils.tokenizer import (detokenize_incrementally,
get_tokenizer)
from vllm.utils import Counter
from vllm.utils import Counter, get_open_port
if ray:
from ray.air.util.torch_dist import init_torch_dist_process_group
@@ -84,6 +84,7 @@ class LLMEngine:
f"load_format={model_config.load_format}, "
f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
f"quantization={model_config.quantization}, "
f"enforce_eager={model_config.enforce_eager}, "
f"seed={model_config.seed})")
# TODO(woosuk): Print more configs in debug mode.
@@ -189,6 +190,7 @@ class LLMEngine:
))
self._run_workers(
"init_model",
cupy_port=get_open_port(),
get_all_outputs=True,
)
self._run_workers(
@@ -232,6 +234,9 @@ class LLMEngine:
# Initialize the cache.
self._run_workers("init_cache_engine", cache_config=self.cache_config)
# Warm up the model. This includes capturing the model into a CUDA graph
# if enforce_eager is False.
self._run_workers("warm_up_model")
@classmethod
def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine":