Optimize model execution with CUDA graph (#1926)
Co-authored-by: Chen Shen <scv119@gmail.com>
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
This commit is contained in:
@@ -17,7 +17,7 @@ from vllm.sequence import (SamplerOutput, Sequence, SequenceGroup,
|
||||
SequenceOutput, SequenceStatus)
|
||||
from vllm.transformers_utils.tokenizer import (detokenize_incrementally,
|
||||
get_tokenizer)
|
||||
from vllm.utils import Counter
|
||||
from vllm.utils import Counter, get_open_port
|
||||
|
||||
if ray:
|
||||
from ray.air.util.torch_dist import init_torch_dist_process_group
|
||||
@@ -84,6 +84,7 @@ class LLMEngine:
|
||||
f"load_format={model_config.load_format}, "
|
||||
f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
|
||||
f"quantization={model_config.quantization}, "
|
||||
f"enforce_eager={model_config.enforce_eager}, "
|
||||
f"seed={model_config.seed})")
|
||||
# TODO(woosuk): Print more configs in debug mode.
|
||||
|
||||
@@ -189,6 +190,7 @@ class LLMEngine:
|
||||
))
|
||||
self._run_workers(
|
||||
"init_model",
|
||||
cupy_port=get_open_port(),
|
||||
get_all_outputs=True,
|
||||
)
|
||||
self._run_workers(
|
||||
@@ -232,6 +234,9 @@ class LLMEngine:
|
||||
|
||||
# Initialize the cache.
|
||||
self._run_workers("init_cache_engine", cache_config=self.cache_config)
|
||||
# Warm up the model. This includes capturing the model into CUDA graph
|
||||
# if enforce_eager is False.
|
||||
self._run_workers("warm_up_model")
|
||||
|
||||
@classmethod
|
||||
def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine":
|
||||
|
||||
Reference in New Issue
Block a user