[V1][core] Implement pipeline parallel on Ray (#12996)
This commit is contained in:
@@ -16,7 +16,7 @@ from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.config import (
|
||||
maybe_register_config_serialize_by_value)
|
||||
from vllm.utils import get_exception_traceback, zmq_socket_ctx
|
||||
from vllm.v1.core.kv_cache_utils import get_kv_cache_config
|
||||
from vllm.v1.core.kv_cache_utils import get_kv_cache_configs
|
||||
from vllm.v1.core.scheduler import Scheduler
|
||||
from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest,
|
||||
EngineCoreRequestType)
|
||||
@@ -73,20 +73,25 @@ class EngineCore:
|
||||
start = time.time()
|
||||
|
||||
# Get all kv cache needed by the model
|
||||
kv_cache_spec = self.model_executor.get_kv_cache_spec()
|
||||
kv_cache_specs = self.model_executor.get_kv_cache_specs()
|
||||
|
||||
# Profiles the peak memory usage of the model to determine how much
|
||||
# memory can be allocated for kv cache.
|
||||
availble_gpu_memory = self.model_executor.determine_available_memory()
|
||||
available_gpu_memory = self.model_executor.determine_available_memory()
|
||||
|
||||
# Get the kv cache tensor size
|
||||
kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
|
||||
availble_gpu_memory)
|
||||
num_gpu_blocks = kv_cache_config.num_blocks
|
||||
kv_cache_configs = get_kv_cache_configs(vllm_config, kv_cache_specs,
|
||||
available_gpu_memory)
|
||||
num_gpu_blocks_set = set(config.num_blocks
|
||||
for config in kv_cache_configs)
|
||||
assert len(num_gpu_blocks_set) == 1, (
|
||||
f"num_gpu_blocks need to be the same across workers, "
|
||||
f"but they are different: {num_gpu_blocks_set}")
|
||||
num_gpu_blocks = num_gpu_blocks_set.pop()
|
||||
num_cpu_blocks = 0
|
||||
|
||||
# Initialize kv cache and warmup the execution
|
||||
self.model_executor.initialize(kv_cache_config)
|
||||
self.model_executor.initialize(kv_cache_configs)
|
||||
|
||||
elapsed = time.time() - start
|
||||
logger.info(("init engine (profile, create kv cache, "
|
||||
|
||||
Reference in New Issue
Block a user