[Core] Optimize SPMD architecture with delta + serialization optimization (#7109)
This commit is contained in:
27
vllm/executor/msgspec_utils.py
Normal file
27
vllm/executor/msgspec_utils.py
Normal file
@@ -0,0 +1,27 @@
|
||||
from array import array
|
||||
from typing import Any, Type
|
||||
|
||||
from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE
|
||||
|
||||
|
||||
def encode_hook(obj: Any) -> Any:
    """Custom msgspec enc hook that supports array types.

    Passed as ``enc_hook`` to ``msgspec.msgpack.Encoder``. ``array``
    instances are converted to their raw bytes so they can be embedded in
    the msgpack payload.

    See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder

    Args:
        obj: The object handed to the hook by the encoder.

    Returns:
        ``obj.tobytes()`` when ``obj`` is an ``array`` with the expected
        type code.

    Raises:
        NotImplementedError: If ``obj`` is not an ``array``. This is the
            signal msgspec documents for "type not supported by this hook".
        ValueError: If ``obj`` is an ``array`` whose type code differs from
            ``VLLM_TOKEN_ID_ARRAY_TYPE``.
    """
    if not isinstance(obj, array):
        # Falling off the end would return None, which the encoder would
        # then write out as nil and silently drop the real value.
        raise NotImplementedError(
            f"Objects of type {type(obj).__name__} are not supported by "
            "encode_hook.")

    # Explicit raise instead of `assert`: the check must survive `python -O`,
    # because the decoding side rebuilds the array assuming this type code.
    if obj.typecode != VLLM_TOKEN_ID_ARRAY_TYPE:
        raise ValueError(
            f"vLLM array type should use '{VLLM_TOKEN_ID_ARRAY_TYPE}' type. "
            f"Given array has a type code of {obj.typecode}.")

    return obj.tobytes()
|
||||
|
||||
|
||||
def decode_hook(type: Type, obj: Any) -> Any:
    """Custom msgspec dec hook that supports array types.

    Passed as ``dec_hook`` to ``msgspec.msgpack.Decoder``. Rebuilds an
    ``array`` of type code ``VLLM_TOKEN_ID_ARRAY_TYPE`` from the raw bytes
    produced by the matching encode hook.

    See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Decoder

    Args:
        type: The type the decoder expects for this value. The name shadows
            the builtin but is kept as-is because it is part of the hook's
            signature.
        obj: The decoded basic value; for ``array`` this is the bytes
            payload passed to ``array.frombytes``.

    Returns:
        A new ``array`` filled from ``obj`` when ``type`` is ``array``.

    Raises:
        NotImplementedError: If ``type`` is anything other than ``array``.
            This is the signal msgspec documents for "type not supported by
            this hook", replacing an implicit ``None`` return.
    """
    if type is not array:
        raise NotImplementedError(
            f"Objects of type {type!r} are not supported by decode_hook.")

    deserialized = array(VLLM_TOKEN_ID_ARRAY_TYPE)
    deserialized.frombytes(obj)
    return deserialized
|
||||
@@ -4,9 +4,12 @@ from collections import defaultdict
|
||||
from itertools import islice, repeat
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
|
||||
|
||||
import msgspec
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.executor.distributed_gpu_executor import ( # yapf: disable
|
||||
DistributedGPUExecutor, DistributedGPUExecutorAsync)
|
||||
from vllm.executor.msgspec_utils import encode_hook
|
||||
from vllm.executor.ray_utils import RayWorkerWrapper, ray
|
||||
from vllm.logger import init_logger
|
||||
from vllm.sequence import ExecuteModelRequest, SamplerOutput
|
||||
@@ -60,6 +63,10 @@ class RayGPUExecutor(DistributedGPUExecutor):
|
||||
# Create the parallel GPU workers.
|
||||
self._init_workers_ray(placement_group)
|
||||
|
||||
self.input_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
|
||||
self.output_decoder = msgspec.msgpack.Decoder(
|
||||
Optional[List[SamplerOutput]])
|
||||
|
||||
def shutdown(self) -> None:
|
||||
if hasattr(self, "forward_dag") and self.forward_dag is not None:
|
||||
self.forward_dag.teardown()
|
||||
@@ -123,6 +130,7 @@ class RayGPUExecutor(DistributedGPUExecutor):
|
||||
ray_remote_kwargs)
|
||||
|
||||
logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker)
|
||||
|
||||
# Create the workers.
|
||||
driver_ip = get_ip()
|
||||
worker_wrapper_kwargs = self._get_worker_wrapper_args()
|
||||
@@ -304,8 +312,10 @@ class RayGPUExecutor(DistributedGPUExecutor):
|
||||
if self.forward_dag is None:
|
||||
self.forward_dag = self._compiled_ray_dag(enable_asyncio=False)
|
||||
|
||||
outputs = ray.get(self.forward_dag.execute(execute_model_req))
|
||||
return outputs[0]
|
||||
serialized_data = self.input_encoder.encode(execute_model_req)
|
||||
outputs = ray.get(self.forward_dag.execute(serialized_data))
|
||||
output = self.output_decoder.decode(outputs[0])
|
||||
return output
|
||||
|
||||
def _run_workers(
|
||||
self,
|
||||
@@ -475,9 +485,10 @@ class RayGPUExecutorAsync(RayGPUExecutor, DistributedGPUExecutorAsync):
|
||||
if self.forward_dag is None:
|
||||
self.forward_dag = self._compiled_ray_dag(enable_asyncio=True)
|
||||
|
||||
dag_future = await self.forward_dag.execute_async(execute_model_req)
|
||||
serialized_data = self.input_encoder.encode(execute_model_req)
|
||||
dag_future = await self.forward_dag.execute_async(serialized_data)
|
||||
outputs = await dag_future
|
||||
return outputs[0]
|
||||
return self.output_decoder.decode(outputs[0])
|
||||
|
||||
async def _driver_execute_model_async(
|
||||
self,
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import msgspec
|
||||
|
||||
from vllm.config import ParallelConfig
|
||||
from vllm.executor.msgspec_utils import decode_hook, encode_hook
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sequence import ExecuteModelRequest, IntermediateTensors
|
||||
@@ -24,6 +27,10 @@ try:
|
||||
# that thread.
|
||||
self.compiled_dag_cuda_device_set = False
|
||||
|
||||
self.input_decoder = msgspec.msgpack.Decoder(ExecuteModelRequest,
|
||||
dec_hook=decode_hook)
|
||||
self.output_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
|
||||
|
||||
def get_node_ip(self) -> str:
|
||||
return get_ip()
|
||||
|
||||
@@ -33,16 +40,26 @@ try:
|
||||
return node_id, gpu_ids
|
||||
|
||||
def execute_model_spmd(
|
||||
self, req_or_tuple: Union[ExecuteModelRequest,
|
||||
Tuple[ExecuteModelRequest,
|
||||
IntermediateTensors]]):
|
||||
self, req_or_tuple: Union[bytes,
|
||||
Tuple[bytes,
|
||||
Optional[IntermediateTensors]]]
|
||||
) -> bytes:
|
||||
"""Execute model in SPMD fashion: used only when SPMD worker and
|
||||
compiled DAG are both enabled.
|
||||
|
||||
Args:
|
||||
req_or_tuple: The request to execute the model, or a tuple
|
||||
containing the request and intermediate tensors.
|
||||
req_or_tuple: A request or a tuple containing the
|
||||
request and intermediate tensors. Intermediate tensors are
|
||||
None unless they are provided, which happens only for a > 0 pipeline
|
||||
stage. The request is serialized by msgspec.
|
||||
"""
|
||||
if isinstance(req_or_tuple, bytes):
|
||||
serialized_req, intermediate_tensors = req_or_tuple, None
|
||||
else:
|
||||
serialized_req, intermediate_tensors = req_or_tuple
|
||||
|
||||
execute_model_req = self.input_decoder.decode(serialized_req)
|
||||
|
||||
# TODO(swang): This is needed right now because Ray aDAG executes
|
||||
# on a background thread, so we need to reset torch's current
|
||||
# device.
|
||||
@@ -51,16 +68,14 @@ try:
|
||||
torch.cuda.set_device(self.worker.device)
|
||||
self.compiled_dag_cuda_device_set = True
|
||||
|
||||
if isinstance(req_or_tuple, tuple):
|
||||
execute_model_req, intermediate_tensors = req_or_tuple
|
||||
else:
|
||||
execute_model_req = req_or_tuple
|
||||
intermediate_tensors = None
|
||||
|
||||
output = self.worker._execute_model_spmd(execute_model_req,
|
||||
intermediate_tensors)
|
||||
# Pipeline model request and output to the next pipeline stage.
|
||||
if isinstance(output, IntermediateTensors):
|
||||
return execute_model_req, output
|
||||
output = serialized_req, output
|
||||
else:
|
||||
output = self.output_encoder.encode(output)
|
||||
|
||||
return output
|
||||
|
||||
ray_import_err = None
|
||||
|
||||
Reference in New Issue
Block a user