[V1][core] Implement pipeline parallel on Ray (#12996)

This commit is contained in:
Rui Qiao
2025-02-13 00:02:46 -08:00
committed by GitHub
parent 0ccd8769fb
commit 9605c1256e
7 changed files with 110 additions and 45 deletions

View File

@@ -35,7 +35,7 @@ try:
class RayWorkerWrapper(WorkerWrapperBase):
"""Ray wrapper for vllm.worker.Worker, allowing Worker to be
lazliy initialized after Ray sets CUDA_VISIBLE_DEVICES."""
lazily initialized after Ray sets CUDA_VISIBLE_DEVICES."""
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
@@ -118,7 +118,14 @@ try:
) -> "ModelRunnerOutput":
self.setup_device_if_necessary()
assert self.worker is not None, "Worker is not initialized"
output = self.worker.model_runner.execute_model(scheduler_output)
if isinstance(scheduler_output, tuple):
scheduler_output, intermediate_tensors = scheduler_output
else:
scheduler_output, intermediate_tensors = scheduler_output, None
output = self.worker.model_runner.execute_model(
scheduler_output, intermediate_tensors)
if isinstance(output, IntermediateTensors):
output = scheduler_output, output
return output
def override_env_vars(self, vars: Dict[str, str]):