Signed-off-by: hao-aaron <ahao@anyscale.com> Signed-off-by: Aaron Hao <ahao@anyscale.com> Signed-off-by: ahao-anyscale <ahao@anyscale.com>
150 lines
4.7 KiB
Python
150 lines
4.7 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
"""
|
|
Demonstrates reinforcement learning from human feedback (RLHF) using vLLM and Ray,
|
|
with IPC-based weight syncing APIs
|
|
|
|
The script colocates the training and inference workloads onto the same GPU using Ray.
|
|
|
|
The example performs the following steps:
|
|
|
|
* Request a placement group of 1 GPU.
|
|
* Place the inference model on the above GPU using the placement group.
|
|
* Place and load the training model on the same GPU using the placement group.
|
|
* Generate text from a list of prompts using the inference engine.
|
|
* Broadcast the weights of the training model to the inference engine
  by using CUDA IPC handles. Note that for demonstration purposes no
  training step is run: the inference engine is created with dummy weights
  (load_format="dummy") and receives the trainer's pretrained weights as-is.
|
|
|
|
This example assumes a single-node cluster with a single GPU,
|
|
but can be extended to multiple GPUs.
|
|
"""
|
|
|
|
import os
|
|
|
|
import ray
|
|
from ray.util.placement_group import placement_group
|
|
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
|
|
from transformers import AutoModelForCausalLM
|
|
|
|
from vllm import LLM, SamplingParams
|
|
from vllm.config import WeightTransferConfig
|
|
from vllm.distributed.weight_transfer.ipc_engine import (
|
|
IPCTrainerSendWeightsArgs,
|
|
IPCWeightTransferEngine,
|
|
)
|
|
|
|
|
|
class MyLLM(LLM):
    """LLM subclass that prepares the process environment before engine start-up.

    All constructor arguments are forwarded unchanged to ``LLM.__init__``.
    """

    def __init__(self, *args, **kwargs):
        # Ray sets a top-level CUDA_VISIBLE_DEVICES; drop it so that vLLM can
        # manage its own device placement within the worker.
        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
        os.environ.update(
            {
                # 0.4 GPU per worker, so that two instances fit on the same GPU.
                "VLLM_RAY_PER_WORKER_GPUS": "0.4",
                "VLLM_RAY_BUNDLE_INDICES": "0",
                # Needed for IPC handle serialization.
                "VLLM_ALLOW_INSECURE_SERIALIZATION": "1",
            }
        )
        super().__init__(*args, **kwargs)
|
|
|
|
|
|
# Model identifier (OPT-125M) shared by both workloads: it is passed to
# AutoModelForCausalLM.from_pretrained on the training side and to the
# vLLM engine on the inference side. This line only names the model;
# nothing is loaded here.
MODEL_NAME = "facebook/opt-125m"
|
|
|
|
|
|
@ray.remote
class TrainModel:
    """Ray actor that holds the training-side model and sends its weights over IPC."""

    def __init__(self, llm_handle: ray.actor.ActorHandle):
        """Load the pretrained model onto ``cuda:0`` and keep the inference handle.

        Args:
            llm_handle: Handle of the actor that runs the inference engine.
        """
        model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
        model.to("cuda:0")
        self.train_model = model
        self.llm_handle = llm_handle

    def init_weight_transfer(self):
        """Initialize the weight transfer engine on the inference actor.

        Blocks until the remote call has completed.
        """
        # The IPC backend doesn't need initialization info, so the payload
        # carries an empty ``init_info`` mapping.
        request = {"init_info": {}}
        ray.get(self.llm_handle.init_weight_transfer_engine.remote(request))

    def broadcast_weights(self, llm_handle: ray.actor.ActorHandle):
        """Send the training model's named parameters to the inference engine via IPC.

        Args:
            llm_handle: Handle of the inference actor; it also replaces the
                handle stored on this instance.
        """
        self.llm_handle = llm_handle
        send_args = IPCTrainerSendWeightsArgs(mode="ray", llm_handle=llm_handle)
        IPCWeightTransferEngine.trainer_send_weights(
            iterator=self.train_model.named_parameters(),
            trainer_args=send_args,
        )
|
|
|
|
|
|
ray.init()

# Reserve a single bundle with one GPU and wait until it is ready; both
# actors below are scheduled into this placement group.
pg_colocate = placement_group([{"GPU": 1, "CPU": 0}])
ray.get(pg_colocate.ready())

# Inference side: wrap MyLLM as a Ray actor class placed in the group. The
# actor itself is declared with zero CPUs and zero GPUs.
llm_strategy = PlacementGroupSchedulingStrategy(
    placement_group=pg_colocate,
    placement_group_capture_child_tasks=True,
)
remote_llm_cls = ray.remote(
    num_cpus=0,
    num_gpus=0,
    scheduling_strategy=llm_strategy,
)(MyLLM)
llm = remote_llm_cls.remote(
    model=MODEL_NAME,
    enforce_eager=True,
    tensor_parallel_size=1,
    distributed_executor_backend="ray",
    gpu_memory_utilization=0.7,
    weight_transfer_config=WeightTransferConfig(backend="ipc"),
    # The engine starts from dummy weights.
    load_format="dummy",
)

# Training side: the trainer actor is placed in the same group and is given
# the inference actor's handle.
trainer_strategy = PlacementGroupSchedulingStrategy(
    placement_group=pg_colocate,
    placement_group_capture_child_tasks=True,
)
train_model = TrainModel.options(
    num_gpus=0.1,
    num_cpus=0,
    scheduling_strategy=trainer_strategy,
).remote(llm)
|
|
|
|
|
|
# Prompts fed to the inference engine.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

sampling_params = SamplingParams(temperature=0)

# Run generation on the inference actor and block until the results arrive.
outputs = ray.get(llm.generate.remote(prompts, sampling_params))

# Print every prompt/completion pair, framed by separator lines.
separator = "-" * 50
print(separator)
for request_output in outputs:
    prompt = request_output.prompt
    generated_text = request_output.outputs[0].text
    print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
    print(separator)
|
|
|
|
# Each remote call is awaited before the next one is issued, so the steps
# run strictly in this order.

# NOTE(review): presumably level=0 pauses the engine while its weights are
# replaced — confirm against the vLLM sleep-mode documentation.
sleep_ref = llm.sleep.remote(level=0)
ray.get(sleep_ref)

init_ref = train_model.init_weight_transfer.remote()
ray.get(init_ref)

# Send the trainer's weights to the inference engine over IPC.
broadcast_ref = train_model.broadcast_weights.remote(llm)
ray.get(broadcast_ref)

# NOTE(review): the "scheduling" tag presumably resumes request scheduling
# after the sync — confirm against the vLLM wake_up documentation.
wake_ref = llm.wake_up.remote(tags=["scheduling"])
ray.get(wake_ref)
|
|
|
|
# Run the same prompts again, now that the weights have been synchronized.
outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params))

# Print every prompt/completion pair, framed by separator lines.
divider = "-" * 50
print(divider)
for updated_output in outputs_updated:
    prompt = updated_output.prompt
    generated_text = updated_output.outputs[0].text
    print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
    print(divider)
|