# vllm/examples/offline_inference/new_weight_syncing/rlhf_ipc.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Demonstrates reinforcement learning from human feedback (RLHF) using vLLM and Ray,
with IPC-based weight syncing APIs

The script colocates the training and inference workloads onto the same GPU using Ray.

The example performs the following steps:

* Request a placement group of 1 GPU.
* Place the inference model on the above GPU using the placement group.
* Place and load the training model on the same GPU using the placement group.
* Generate text from a list of prompts using the inference engine.
* Broadcast the weights of the training model to the inference engine by
  using CUDA IPC handles. Note that for demonstration purposes no training
  step is run: the inference engine starts from dummy weights
  (`load_format="dummy"`) and receives the trainer's pretrained weights.

This example assumes a single-node cluster with a single GPU,
but can be extended to multiple GPUs.
"""

import os

import ray
from ray.util.placement_group import placement_group
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
from transformers import AutoModelForCausalLM

from vllm import LLM, SamplingParams
from vllm.config import WeightTransferConfig
from vllm.distributed.weight_transfer.ipc_engine import (
    IPCTrainerSendWeightsArgs,
    IPCWeightTransferEngine,
)


class MyLLM(LLM):
    """LLM subclass that prepares the environment for Ray placement groups.

    All environment changes are applied before the parent ``LLM`` constructor
    runs; positional and keyword arguments are forwarded to it unchanged.
    """

    def __init__(self, *args, **kwargs):
        # Ray sets CUDA_VISIBLE_DEVICES at the top level; drop it so that
        # vLLM can manage its own device placement within the worker.
        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
        os.environ.update(
            {
                # 0.4 GPU per worker so that two instances fit on one GPU.
                "VLLM_RAY_PER_WORKER_GPUS": "0.4",
                "VLLM_RAY_BUNDLE_INDICES": "0",
                # Required for IPC handle serialization.
                "VLLM_ALLOW_INSECURE_SERIALIZATION": "1",
            }
        )
        super().__init__(*args, **kwargs)


# Model identifier (OPT-125M) used by both workloads in this script: the
# TrainModel actor loads it onto "cuda:0" for the training workload, and the
# vLLM inference engine is created with the same name.

MODEL_NAME = "facebook/opt-125m"


@ray.remote
class TrainModel:
    """Ray actor that holds the training model and sends its weights via IPC."""

    def __init__(self, llm_handle: ray.actor.ActorHandle):
        """Load ``MODEL_NAME`` onto ``cuda:0`` and keep the inference handle.

        Args:
            llm_handle: Ray actor handle of the inference engine.
        """
        model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
        model.to("cuda:0")
        self.train_model = model
        self.llm_handle = llm_handle

    def init_weight_transfer(self):
        """Initialize the weight transfer engine on the inference actor.

        Blocks until the remote call has completed.
        """
        # The IPC backend doesn't need initialization info, so it stays empty.
        init_request = {"init_info": {}}
        pending = self.llm_handle.init_weight_transfer_engine.remote(init_request)
        ray.get(pending)

    def broadcast_weights(self, llm_handle: ray.actor.ActorHandle):
        """Send the training model's named parameters to the engine over IPC.

        Args:
            llm_handle: Ray actor handle of the inference engine; it also
                replaces the handle stored on this actor.
        """
        self.llm_handle = llm_handle
        IPCWeightTransferEngine.trainer_send_weights(
            trainer_args=IPCTrainerSendWeightsArgs(
                mode="ray",
                llm_handle=llm_handle,
            ),
            iterator=self.train_model.named_parameters(),
        )


ray.init()

# Reserve one bundle holding a single GPU (and no CPU); both the inference
# engine and the trainer are scheduled into this placement group.
colocate_bundles = [{"GPU": 1, "CPU": 0}]
pg_colocate = placement_group(colocate_bundles)
ray.get(pg_colocate.ready())  # Wait until the placement group is ready.


# Inference engine actor. The actor itself requests no CPU/GPU resources;
# child tasks it spawns are captured by the same placement group.
llm_strategy = PlacementGroupSchedulingStrategy(
    placement_group=pg_colocate,
    placement_group_capture_child_tasks=True,
)
RemoteLLM = ray.remote(
    num_cpus=0,
    num_gpus=0,
    scheduling_strategy=llm_strategy,
)(MyLLM)
llm = RemoteLLM.remote(
    model=MODEL_NAME,
    enforce_eager=True,
    tensor_parallel_size=1,
    distributed_executor_backend="ray",
    gpu_memory_utilization=0.7,
    weight_transfer_config=WeightTransferConfig(backend="ipc"),
    # The engine starts from dummy weights; weights are synced from the
    # trainer later in the script.
    load_format="dummy",
)

# Training actor, placed in the same placement group and given the handle of
# the inference engine.
trainer_strategy = PlacementGroupSchedulingStrategy(
    placement_group=pg_colocate,
    placement_group_capture_child_tasks=True,
)
ColocatedTrainModel = TrainModel.options(
    num_gpus=0.1,
    num_cpus=0,
    scheduling_strategy=trainer_strategy,
)
train_model = ColocatedTrainModel.remote(llm)


# Prompts used for both generation passes (before and after the weight sync).
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# The same sampling parameters are reused for both passes.
sampling_params = SamplingParams(temperature=0)

SEPARATOR = "-" * 50


def print_generations(request_outputs):
    """Print every prompt with its first completion, framed by separators."""
    print(SEPARATOR)
    for request_output in request_outputs:
        prompt = request_output.prompt
        generated_text = request_output.outputs[0].text
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
        print(SEPARATOR)


# First pass: generate with the weights the engine was created with.
outputs = ray.get(llm.generate.remote(prompts, sampling_params))
print_generations(outputs)

# Put the engine to sleep (level 0) before touching its weights.
# NOTE(review): level 0 presumably only pauses scheduling -- confirm against
# the LLM.sleep documentation.
ray.get(llm.sleep.remote(level=0))

ray.get(train_model.init_weight_transfer.remote())
# Synchronize the trainer's weights to the inference engine using batched API.
ray.get(train_model.broadcast_weights.remote(llm))

# Wake the engine up again, restricted to the "scheduling" tag.
ray.get(llm.wake_up.remote(tags=["scheduling"]))

# Second pass: generate text with the updated model.
outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params))
print_generations(outputs_updated)