# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Demonstrates reinforcement learning from human feedback (RLHF) using vLLM
via its HTTP API, with IPC-based weight-syncing APIs.

Unlike rlhf_nccl.py, which uses NCCL and can therefore place the models on
separate GPUs, this script uses CUDA IPC, which requires the training model
and the vLLM server to share a GPU. Memory must be carefully managed to fit
both models.

Unlike rlhf.py, which creates a vLLM instance programmatically, this script
assumes you have already started a vLLM server using `vllm serve`. It uses:

- the OpenAI-compatible API for inference requests
- HTTP endpoints as the control plane for weight transfer
- CUDA IPC as the data plane for the actual weight transfer

Prerequisites:
    Start a vLLM server with weight transfer enabled and reduced GPU memory
    utilization to leave room for the training model (VLLM_SERVER_DEV_MODE=1
    exposes the development endpoints used below):

    $ VLLM_SERVER_DEV_MODE=1 VLLM_ALLOW_INSECURE_SERIALIZATION=1 \
        vllm serve facebook/opt-125m --enforce-eager \
        --weight-transfer-config '{"backend": "ipc"}' \
        --load-format dummy \
        --gpu-memory-utilization 0.5

    Then run this script:

    $ python rlhf_http_ipc.py

The example performs the following steps:

* Load the training model on GPU 0 (the same GPU as the vLLM server).
* Generate text using the vLLM server via the OpenAI-compatible API. The
  output is expected to be nonsense because the server is initialized with
  dummy weights.
* Initialize weight transfer via an HTTP endpoint (a no-op for IPC).
* Pause generation, broadcast the real weights from the training model to
  the vLLM server using CUDA IPC handles, then resume generation.
* Generate text again to show normal output after the weight update.
"""

import os

import requests
import torch
from openai import OpenAI
from transformers import AutoModelForCausalLM

from vllm.distributed.weight_transfer.ipc_engine import (
    IPCTrainerSendWeightsArgs,
    IPCWeightTransferEngine,
)

BASE_URL = "http://localhost:8000"
MODEL_NAME = "facebook/opt-125m"

# Enable insecure serialization, which is required to serialize IPC handles
os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] = "1"


def generate_completions(client: OpenAI, model: str, prompts: list[str]) -> list[str]:
    """Generate completions using the OpenAI-compatible API."""
    results = []
    for prompt in prompts:
        response = client.completions.create(
            model=model,
            prompt=prompt,
            max_tokens=32,
            temperature=0,
        )
        results.append(response.choices[0].text)
    return results


def init_weight_transfer_engine(base_url: str) -> None:
    """Initialize weight transfer via HTTP endpoint (a no-op for IPC)."""
    url = f"{base_url}/init_weight_transfer_engine"
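    # The IPC backend has nothing to negotiate up front, so init_info is
    # empty; a rendezvous-style backend would presumably pass its setup
    # details here.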
payload = {"init_info": dict()}
|
|
response = requests.post(url, json=payload, timeout=60)
|
|
response.raise_for_status()
|
|
|
|
|
|


def pause_generation(base_url: str) -> None:
    """Pause generation via HTTP endpoint."""
    url = f"{base_url}/pause"
    response = requests.post(url, timeout=60)
    response.raise_for_status()


def resume_generation(base_url: str) -> None:
    """Resume generation via HTTP endpoint."""
    url = f"{base_url}/resume"
    response = requests.post(url, timeout=60)
    response.raise_for_status()
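

# Not used in the flow below, but useful when a transfer backend needs to
# know how many ranks the server is running.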
def get_world_size(base_url: str) -> int:
    """Get world size from the vLLM server."""
    url = f"{base_url}/get_world_size"
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response.json()["world_size"]


def main():
    # IPC requires the training model to be on the same GPU as the vLLM
    # server. The server should be started on GPU 0 with reduced memory
    # utilization.
    device = "cuda:0"
    torch.accelerator.set_device_index(device)

    # Load the training model on the same GPU as the server.
    # Use bfloat16 to reduce the memory footprint.
    print(f"Loading training model: {MODEL_NAME} on {device}")
    print(
        "Note: Ensure the vLLM server was started with --gpu-memory-utilization 0.5 "
        "or lower to leave room for the training model."
    )
    train_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.bfloat16)
    train_model.to(device)
    train_model.eval()  # Set to eval mode to save memory
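    # Rough memory math: opt-125m in bfloat16 is ~0.25 GiB of weights, so it
    # fits comfortably in the half of the GPU that --gpu-memory-utilization
    # 0.5 leaves free.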

    # Create OpenAI client pointing to the vLLM server
    client = OpenAI(
        base_url=f"{BASE_URL}/v1",
        api_key="EMPTY",  # vLLM doesn't require an API key by default
    )

    # Test prompts
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    # Generate text before weight update. The output is expected to be
    # nonsense because the server is initialized with dummy weights.
    print("-" * 50)
    print("Generating text BEFORE weight update (expect nonsense):")
    print("-" * 50)
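    # temperature=0 requests greedy decoding, so these outputs are
    # deterministic and directly comparable with the post-update run.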
    outputs = generate_completions(client, MODEL_NAME, prompts)
    for prompt, generated_text in zip(prompts, outputs):
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
    print("-" * 50)
print("Initializing weight transfer (IPC backend)...")
|
|
|
|
# Initialize weight transfer on vLLM server (no-op for IPC, but still required)
|
|
init_weight_transfer_engine(BASE_URL)
|
|
|
|
# Pause generation before weight sync
|
|
pause_generation(BASE_URL)
|
|
|
|
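    # Pausing ensures the server is not decoding requests while its weights
    # are overwritten, which could otherwise mix old and new parameters
    # within a single generation.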

    # Broadcast weights via IPC handles using HTTP mode
    print("Broadcasting weights via CUDA IPC (HTTP)...")
    trainer_args = IPCTrainerSendWeightsArgs(mode="http", url=BASE_URL)
    IPCWeightTransferEngine.trainer_send_weights(
        iterator=train_model.named_parameters(),
        trainer_args=trainer_args,
    )
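    # Under the hood, CUDA IPC exports a handle to each parameter's device
    # memory; a process on the same GPU can map that handle and copy the
    # data directly, which is why the trainer and server must share a GPU.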

    # Resume generation after weight sync
    resume_generation(BASE_URL)

    # Generate text after weight update. The output is expected to be normal
    # because the real weights are now loaded.
    print("-" * 50)
    print("Generating text AFTER weight update:")
    print("-" * 50)
    outputs_updated = generate_completions(client, MODEL_NAME, prompts)
    for prompt, generated_text in zip(prompts, outputs_updated):
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
    print("-" * 50)

    # Note: The training model and IPC handles remain in memory. In a real
    # RLHF training loop, you would update the training model and create
    # new IPC handles for each weight update, as sketched below.
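    #
    # A rough sketch (train_one_step is a hypothetical stand-in for your
    # optimizer update; the sync calls are the same ones used above):
    #
    #     for step in range(num_steps):
    #         train_one_step(train_model)
    #         pause_generation(BASE_URL)
    #         IPCWeightTransferEngine.trainer_send_weights(
    #             iterator=train_model.named_parameters(),
    #             trainer_args=trainer_args,
    #         )
    #         resume_generation(BASE_URL)
    #         # ...sample fresh completions from the server for the next
    #         # training batch...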


if __name__ == "__main__":
    main()