[Core] Implement disagg prefill by StatelessProcessGroup (#10502)
This PR provides initial support for single-node disaggregated prefill in the 1P1D (one prefill instance, one decode instance) scenario.

Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
Co-authored-by: ApostaC <yihua98@uchicago.edu>
Co-authored-by: YaoJiayi <120040070@link.cuhk.edu.cn>
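In the 1P1D flow exercised by the tests below, a client first sends the request to the prefill instance (the kv_producer, which computes the KV cache and ships it over the KV connector) and then replays the same request against the decode instance (the kv_consumer). A minimal client-side sketch of that flow, assuming the two servers are already up on ports 8100 and 8200 exactly as in the test fixture below:

import requests

payload = {
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "prompt": "San Francisco is a",
    "temperature": 0,
}

# 1. Prefill: max_tokens=1, so the producer only computes the KV cache
#    (plus a single token) and transfers the cache to the consumer.
requests.post("http://localhost:8100/v1/completions",
              json={**payload, "max_tokens": 1})

# 2. Decode: the consumer reuses the transferred KV cache and generates.
out = requests.post("http://localhost:8200/v1/completions",
                    json={**payload, "max_tokens": 10})
print(out.json()["choices"][0]["text"])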
119
tests/kv_transfer/disagg_test.py
Normal file
@@ -0,0 +1,119 @@
import os
import subprocess
import sys
import time
from subprocess import Popen

import pytest
import requests
import torch


# Fixture to set up environment variables and teardown servers after tests
@pytest.fixture(scope="module", autouse=True)
def setup_servers():
    if torch.cuda.device_count() < 4:
        pytest.skip("Skipping test: fewer than 4 GPUs available")

    # Set up environment variables
    VLLM_HOST_IP = subprocess.check_output("hostname -I | awk '{print $1}'",
                                           shell=True).decode().strip()
    os.environ["VLLM_HOST_IP"] = VLLM_HOST_IP

    # Start prefill instance
    prefill_cmd = [
        sys.executable,
        "-m",
        "vllm.entrypoints.openai.api_server",
        "--model",
        "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "--port",
        "8100",
        "--gpu-memory-utilization",
        "0.5",
        "--max-model-len",
        "1000",
        "--kv-transfer-config",
        '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer",'
        '"kv_rank":0,"kv_parallel_size":2}',
    ]
    prefill_env = os.environ.copy()
    prefill_env["CUDA_VISIBLE_DEVICES"] = "0"
    prefill_proc = Popen(prefill_cmd, env=prefill_env)

    # Start decode instance
    decode_cmd = [
        sys.executable,
        "-m",
        "vllm.entrypoints.openai.api_server",
        "--model",
        "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "--port",
        "8200",
        "--gpu-memory-utilization",
        "0.5",
        "--max-model-len",
        "1000",
        "--kv-transfer-config",
        '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer",'
        '"kv_rank":1,"kv_parallel_size":2}',
    ]
    decode_env = os.environ.copy()
    decode_env["CUDA_VISIBLE_DEVICES"] = "1"
    decode_proc = Popen(decode_cmd, env=decode_env)

    # Wait for servers to be ready
    assert wait_for_server(8100), "Prefill server did not start in time"
    assert wait_for_server(8200), "Decode server did not start in time"

    # Yield to the test function and handle teardown after tests
    yield

    # Cleanup: kill the processes
    prefill_proc.terminate()
    decode_proc.terminate()

    # Additional cleanup if needed
    prefill_proc.wait()
    decode_proc.wait()


# Helper function to wait for server
def wait_for_server(port, timeout=240):
    start_time = time.time()
    while time.time() - start_time < timeout:
        try:
            response = requests.get(f"http://localhost:{port}/v1/completions")
            if response.status_code in [200, 405]:
                return True
        except requests.ConnectionError:
            time.sleep(1)
    return False

# Test function to send HTTP requests and validate responses
@pytest.mark.parametrize("prompt", ["San Francisco is a", "Santa Clara is a"])
def test_disaggregated_prefilling(prompt):
    # Send to prefill: max_tokens=1, the producer only builds the KV cache
    response = requests.post("http://localhost:8100/v1/completions",
                             headers={"Content-Type": "application/json"},
                             json={
                                 "model":
                                 "meta-llama/Meta-Llama-3.1-8B-Instruct",
                                 "prompt": prompt,
                                 "max_tokens": 1,
                                 "temperature": 0
                             })
    assert response.status_code == 200

    # Send to decode: the consumer reuses the transferred KV cache
    response = requests.post("http://localhost:8200/v1/completions",
                             headers={"Content-Type": "application/json"},
                             json={
                                 "model":
                                 "meta-llama/Meta-Llama-3.1-8B-Instruct",
                                 "prompt": prompt,
                                 "max_tokens": 10,
                                 "temperature": 0
                             })
    assert response.status_code == 200
64
tests/kv_transfer/module_test.py
Normal file
@@ -0,0 +1,64 @@
import os
import subprocess
import sys

import pytest
import torch


def run_python_script(script_name, timeout):
    script_name = f'kv_transfer/{script_name}'
    try:
        # Start both ranks asynchronously using Popen
        process0 = subprocess.Popen(
            [sys.executable, script_name],
            # Inherit the parent environment (PATH, CUDA settings, ...)
            # and set RANK for process 0
            env={**os.environ, "RANK": "0"},
            stdout=sys.stdout,  # Forward stdout to the current stdout
            stderr=sys.stderr,  # Forward stderr to the current stderr
        )

        process1 = subprocess.Popen(
            [sys.executable, script_name],
            # Inherit the parent environment and set RANK for process 1
            env={**os.environ, "RANK": "1"},
            stdout=sys.stdout,  # Forward stdout to the current stdout
            stderr=sys.stderr,  # Forward stderr to the current stderr
        )

        # Wait for both processes to complete, with a timeout
        process0.wait(timeout=timeout)
        process1.wait(timeout=timeout)

        # Check the return status of both processes
        if process0.returncode != 0:
            pytest.fail(
                f"Test {script_name} failed for RANK=0, {process0.returncode}")
        if process1.returncode != 0:
            pytest.fail(
                f"Test {script_name} failed for RANK=1, {process1.returncode}")

    except subprocess.TimeoutExpired:
        # If either process times out, terminate both and fail the test
        process0.terminate()
        process1.terminate()
        pytest.fail(f"Test {script_name} timed out")
    except Exception as e:
        pytest.fail(f"Test {script_name} failed with error: {str(e)}")


# Define the test cases using pytest's parametrize
@pytest.mark.parametrize(
    "script_name,timeout",
    [
        ("test_lookup_buffer.py", 60),  # lookup-buffer test, 60 s timeout
        ("test_send_recv.py", 120)  # send/recv test, 120 s timeout
    ])
def test_run_python_script(script_name, timeout):
    # Check the number of GPUs
    if torch.cuda.device_count() < 2:
        pytest.skip(
            f"Skipping test {script_name} because <2 GPUs are available")

    # Run the test if there are at least 2 GPUs
    run_python_script(script_name, timeout)
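Usage note: since run_python_script prefixes each script with kv_transfer/, the wrapper is presumably meant to be launched from the tests/ directory. A typical invocation would be:

cd tests
pytest -s -x kv_transfer/module_test.py

(-s keeps the two child processes' stdout/stderr visible, -x stops at the first failure.)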
160
tests/kv_transfer/test_lookup_buffer.py
Normal file
@@ -0,0 +1,160 @@
import os
import random

import torch
from tqdm import tqdm

from vllm.config import KVTransferConfig
from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_buffer import (
    SimpleBuffer)
from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe

# TODO: the test depends on a lot of fields in the current implementation.
# We should have a standard interface instead of direct field access.


def test_run(my_rank, buffer, device):

    # buffer should be empty in the beginning
    if my_rank == 0:
        assert buffer.buffer_size == 0
        assert len(buffer.buffer) == 0

    print("My rank: %d, device: %s" % (my_rank, device))

    # insert
    tokens = torch.tensor([1, 2, 3]).to(device)
    roi = (tokens > 0)
    if my_rank == 0:
        key = 2.0 * torch.ones([5, 6]).to(device)
        value = 3.0 * torch.ones([5, 6]).to(device)

        # placeholder stands in for the hidden states
        placeholder = torch.tensor([1]).to(device)

        buffer.insert(tokens, roi, key, value, placeholder)

    torch.distributed.barrier()

    # drop_select
    if my_rank == 1:
        tok, roi_, key, value, hidden = buffer.drop_select(tokens, roi)
        assert torch.allclose(tokens, tok)
        assert torch.allclose(roi, roi_)
        assert torch.allclose(key, 2.0 * torch.ones([5, 6], device=device))
        assert torch.allclose(value, 3.0 * torch.ones([5, 6], device=device))
    torch.distributed.barrier()

    # drop_select should have removed the entry on the sender side
    if my_rank == 0:
        assert buffer.buffer_size == 0
        assert len(buffer.buffer) == 0

    print("Test run passed!")


def stress_test(my_rank, buf, device):

    torch.distributed.barrier()
    torch.manual_seed(100)

    reqs = [
        (
            torch.rand(100).to(device),  # tokens
            torch.ones(100).bool().to(device),  # roi
            torch.rand(100).to(device),  # key
            torch.rand(100).to(device),  # value
            torch.rand(100).to(device),  # hidden
        ) for _ in tqdm(range(200))
    ]

    random.seed(my_rank)
    random.shuffle(reqs)

    torch.distributed.barrier()

    n = 0

    # the buffer can only hold 100 reqs,
    # so the sender will occasionally block to wait for the receiver.
    for req in tqdm(reqs):
        if my_rank == 0:
            buf.insert(*req)
        else:
            tok, roi, k, v, h = req
            tok_, roi_, k_, v_, h_ = buf.drop_select(tok, roi)

            if tok_ is None:
                assert roi_ is None
                assert k_ is None
                assert v_ is None
                assert h_ is None
                n += 1
            else:
                assert torch.allclose(tok, tok_)
                assert torch.allclose(roi, roi_)
                assert torch.allclose(k, k_)
                assert torch.allclose(v, v_)
                assert torch.allclose(h, h_)
    print('Rank %d done' % my_rank)
    torch.distributed.barrier()

    if my_rank == 0:
        x = torch.tensor([0])
        torch.distributed.recv(x, 1)
        # the number of Nones the receiver got equals the number of
        # entries that were never selected and thus remain in the buffer
        assert x.item() == len(buf.buffer)
        # each remaining entry occupies 1700 bytes: four 100-element
        # float32 tensors (400 bytes each) plus a 100-element bool roi
        print(buf.buffer_size)
        assert buf.buffer_size == 1700 * len(buf.buffer)
    else:
        torch.distributed.send(torch.tensor([n]), 0)

    print("Passed stress test!")


if __name__ == "__main__":

    my_rank = int(os.environ['RANK'])

    torch.distributed.init_process_group(
        backend='gloo',
        init_method='tcp://localhost:12398',
        world_size=2,
        rank=my_rank,
    )

    print("initialized! My rank is %d" % my_rank)

    config = KVTransferConfig(
        kv_connector='PyNcclConnector',
        kv_buffer_device='cuda',
        kv_buffer_size=1e9,
        kv_rank=my_rank,
        kv_role="kv_both",  # this arg doesn't matter in this test
        kv_parallel_size=2,
        kv_ip="127.0.0.1",
        kv_port=12345,
    )

    data_pipe = PyNcclPipe(
        local_rank=my_rank,
        config=config,
        device="cuda",
        port_offset=0,
    )
    cpu_pipe = PyNcclPipe(
        local_rank=my_rank,
        config=config,
        device="cpu",
        port_offset=1,
    )

    buffer = SimpleBuffer(cpu_pipe, data_pipe, 170000)

    test_run(my_rank, buffer, data_pipe.device)

    stress_test(my_rank, buffer, data_pipe.device)

    buffer.close()
    data_pipe.close()
    cpu_pipe.close()
    print('Done')
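For readers unfamiliar with the lookup-buffer contract this test exercises, here is a minimal single-process mock of the insert/drop_select semantics. This is an illustrative sketch only, not vLLM's SimpleBuffer: it has no pipes, no blocking, and no size accounting.

import torch

class MockLookupBuffer:
    """Stores (tokens, roi, key, value, hidden) tuples; drop_select
    removes and returns the entry whose tokens/roi match, else Nones."""

    def __init__(self):
        self.buffer = []

    def insert(self, tokens, roi, key, value, hidden):
        self.buffer.append((tokens, roi, key, value, hidden))

    def drop_select(self, tokens, roi):
        for i, (tok, roi_, *_rest) in enumerate(self.buffer):
            if torch.equal(tok, tokens) and torch.equal(roi_, roi):
                return self.buffer.pop(i)  # entry is removed on selection
        return (None, None, None, None, None)

# usage mirroring test_run above
buf = MockLookupBuffer()
t = torch.tensor([1, 2, 3])
buf.insert(t, t > 0, torch.ones(5, 6), torch.ones(5, 6), torch.tensor([1]))
tok, roi, k, v, h = buf.drop_select(t, t > 0)
assert tok is not None and len(buf.buffer) == 0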
3
tests/kv_transfer/test_lookup_buffer.sh
Normal file
@@ -0,0 +1,3 @@
#!/bin/bash
RANK=0 python test_lookup_buffer.py &
RANK=1 python test_lookup_buffer.py &
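As committed, the script backgrounds both ranks and returns immediately. If you want the launcher itself to block until both ranks finish and propagate failures, a possible variant (not part of this commit) is:

#!/bin/bash
RANK=0 python test_lookup_buffer.py & pid0=$!
RANK=1 python test_lookup_buffer.py & pid1=$!
wait "$pid0"; status0=$?
wait "$pid1"; status1=$?
exit $((status0 | status1))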
155
tests/kv_transfer/test_send_recv.py
Normal file
@@ -0,0 +1,155 @@
import os
import time
from typing import List, Optional

import torch
from tqdm import tqdm

from vllm.config import KVTransferConfig
from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe


def test_run(my_rank, pipe):
    # test run
    x = torch.tensor([1]).to(pipe.device)
    y = torch.tensor([[2., 3., 4., 8.]]).to(pipe.device)
    if my_rank == 0:
        pipe.send_tensor(x)
        print("sent tensor x")
        pipe.send_tensor(y)
        print("sent tensor y")
        x2 = pipe.recv_tensor()
        print("received x2 = ", x2)
        y2 = pipe.recv_tensor()
        print("received y2 = ", y2)

    else:
        x2 = pipe.recv_tensor()
        print("received x2 = ", x2)
        y2 = pipe.recv_tensor()
        print("received y2 = ", y2)
        pipe.send_tensor(x)
        print("sent tensor x")
        pipe.send_tensor(y)
        print("sent tensor y")

    assert torch.allclose(x, x2)
    assert torch.allclose(y, y2)


def stress_test(my_rank, pipe):

    torch.distributed.barrier()

    # None entries exercise the pipe's ability to transmit a None
    tensors: List[Optional[torch.Tensor]] = []

    torch.manual_seed(0)

    for i in tqdm(range(500)):
        mean = torch.rand(1).item() * 100
        std = torch.rand(1).item() * 100
        size = torch.randint(900, 1000, (2, ))
        x = torch.normal(mean * 1.0, std * 1.0,
                         size=size.tolist()).to(pipe.device)

        # 5% probability of sending a None
        if torch.rand(1).item() < 0.05:
            tensors.append(None)
            tensors.append(None)
            tensors.append(None)
        else:
            tensors.append(x)
            tensors.append(x.mean().unsqueeze(0))
            tensors.append(x.std().unsqueeze(0))

    torch.distributed.barrier()

    for i in tqdm(range(500)):
        # alternate the sender role between the two ranks
        if my_rank == int((i % 10) > 3):
            pipe.send_tensor(tensors[3 * i])
            pipe.send_tensor(tensors[3 * i + 1])
            pipe.send_tensor(tensors[3 * i + 2])
        else:
            x = pipe.recv_tensor()
            mean = pipe.recv_tensor()
            std = pipe.recv_tensor()

            if x is None:
                assert mean is None
                assert std is None
            else:
                assert torch.allclose(x, tensors[3 * i])
                assert x.mean() == mean[0]
                assert x.std() == std[0]

    torch.distributed.barrier()


def latency_test(my_rank, pipe, nelement, ntensor):

    latencies = []

    torch.distributed.barrier()

    for i in tqdm(range(500)):

        tensors = []

        if my_rank == 0:
            # create tensors
            tensors = [
                torch.rand(nelement).to(pipe.device) for _ in range(ntensor)
            ]

        torch.distributed.barrier()

        if my_rank == 0:
            # embed the send wall-clock time in a trailing float64 tensor
            t = torch.tensor([time.time()],
                             dtype=torch.float64).to(pipe.device)
            for tensor in tensors:
                pipe.send_tensor(tensor)
            pipe.send_tensor(t)
        else:
            for _ in range(ntensor):
                pipe.recv_tensor()
            t = pipe.recv_tensor()
            latencies.append(time.time() - t.item())

    torch.distributed.barrier()

    print('Latency test passed.')
    print('Latency:', torch.tensor(latencies).mean().item() * 1000, 'ms')


if __name__ == "__main__":

    my_rank = int(os.environ['RANK'])

    torch.distributed.init_process_group(
        backend='gloo',
        init_method='tcp://localhost:12398',
        world_size=2,
        rank=my_rank,
    )

    config = KVTransferConfig(
        kv_connector='PyNcclConnector',
        kv_buffer_device='cuda',
        kv_buffer_size=1e9,
        kv_rank=my_rank,
        kv_role="kv_both",  # this arg doesn't matter in this test
        kv_parallel_size=2,
        kv_ip="127.0.0.1",
        kv_port=12345,
    )

    pipe = PyNcclPipe(
        local_rank=my_rank,
        config=config,
    )

    test_run(my_rank, pipe)
    stress_test(my_rank, pipe)

    # Use this function if you want to test the latency of the pipe impl.
    # latency_test(my_rank, pipe, 1024 * 8 * 128, 80)
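latency_test measures one-way transfer latency by embedding the sender's wall-clock time in a trailing float64 tensor and subtracting on arrival; this is valid here because both ranks run on the same host and share a clock. If you want more than the mean, a small helper could summarize the recorded list (a hypothetical sketch, not part of this commit):

import torch

def summarize_latencies(latencies_s):
    # latencies_s: list of per-batch one-way latencies in seconds
    t = torch.tensor(latencies_s, dtype=torch.float64) * 1000.0  # -> ms
    return {
        "mean_ms": t.mean().item(),
        "p50_ms": t.median().item(),
        "p99_ms": t.quantile(0.99).item(),
    }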
3
tests/kv_transfer/test_send_recv.sh
Normal file
@@ -0,0 +1,3 @@
#!/bin/bash
RANK=0 python3 test_send_recv.py &
RANK=1 python3 test_send_recv.py &