[ci][test] add correctness test for cpu offloading (#6549)

This commit is contained in:
youkaichao
2024-07-18 16:41:06 -07:00
committed by GitHub
parent 2d4733ba2d
commit f53b8f0d05
4 changed files with 105 additions and 85 deletions

View File

@@ -10,6 +10,7 @@ from typing import Any, Dict, List
import openai
import ray
import requests
from transformers import AutoTokenizer
from vllm.distributed import (ensure_model_parallel_initialized,
init_distributed_environment)
@@ -124,6 +125,99 @@ class RemoteOpenAIServer:
)
def _collect_api_results(client, model: str, prompt: str,
                         token_ids) -> List[Dict[str, Any]]:
    """
    Run a fixed series of API calls against one server and record the results.

    Args:
        client: client object returned by ``server.get_client()``; it is used
            through ``client.models.list()`` and
            ``client.completions.create(...)``.
        model: model name passed as ``model=`` to every completion request.
        prompt: text prompt used for the completion requests.
        token_ids: the same prompt as produced by the tokenizer
            (``tokenizer(prompt)["input_ids"]``).

    Returns:
        One dict per check, in a fixed order. Each dict carries a ``"test"``
        key naming the check plus the fields to be compared.
    """
    results: List[Dict[str, Any]] = []

    # test models list
    served_model = client.models.list().data[0]
    results.append({
        "test": "models_list",
        "id": served_model.id,
        "root": served_model.root,
    })

    # test with text prompt
    completion = client.completions.create(model=model,
                                           prompt=prompt,
                                           max_tokens=5,
                                           temperature=0.0)
    results.append({
        "test": "single_completion",
        "text": completion.choices[0].text,
        "finish_reason": completion.choices[0].finish_reason,
        "usage": completion.usage,
    })

    # test using token IDs
    completion = client.completions.create(
        model=model,
        prompt=token_ids,
        max_tokens=5,
        temperature=0.0,
    )
    results.append({
        "test": "token_ids",
        "text": completion.choices[0].text,
        "finish_reason": completion.choices[0].finish_reason,
        "usage": completion.usage,
    })

    # test simple list
    batch = client.completions.create(
        model=model,
        prompt=[prompt, prompt],
        max_tokens=5,
        temperature=0.0,
    )
    results.append({
        "test": "simple_list",
        "text0": batch.choices[0].text,
        "text1": batch.choices[1].text,
    })

    # test streaming
    stream = client.completions.create(
        model=model,
        prompt=[prompt, prompt],
        max_tokens=5,
        temperature=0.0,
        stream=True,
    )
    # Two prompts were submitted, so the streamed text is accumulated
    # separately per choice index.
    texts = [""] * 2
    for chunk in stream:
        assert len(chunk.choices) == 1
        choice = chunk.choices[0]
        texts[choice.index] += choice.text
    results.append({
        "test": "streaming",
        "texts": texts,
    })

    return results


def compare_two_settings(model: str, arg1: List[str], arg2: List[str]):
    """
    Launch API server with two different sets of arguments and compare the
    results of the API calls. The arguments are after the model name.

    A server is started once per argument list; the same sequence of requests
    is sent to each, and the recorded results are compared pairwise.

    Args:
        model: model name; used to load the tokenizer, to start the server
            and as the ``model`` field of every request.
        arg1: server arguments for the first setting.
        arg2: server arguments for the second setting.

    Raises:
        AssertionError: if any recorded result differs between the two
            settings. The message names the check that diverged.
    """
    tokenizer = AutoTokenizer.from_pretrained(model)

    prompt = "Hello, my name is"
    token_ids = tokenizer(prompt)["input_ids"]

    # Keep the results of each setting in their own list rather than
    # splitting one flat list in half afterwards.
    results_per_setting: List[List[Dict[str, Any]]] = []
    for args in (arg1, arg2):
        with RemoteOpenAIServer(model, args) as server:
            client = server.get_client()
            results_per_setting.append(
                _collect_api_results(client, model, prompt, token_ids))

    arg1_results, arg2_results = results_per_setting

    # zip() would silently drop trailing entries if the lengths differed.
    assert len(arg1_results) == len(arg2_results), (
        f"Different number of results for {model=} with {arg1=} and {arg2=}")

    for arg1_result, arg2_result in zip(arg1_results, arg2_results):
        assert arg1_result == arg2_result, (
            f"Results for {model=} are not the same with {arg1=} and {arg2=}"
            f" in test {arg1_result['test']!r}: "
            f"{arg1_result=} != {arg2_result=}")
def init_test_distributed_environment(
tp_size: int,
pp_size: int,