[ci][test] add correctness test for cpu offloading (#6549)

2024-07-18 16:41:06 -07:00
parent 2d4733ba2d
commit f53b8f0d05
4 changed files with 105 additions and 85 deletions
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -10,6 +10,7 @@ from typing import Any, Dict, List
 import openai
 import ray
 import requests
+from transformers import AutoTokenizer

 from vllm.distributed import (ensure_model_parallel_initialized,
                              init_distributed_environment)
@@ -124,6 +125,99 @@ class RemoteOpenAIServer:
        )


+def compare_two_settings(model: str, arg1: List[str], arg2: List[str]):
+    """
+    Launch API server with two different sets of arguments and compare the
+    results of the API calls. The arguments are after the model name.
+    """
+
+    tokenizer = AutoTokenizer.from_pretrained(model)
+
+    prompt = "Hello, my name is"
+    token_ids = tokenizer(prompt)["input_ids"]
+    results = []
+    for args in (arg1, arg2):
+        with RemoteOpenAIServer(model, args) as server:
+            client = server.get_client()
+
+            # test models list
+            models = client.models.list()
+            models = models.data
+            served_model = models[0]
+            results.append({
+                "test": "models_list",
+                "id": served_model.id,
+                "root": served_model.root,
+            })
+
+            # test with text prompt
+            completion = client.completions.create(model=model,
+                                                   prompt=prompt,
+                                                   max_tokens=5,
+                                                   temperature=0.0)
+
+            results.append({
+                "test": "single_completion",
+                "text": completion.choices[0].text,
+                "finish_reason": completion.choices[0].finish_reason,
+                "usage": completion.usage,
+            })
+
+            # test using token IDs
+            completion = client.completions.create(
+                model=model,
+                prompt=token_ids,
+                max_tokens=5,
+                temperature=0.0,
+            )
+
+            results.append({
+                "test": "token_ids",
+                "text": completion.choices[0].text,
+                "finish_reason": completion.choices[0].finish_reason,
+                "usage": completion.usage,
+            })
+
+            # test simple list
+            batch = client.completions.create(
+                model=model,
+                prompt=[prompt, prompt],
+                max_tokens=5,
+                temperature=0.0,
+            )
+
+            results.append({
+                "test": "simple_list",
+                "text0": batch.choices[0].text,
+                "text1": batch.choices[1].text,
+            })
+
+            # test streaming
+            batch = client.completions.create(
+                model=model,
+                prompt=[prompt, prompt],
+                max_tokens=5,
+                temperature=0.0,
+                stream=True,
+            )
+            texts = [""] * 2
+            for chunk in batch:
+                assert len(chunk.choices) == 1
+                choice = chunk.choices[0]
+                texts[choice.index] += choice.text
+            results.append({
+                "test": "streaming",
+                "texts": texts,
+            })
+
+    n = len(results) // 2
+    arg1_results = results[:n]
+    arg2_results = results[n:]
+    for arg1_result, arg2_result in zip(arg1_results, arg2_results):
+        assert arg1_result == arg2_result, \
+            f"Results for {model=} are not the same with {arg1=} and {arg2=}"
+
+
 def init_test_distributed_environment(
    tp_size: int,
    pp_size: int,