[Bugfix][Frontend] Fix Issues Under High Load With zeromq Frontend (#7394)

Co-authored-by: Nick Hill <nickhill@us.ibm.com>
2024-08-21 13:34:14 -04:00
parent d3c002eadc
commit f7e3b0c5aa
9 changed files with 322 additions and 141 deletions
--- a/tests/entrypoints/openai/test_accuracy.py
+++ b/tests/entrypoints/openai/test_accuracy.py
@@ -0,0 +1,55 @@
+"""
+This file test accuracy of the vLLM server via LMEval.
+It uses local-completions, which interacts with vLLM
+through the OAI API with N concurrent connections.
+This simulates real work usage of the API and makes
+sure that the zmq frontend mp RPC message passing and
+AsyncLLMEngine are working correctly.
+"""
+
+import lm_eval
+import pytest
+
+from ...utils import RemoteOpenAIServer
+
+MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
+NUM_CONCURRENT = 500
+TASK = "gsm8k"
+FILTER = "exact_match,strict-match"
+RTOL = 0.03
+EXPECTED_VALUE = 0.58
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        "--max-model-len", "4096", "--enable-chunked-prefill",
+        "--disable-log-requests", "--enforce-eager"
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest.fixture(scope="module")
+def server_data(server):
+    return {
+        "url": f"{server.url_for('v1')}/completions",
+    }
+
+
+def test_lm_eval_accuracy(server_data):
+    model_args = (f"model={MODEL_NAME},"
+                  f"base_url={server_data['url']},"
+                  f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False")
+
+    results = lm_eval.simple_evaluate(
+        model="local-completions",
+        model_args=model_args,
+        tasks=TASK,
+    )
+
+    measured_value = results["results"][TASK][FILTER]
+    assert (measured_value - RTOL < EXPECTED_VALUE
+            and measured_value + RTOL > EXPECTED_VALUE
+            ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"