[CI][DeepSeek] Add nightly DeepSeek R1 lm_eval tests on H200 (#30356)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
Matthew Bonanni
2026-01-05 17:17:59 -05:00
committed by GitHub
parent 32f4e4db00
commit 276e03b92c
5 changed files with 33 additions and 1 deletion

View File

@@ -1351,6 +1351,14 @@ steps:
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py
# Optional nightly lm_eval job on an 8-GPU H200 node; the models to
# evaluate are listed in configs/models-h200.txt (one config file per line).
- label: LM Eval Large Models (H200) # optional
  timeout_in_minutes: 60
  gpu: h200
  optional: true
  num_gpus: 8
  commands:
    # Runs GSM8K correctness checks against each config in the list file.
    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
##### B200 test #####
- label: Distributed Tests (B200) # optional
gpu: b200

View File

@@ -0,0 +1,11 @@
# GSM8K correctness eval config: DeepSeek-R1 served with 8-way data
# parallelism + expert parallelism (DP variant; see the matching TP config).
model_name: "deepseek-ai/DeepSeek-R1"
# Minimum GSM8K accuracy the run must reach to pass.
accuracy_threshold: 0.95
# 1319 questions — presumably the full GSM8K test split; verify against harness.
num_questions: 1319
# Few-shot examples prepended to each prompt.
num_fewshot: 5
# Large model: allow up to 20 min for server startup. Read by the test via
# eval_config.get("startup_max_wait_seconds", 600).
startup_max_wait_seconds: 1200
# Extra vLLM server CLI flags, folded into a single line by YAML ">-".
# NOTE: do not add comments inside this block — they would become part of the string.
server_args: >-
  --enforce-eager
  --max-model-len 4096
  --data-parallel-size 8
  --enable-expert-parallel
  --speculative-config '{"method":"mtp","num_speculative_tokens":1}'

View File

@@ -0,0 +1,11 @@
# GSM8K correctness eval config: DeepSeek-R1 served with 8-way tensor
# parallelism + expert parallelism (TP variant; see the matching DP config).
model_name: "deepseek-ai/DeepSeek-R1"
# Minimum GSM8K accuracy the run must reach to pass.
accuracy_threshold: 0.95
# 1319 questions — presumably the full GSM8K test split; verify against harness.
num_questions: 1319
# Few-shot examples prepended to each prompt.
num_fewshot: 5
# Large model: allow up to 20 min for server startup. Read by the test via
# eval_config.get("startup_max_wait_seconds", 600).
startup_max_wait_seconds: 1200
# Extra vLLM server CLI flags, folded into a single line by YAML ">-".
# NOTE: do not add comments inside this block — they would become part of the string.
server_args: >-
  --enforce-eager
  --max-model-len 4096
  --tensor-parallel-size 8
  --enable-expert-parallel
  --speculative-config '{"method":"mtp","num_speculative_tokens":1}'

View File

@@ -0,0 +1,2 @@
DeepSeek-R1-TP.yaml
DeepSeek-R1-DP.yaml

View File

@@ -78,7 +78,7 @@ def test_gsm8k_correctness(config_filename):
eval_config["model_name"],
server_args,
env_dict=env_dict,
max_wait_seconds=600,
max_wait_seconds=eval_config.get("startup_max_wait_seconds", 600),
) as remote_server:
server_url = remote_server.url_for("v1")
print(f"Server started at: {server_url}")