---
# Buildkite pipeline group: LM Eval.
# Runs lm-eval / GSM8K / GPQA correctness checks across GPU types.
# Steps marked `optional: true` are manually triggered; `source_file_dependencies`
# restricts when a step is scheduled based on changed paths.
group: LM Eval
depends_on:
  - image-build
steps:
  - label: LM Eval Small Models
    timeout_in_minutes: 75
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
    autorun_on_main: true
    commands:
      - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt

  - label: LM Eval Large Models (4 GPUs)(A100)
    device: a100
    optional: true
    num_devices: 4
    working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
    commands:
      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
      - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

  - label: LM Eval Large Models (4 GPUs)(H100)
    device: h100
    optional: true
    num_devices: 4
    working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
    commands:
      # We found Triton is faster than DeepGEMM for H100
      - export VLLM_USE_DEEP_GEMM=0
      - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4

  - label: LM Eval Small Models (B200)
    timeout_in_minutes: 120
    device: b200
    optional: true
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
    commands:
      - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt

  - label: LM Eval Large Models (H200)
    timeout_in_minutes: 60
    device: h200
    optional: true
    num_devices: 8
    commands:
      - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt

  - label: MoE Refactor Integration Test (H100 - TEMPORARY)
    device: h100
    optional: true
    num_devices: 2
    commands:
      - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt

  - label: MoE Refactor Integration Test (B200 - TEMPORARY)
    device: b200
    optional: true
    num_devices: 2
    commands:
      - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt

  - label: MoE Refactor Integration Test (B200 DP - TEMPORARY)
    device: b200
    optional: true
    num_devices: 2
    commands:
      - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt

  - label: GPQA Eval (GPT-OSS) (H100)
    timeout_in_minutes: 120
    device: h100
    optional: true
    num_devices: 2
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
      - tests/evals/gpt_oss/
    commands:
      - uv pip install --system 'gpt-oss[eval]==0.0.5'
      - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-h100.txt

  - label: GPQA Eval (GPT-OSS) (B200)
    timeout_in_minutes: 120
    device: b200
    optional: true
    num_devices: 2
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
      - tests/evals/gpt_oss/
    commands:
      - uv pip install --system 'gpt-oss[eval]==0.0.5'
      - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-b200.txt