---
# LM Eval test group: lm-eval-harness / GSM8K correctness checks for small
# and large model config lists across GPU types (A100, H100, B200).
# Runs after the image build; large-model and B200 steps are opt-in.
group: LM Eval
depends_on:
  - image-build
steps:
  - label: LM Eval Small Models
    timeout_in_minutes: 75
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
    autorun_on_main: true
    commands:
      - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
  - label: LM Eval Large Models (4 GPUs)(A100)
    gpu: a100
    optional: true
    num_gpus: 4
    working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
    commands:
      # Spawn workers instead of fork for multi-GPU (tp-size=4) runs.
      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
      - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
  - label: LM Eval Large Models (4 GPUs)(H100)
    gpu: h100
    optional: true
    num_gpus: 4
    working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
    commands:
      - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
      - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
  - label: LM Eval Small Models (B200)
    timeout_in_minutes: 120
    gpu: b200
    optional: true
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
    commands:
      - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt