---
# CI test pipeline: LM-eval correctness checks (Buildkite-style step schema).
# Each step runs an lm-eval / GSM8K / GPQA correctness suite on a specific
# GPU type; the group as a whole waits on the image-build step.
# NOTE(review): `source_file_dependencies`, `autorun_on_main`, `device`, and
# `num_devices` look like keys consumed by a custom pipeline generator rather
# than core Buildkite — confirm against that generator's schema.
group: LM Eval
depends_on:
  - image-build

steps:
  # Small-model eval; runs automatically on main when the listed sources change.
  - label: LM Eval Small Models
    timeout_in_minutes: 75
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
    autorun_on_main: true
    commands:
      - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt

  - label: LM Eval Large Models (4 GPUs)(A100)
    device: a100
    optional: true
    num_devices: 4
    working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
    commands:
      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
      - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

  - label: LM Eval Large Models (4 GPUs)(H100)
    device: h100
    optional: true
    num_devices: 4
    working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
    commands:
      # We found Triton is faster than DeepGEMM for H100
      - export VLLM_USE_DEEP_GEMM=0
      - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4

  - label: LM Eval Small Models (B200)
    timeout_in_minutes: 120
    device: b200
    optional: true
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
    commands:
      - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt

  - label: LM Eval Large Models (H200)
    timeout_in_minutes: 60
    device: h200
    optional: true
    num_devices: 8
    commands:
      - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt

  # Temporary steps gating the MoE refactor; remove once the refactor lands.
  - label: MoE Refactor Integration Test (H100 - TEMPORARY)
    device: h100
    optional: true
    num_devices: 2
    commands:
      - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt

  - label: MoE Refactor Integration Test (B200 - TEMPORARY)
    device: b200
    optional: true
    num_devices: 2
    commands:
      - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt

  - label: MoE Refactor Integration Test (B200 DP - TEMPORARY)
    device: b200
    optional: true
    num_devices: 2
    commands:
      - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt

  - label: GPQA Eval (GPT-OSS) (H100)
    timeout_in_minutes: 120
    device: h100
    optional: true
    num_devices: 2
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
      - tests/evals/gpt_oss/
    commands:
      - uv pip install --system 'gpt-oss[eval]==0.0.5'
      - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-h100.txt

  - label: GPQA Eval (GPT-OSS) (B200)
    timeout_in_minutes: 120
    device: b200
    optional: true
    num_devices: 2
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
      - tests/evals/gpt_oss/
    commands:
      - uv pip install --system 'gpt-oss[eval]==0.0.5'
      - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-b200.txt