diff --git a/.buildkite/lm-eval-harness/configs/models-large-rocm.txt b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
index 4fb0b84bc..a9a60f348 100644
--- a/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
@@ -1 +1,2 @@
 Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
+Qwen3-235B-A22B-Instruct-2507-FP8.yaml
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 6c35e0db1..ab8bf9d23 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1544,8 +1544,8 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 
-##### H100 test #####
-- label: LM Eval Large Models (H100) # optional
+##### FP8 test #####
+- label: LM Eval Large Models (H100) # optional, still use H100 for consistency
   gpu: h100
   optional: true
   mirror_hardwares: [amdexperimental, amdproduction]
@@ -1557,8 +1557,8 @@ steps:
   - csrc/
   - vllm/model_executor/layers/quantization
   commands:
-  - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
-  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+  - export VLLM_USE_DEEP_GEMM=0
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=4
 
 ##### H200 test #####
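
For reference, each filename listed in models-large-rocm.txt names a per-model accuracy config under .buildkite/lm-eval-harness/configs/. Below is a minimal sketch of what the newly listed Qwen3-235B-A22B-Instruct-2507-FP8.yaml could look like, following the field layout used by the existing configs in that directory; the model path, task list, and baseline values here are illustrative assumptions, not the values added by this PR.

```yaml
# Hypothetical sketch of configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml.
# All values below are placeholders for illustration.
model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"  # assumed HF model id
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.90  # placeholder accuracy baseline
  - name: "exact_match,flexible-extract"
    value: 0.90  # placeholder accuracy baseline
limit: 1000      # number of eval samples
num_fewshot: 5
```

test_lm_eval_correctness.py iterates over every file named by --config-list-file, serves the model at the requested --tp-size, runs the listed lm-eval tasks, and checks the measured metrics against the recorded baselines within a tolerance.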