From a21cedf4ff1facaee601a635e3c092fe02742290 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 16 Feb 2026 14:24:35 +0100 Subject: [PATCH] Bump `lm-eval` version for Transformers v5 compatibility (#33994) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../run-lm-eval-chartqa-vllm-vlm-baseline.sh | 2 +- .../run-lm-eval-gsm-hf-baseline.sh | 2 +- .../run-lm-eval-gsm-vllm-baseline.sh | 2 +- .../run-lm-eval-mmlupro-vllm-baseline.sh | 2 +- .../hardware_ci/run-tpu-v1-test-part2.sh | 2 +- .../scripts/hardware_ci/run-tpu-v1-test.sh | 2 +- docs/features/quantization/fp8.md | 2 +- docs/features/quantization/int4.md | 2 +- docs/features/quantization/int8.md | 2 +- docs/features/quantization/quark.md | 2 +- requirements/nightly_torch_test.txt | 2 +- requirements/rocm-test.txt | 2 +- requirements/test.in | 2 +- requirements/test.txt | 24 +++++-------------- 14 files changed, 19 insertions(+), 31 deletions(-) diff --git a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh index 0745da8dc..02371f3dd 100755 --- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh @@ -2,7 +2,7 @@ # We can use this script to compute baseline accuracy on chartqa for vllm. # # Make sure you have lm-eval-harness installed: -# pip install "lm-eval[api]>=0.4.9.2" +# pip install "lm-eval[api]>=0.4.11" usage() { echo`` diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh index 5c17a0624..f010ffe67 100755 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh @@ -2,7 +2,7 @@ # We can use this script to compute baseline accuracy on GSM for transformers. # # Make sure you have lm-eval-harness installed: -# pip install "lm-eval[api]>=0.4.9.2" +# pip install "lm-eval[api]>=0.4.11" usage() { echo`` diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh index 1b617ff17..fec4a94e6 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh @@ -3,7 +3,7 @@ # We use this for fp8, which HF does not support. # # Make sure you have lm-eval-harness installed: -# pip install "lm-eval[api]>=0.4.9.2" +# pip install "lm-eval[api]>=0.4.11" usage() { echo`` diff --git a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh index 12336d7f8..c5128cea6 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh @@ -3,7 +3,7 @@ # We use this for fp8, which HF does not support. # # Make sure you have lm-eval-harness installed: -# pip install "lm-eval[api]>=0.4.9.2" +# pip install "lm-eval[api]>=0.4.11" usage() { echo`` diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh index 6959f81ea..6ec6ab94f 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh @@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR" echo "--- Installing Python dependencies ---" python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ - && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \ + && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \ && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 echo "--- Python dependencies installed ---" diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index eafc82b98..feaf2b356 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR" echo "--- Installing Python dependencies ---" python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ - && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \ + && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \ && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 echo "--- Python dependencies installed ---" diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md index 76fc04710..6034b0496 100644 --- a/docs/features/quantization/fp8.md +++ b/docs/features/quantization/fp8.md @@ -84,7 +84,7 @@ Since simple RTN does not require data for weight quantization and the activatio Install `vllm` and `lm-evaluation-harness` for evaluation: ```bash -pip install vllm "lm-eval[api]>=0.4.9.2" +pip install vllm "lm-eval[api]>=0.4.11" ``` Load and run the model in `vllm`: diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/int4.md index 049a7ceed..ed8a08a6a 100644 --- a/docs/features/quantization/int4.md +++ b/docs/features/quantization/int4.md @@ -18,7 +18,7 @@ pip install llmcompressor Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: ```bash -pip install vllm "lm-eval[api]>=0.4.9.2" +pip install vllm "lm-eval[api]>=0.4.11" ``` ## Quantization Process diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md index 8af3e24c7..18965aed3 100644 --- a/docs/features/quantization/int8.md +++ b/docs/features/quantization/int8.md @@ -23,7 +23,7 @@ pip install llmcompressor Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: ```bash -pip install vllm "lm-eval[api]>=0.4.9.2" +pip install vllm "lm-eval[api]>=0.4.11" ``` ## Quantization Process diff --git a/docs/features/quantization/quark.md b/docs/features/quantization/quark.md index bbab97740..1961d7309 100644 --- a/docs/features/quantization/quark.md +++ b/docs/features/quantization/quark.md @@ -20,7 +20,7 @@ for more installation details. Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: ```bash -pip install vllm "lm-eval[api]>=0.4.9.2" +pip install vllm "lm-eval[api]>=0.4.11" ``` ## Quantization Process diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt index cc5ea519a..c9211b913 100644 --- a/requirements/nightly_torch_test.txt +++ b/requirements/nightly_torch_test.txt @@ -27,7 +27,7 @@ mistral_common[image,audio] >= 1.9.1 # required for voxtral test num2words # required for smolvlm test opencv-python-headless >= 4.13.0 # required for video test datamodel_code_generator # required for minicpm3 test -lm-eval[api]>=0.4.9.2 # required for model evaluation test +lm-eval[api]>=0.4.11 # required for model evaluation test mteb>=1.38.11, <2 # required for mteb test transformers==4.57.5 tokenizers==0.22.0 diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt index c5bc6048d..070c18363 100644 --- a/requirements/rocm-test.txt +++ b/requirements/rocm-test.txt @@ -58,7 +58,7 @@ schemathesis==3.39.15 # OpenAI schema test # Evaluation and benchmarking -lm-eval[api]==0.4.9.2 +lm-eval[api]==0.4.11 jiwer==4.0.0 # Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test diff --git a/requirements/test.in b/requirements/test.in index 18a80433d..5faf1c456 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -35,7 +35,7 @@ num2words # required for smolvlm test open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py opencv-python-headless >= 4.13.0 # required for video test datamodel_code_generator # required for minicpm3 test -lm-eval[api]>=0.4.9.2 # required for model evaluation test +lm-eval[api]>=0.4.11 # required for model evaluation test mteb[bm25s]>=2, <3 # required for mteb test transformers==4.57.5 tokenizers==0.22.0 diff --git a/requirements/test.txt b/requirements/test.txt index 72583587e..c18d21637 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -5,9 +5,7 @@ absl-py==2.1.0 # rouge-score # tensorboard accelerate==1.0.1 - # via - # lm-eval - # peft + # via peft aenum==3.1.16 # via lightly affine==2.4.0 @@ -138,7 +136,6 @@ colorama==0.4.6 # perceptron # sacrebleu # schemathesis - # tqdm-multiprocess colorful==0.5.6 # via ray colorlog==6.10.1 @@ -383,6 +380,7 @@ jinja2==3.1.6 # via # datamodel-code-generator # genai-perf + # lm-eval # torch jiwer==3.0.5 # via -r requirements/test.in @@ -448,7 +446,7 @@ lightning-utilities==0.14.3 # torchmetrics llvmlite==0.44.0 # via numba -lm-eval==0.4.9.2 +lm-eval==0.4.11 # via -r requirements/test.in lxml==5.3.0 # via @@ -513,8 +511,6 @@ numba==0.61.2 # via # -r requirements/test.in # librosa -numexpr==2.10.1 - # via lm-eval numpy==2.2.6 # via # -r requirements/test.in @@ -540,11 +536,11 @@ numpy==2.2.6 # librosa # lightly # lightly-utils + # lm-eval # matplotlib # mistral-common # mteb # numba - # numexpr # opencv-python-headless # optuna # pandas @@ -707,9 +703,7 @@ pathvalidate==3.2.1 patsy==1.0.1 # via statsmodels peft==0.16.0 - # via - # -r requirements/test.in - # lm-eval + # via -r requirements/test.in perceptron==0.1.4 # via -r requirements/test.in perf-analyzer==0.1.0 @@ -792,8 +786,6 @@ pyasn1==0.6.1 # rsa pyasn1-modules==0.4.2 # via google-auth -pybind11==2.13.6 - # via lm-eval pycocotools==2.0.8 # via terratorch pycountry==24.6.1 @@ -1171,7 +1163,6 @@ torch==2.10.0+cu129 # kornia # lightly # lightning - # lm-eval # mteb # open-clip-torch # peft @@ -1229,15 +1220,11 @@ tqdm==4.67.3 # sentence-transformers # tacoreader # terratorch - # tqdm-multiprocess # transformers -tqdm-multiprocess==0.0.11 - # via lm-eval transformers==4.57.5 # via # -r requirements/test.in # genai-perf - # lm-eval # peft # sentence-transformers # transformers-stream-generator @@ -1272,6 +1259,7 @@ typing-extensions==4.15.0 # librosa # lightning # lightning-utilities + # lm-eval # mistral-common # mteb # opentelemetry-api