From a21cedf4ff1facaee601a635e3c092fe02742290 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Mon, 16 Feb 2026 14:24:35 +0100
Subject: [PATCH] Bump `lm-eval` version for Transformers v5 compatibility
 (#33994)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .../run-lm-eval-chartqa-vllm-vlm-baseline.sh  |  2 +-
 .../run-lm-eval-gsm-hf-baseline.sh            |  2 +-
 .../run-lm-eval-gsm-vllm-baseline.sh          |  2 +-
 .../run-lm-eval-mmlupro-vllm-baseline.sh      |  2 +-
 .../hardware_ci/run-tpu-v1-test-part2.sh      |  2 +-
 .../scripts/hardware_ci/run-tpu-v1-test.sh    |  2 +-
 docs/features/quantization/fp8.md             |  2 +-
 docs/features/quantization/int4.md            |  2 +-
 docs/features/quantization/int8.md            |  2 +-
 docs/features/quantization/quark.md           |  2 +-
 requirements/nightly_torch_test.txt           |  2 +-
 requirements/rocm-test.txt                    |  2 +-
 requirements/test.in                          |  2 +-
 requirements/test.txt                         | 24 +++++--------------
 14 files changed, 19 insertions(+), 31 deletions(-)

diff --git a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
index 0745da8dc..02371f3dd 100755
--- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on chartqa for vllm.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.9.2"
+#   pip install "lm-eval[api]>=0.4.11"
 
 usage() {
     echo``
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
index 5c17a0624..f010ffe67 100755
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.9.2"
+#   pip install "lm-eval[api]>=0.4.11"
 
 usage() {
     echo``
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
index 1b617ff17..fec4a94e6 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.9.2"
+#   pip install "lm-eval[api]>=0.4.11"
 
 usage() {
     echo``
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
index 12336d7f8..c5128cea6 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.9.2"
+#   pip install "lm-eval[api]>=0.4.11"
 
 usage() {
     echo``
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
index 6959f81ea..6ec6ab94f 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
     && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
+    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
     && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
 
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
index eafc82b98..feaf2b356 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
     && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
+    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
     && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
 
diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md
index 76fc04710..6034b0496 100644
--- a/docs/features/quantization/fp8.md
+++ b/docs/features/quantization/fp8.md
@@ -84,7 +84,7 @@ Since simple RTN does not require data for weight quantization and the activatio
 Install `vllm` and `lm-evaluation-harness` for evaluation:
 
 ```bash
-pip install vllm "lm-eval[api]>=0.4.9.2"
+pip install vllm "lm-eval[api]>=0.4.11"
 ```
 
 Load and run the model in `vllm`:
diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/int4.md
index 049a7ceed..ed8a08a6a 100644
--- a/docs/features/quantization/int4.md
+++ b/docs/features/quantization/int4.md
@@ -18,7 +18,7 @@ pip install llmcompressor
 Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
 
 ```bash
-pip install vllm "lm-eval[api]>=0.4.9.2"
+pip install vllm "lm-eval[api]>=0.4.11"
 ```
 
 ## Quantization Process
diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md
index 8af3e24c7..18965aed3 100644
--- a/docs/features/quantization/int8.md
+++ b/docs/features/quantization/int8.md
@@ -23,7 +23,7 @@ pip install llmcompressor
 Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
 
 ```bash
-pip install vllm "lm-eval[api]>=0.4.9.2"
+pip install vllm "lm-eval[api]>=0.4.11"
 ```
 
 ## Quantization Process
diff --git a/docs/features/quantization/quark.md b/docs/features/quantization/quark.md
index bbab97740..1961d7309 100644
--- a/docs/features/quantization/quark.md
+++ b/docs/features/quantization/quark.md
@@ -20,7 +20,7 @@ for more installation details.
 Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
 
 ```bash
-pip install vllm "lm-eval[api]>=0.4.9.2"
+pip install vllm "lm-eval[api]>=0.4.11"
 ```
 
 ## Quantization Process
diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt
index cc5ea519a..c9211b913 100644
--- a/requirements/nightly_torch_test.txt
+++ b/requirements/nightly_torch_test.txt
@@ -27,7 +27,7 @@ mistral_common[image,audio] >= 1.9.1 # required for voxtral test
 num2words # required for smolvlm test
 opencv-python-headless >= 4.13.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
-lm-eval[api]>=0.4.9.2 # required for model evaluation test
+lm-eval[api]>=0.4.11 # required for model evaluation test
 mteb>=1.38.11, <2 # required for mteb test
 transformers==4.57.5
 tokenizers==0.22.0
diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt
index c5bc6048d..070c18363 100644
--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@@ -58,7 +58,7 @@ schemathesis==3.39.15
     # OpenAI schema test
 
 # Evaluation and benchmarking
-lm-eval[api]==0.4.9.2
+lm-eval[api]==0.4.11
 jiwer==4.0.0
 
 # Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test
diff --git a/requirements/test.in b/requirements/test.in
index 18a80433d..5faf1c456 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -35,7 +35,7 @@ num2words # required for smolvlm test
 open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py
 opencv-python-headless >= 4.13.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
-lm-eval[api]>=0.4.9.2 # required for model evaluation test
+lm-eval[api]>=0.4.11 # required for model evaluation test
 mteb[bm25s]>=2, <3 # required for mteb test
 transformers==4.57.5
 tokenizers==0.22.0
diff --git a/requirements/test.txt b/requirements/test.txt
index 72583587e..c18d21637 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -5,9 +5,7 @@ absl-py==2.1.0
     #   rouge-score
     #   tensorboard
 accelerate==1.0.1
-    # via
-    #   lm-eval
-    #   peft
+    # via peft
 aenum==3.1.16
     # via lightly
 affine==2.4.0
@@ -138,7 +136,6 @@ colorama==0.4.6
     #   perceptron
     #   sacrebleu
     #   schemathesis
-    #   tqdm-multiprocess
 colorful==0.5.6
     # via ray
 colorlog==6.10.1
@@ -383,6 +380,7 @@ jinja2==3.1.6
     # via
     #   datamodel-code-generator
     #   genai-perf
+    #   lm-eval
     #   torch
 jiwer==3.0.5
     # via -r requirements/test.in
@@ -448,7 +446,7 @@ lightning-utilities==0.14.3
     #   torchmetrics
 llvmlite==0.44.0
     # via numba
-lm-eval==0.4.9.2
+lm-eval==0.4.11
     # via -r requirements/test.in
 lxml==5.3.0
     # via
@@ -513,8 +511,6 @@ numba==0.61.2
     # via
     #   -r requirements/test.in
     #   librosa
-numexpr==2.10.1
-    # via lm-eval
 numpy==2.2.6
     # via
     #   -r requirements/test.in
@@ -540,11 +536,11 @@ numpy==2.2.6
     #   librosa
     #   lightly
     #   lightly-utils
+    #   lm-eval
     #   matplotlib
     #   mistral-common
     #   mteb
     #   numba
-    #   numexpr
     #   opencv-python-headless
     #   optuna
     #   pandas
@@ -707,9 +703,7 @@ pathvalidate==3.2.1
 patsy==1.0.1
     # via statsmodels
 peft==0.16.0
-    # via
-    #   -r requirements/test.in
-    #   lm-eval
+    # via -r requirements/test.in
 perceptron==0.1.4
     # via -r requirements/test.in
 perf-analyzer==0.1.0
@@ -792,8 +786,6 @@ pyasn1==0.6.1
     #   rsa
 pyasn1-modules==0.4.2
     # via google-auth
-pybind11==2.13.6
-    # via lm-eval
 pycocotools==2.0.8
     # via terratorch
 pycountry==24.6.1
@@ -1171,7 +1163,6 @@ torch==2.10.0+cu129
     #   kornia
     #   lightly
     #   lightning
-    #   lm-eval
     #   mteb
     #   open-clip-torch
     #   peft
@@ -1229,15 +1220,11 @@ tqdm==4.67.3
     #   sentence-transformers
     #   tacoreader
     #   terratorch
-    #   tqdm-multiprocess
     #   transformers
-tqdm-multiprocess==0.0.11
-    # via lm-eval
 transformers==4.57.5
     # via
     #   -r requirements/test.in
     #   genai-perf
-    #   lm-eval
     #   peft
     #   sentence-transformers
     #   transformers-stream-generator
@@ -1272,6 +1259,7 @@ typing-extensions==4.15.0
     #   librosa
     #   lightning
     #   lightning-utilities
+    #   lm-eval
     #   mistral-common
     #   mteb
     #   opentelemetry-api