Bump `lm-eval` version for Transformers v5 compatibility (#33994)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

Bump `lm-eval` version for Transformers v5 compatibility (#33994)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
a21cedf4 · Harry Mellor · GitHub · 3ef74cde · a21cedf4 · a21cedf4
Unverified commit a21cedf4, authored Feb 16, 2026 by Harry Mellor; committed by GitHub on Feb 16, 2026.
14 changed files
--- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on chartqa for vllm.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.9.2"
+#   pip install "lm-eval[api]>=0.4.11"
 usage() {
    echo``

--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.9.2"
+#   pip install "lm-eval[api]>=0.4.11"
 usage() {
    echo``

--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.9.2"
+#   pip install "lm-eval[api]>=0.4.11"
 usage() {
    echo``

--- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.9.2"
+#   pip install "lm-eval[api]>=0.4.11"
 usage() {
    echo``

--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
+    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"

--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
+    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"

--- a/docs/features/quantization/fp8.md
+++ b/docs/features/quantization/fp8.md
@@ -84,7 +84,7 @@ Since simple RTN does not require data for weight quantization and the activatio
 Install `vllm` and `lm-evaluation-harness` for evaluation:
 ```bash
-pip install vllm "lm-eval[api]>=0.4.9.2"
+pip install vllm "lm-eval[api]>=0.4.11"
 ```
 Load and run the model in `vllm`:

--- a/docs/features/quantization/int4.md
+++ b/docs/features/quantization/int4.md
@@ -18,7 +18,7 @@ pip install llmcompressor
 Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
 ```bash
-pip install vllm "lm-eval[api]>=0.4.9.2"
+pip install vllm "lm-eval[api]>=0.4.11"
 ```
 ## Quantization Process

--- a/docs/features/quantization/int8.md
+++ b/docs/features/quantization/int8.md
@@ -23,7 +23,7 @@ pip install llmcompressor
 Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
 ```bash
-pip install vllm "lm-eval[api]>=0.4.9.2"
+pip install vllm "lm-eval[api]>=0.4.11"
 ```
 ## Quantization Process

--- a/docs/features/quantization/quark.md
+++ b/docs/features/quantization/quark.md
@@ -20,7 +20,7 @@ for more installation details.
 Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
 ```bash
-pip install vllm "lm-eval[api]>=0.4.9.2"
+pip install vllm "lm-eval[api]>=0.4.11"
 ```
 ## Quantization Process

--- a/requirements/nightly_torch_test.txt
+++ b/requirements/nightly_torch_test.txt
@@ -27,7 +27,7 @@ mistral_common[image,audio] >= 1.9.1 # required for voxtral test
 num2words # required for smolvlm test
 opencv-python-headless >= 4.13.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
-lm-eval[api]>=0.4.9.2 # required for model evaluation test
+lm-eval[api]>=0.4.11 # required for model evaluation test
 mteb>=1.38.11, <2 # required for mteb test
 transformers==4.57.5
 tokenizers==0.22.0

--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@@ -58,7 +58,7 @@ schemathesis==3.39.15
    # OpenAI schema test
 # Evaluation and benchmarking
-lm-eval[api]==0.4.9.2
+lm-eval[api]==0.4.11
 jiwer==4.0.0
 # Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test

--- a/requirements/test.in
+++ b/requirements/test.in
@@ -35,7 +35,7 @@ num2words # required for smolvlm test
 open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py
 opencv-python-headless >= 4.13.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
-lm-eval[api]>=0.4.9.2 # required for model evaluation test
+lm-eval[api]>=0.4.11 # required for model evaluation test
 mteb[bm25s]>=2, <3 # required for mteb test
 transformers==4.57.5
 tokenizers==0.22.0

--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -5,9 +5,7 @@ absl-py==2.1.0
    #   rouge-score
    #   tensorboard
 accelerate==1.0.1
-    # via
+    # via peft
-    #   lm-eval
-    #   peft
 aenum==3.1.16
    # via lightly
 affine==2.4.0
@@ -138,7 +136,6 @@ colorama==0.4.6
    #   perceptron
    #   sacrebleu
    #   schemathesis
-    #   tqdm-multiprocess
 colorful==0.5.6
    # via ray
 colorlog==6.10.1
@@ -383,6 +380,7 @@ jinja2==3.1.6
    # via
    #   datamodel-code-generator
    #   genai-perf
+    #   lm-eval
    #   torch
 jiwer==3.0.5
    # via -r requirements/test.in
@@ -448,7 +446,7 @@ lightning-utilities==0.14.3
    #   torchmetrics
 llvmlite==0.44.0
    # via numba
-lm-eval==0.4.9.2
+lm-eval==0.4.11
    # via -r requirements/test.in
 lxml==5.3.0
    # via
@@ -513,8 +511,6 @@ numba==0.61.2
    # via
    #   -r requirements/test.in
    #   librosa
-numexpr==2.10.1
-    # via lm-eval
 numpy==2.2.6
    # via
    #   -r requirements/test.in
@@ -540,11 +536,11 @@ numpy==2.2.6
    #   librosa
    #   lightly
    #   lightly-utils
+    #   lm-eval
    #   matplotlib
    #   mistral-common
    #   mteb
    #   numba
-    #   numexpr
    #   opencv-python-headless
    #   optuna
    #   pandas
@@ -707,9 +703,7 @@ pathvalidate==3.2.1
 patsy==1.0.1
    # via statsmodels
 peft==0.16.0
-    # via
+    # via -r requirements/test.in
-    #   -r requirements/test.in
-    #   lm-eval
 perceptron==0.1.4
    # via -r requirements/test.in
 perf-analyzer==0.1.0
@@ -792,8 +786,6 @@ pyasn1==0.6.1
    #   rsa
 pyasn1-modules==0.4.2
    # via google-auth
-pybind11==2.13.6
-    # via lm-eval
 pycocotools==2.0.8
    # via terratorch
 pycountry==24.6.1
@@ -1171,7 +1163,6 @@ torch==2.10.0+cu129
    #   kornia
    #   lightly
    #   lightning
-    #   lm-eval
    #   mteb
    #   open-clip-torch
    #   peft
@@ -1229,15 +1220,11 @@ tqdm==4.67.3
    #   sentence-transformers
    #   tacoreader
    #   terratorch
-    #   tqdm-multiprocess
    #   transformers
-tqdm-multiprocess==0.0.11
-    # via lm-eval
 transformers==4.57.5
    # via
    #   -r requirements/test.in
    #   genai-perf
-    #   lm-eval
    #   peft
    #   sentence-transformers
    #   transformers-stream-generator
@@ -1272,6 +1259,7 @@ typing-extensions==4.15.0
    #   librosa
    #   lightning
    #   lightning-utilities
+    #   lm-eval
    #   mistral-common
    #   mteb
    #   opentelemetry-api