[Deprecation] Remove `prompt_token_ids` arg fallback in `LLM.generate` and `LLM.embed` (#18800)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[Deprecation] Remove `prompt_token_ids` arg fallback in `LLM.generate` and `LLM.embed` (#18800)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
8896eb72 · Cyrus Leung · GitHub · 19fe1a05 · 8896eb72 · 8896eb72
Unverified commit 8896eb72, authored by Cyrus Leung on Aug 22, 2025; committed by GitHub on Aug 22, 2025
20 changed files
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install lm-eval==0.4.4
+#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
 usage() {
    echo``

--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install lm-eval==0.4.4
+#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
 usage() {
    echo``

--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -71,7 +71,7 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
 RUN cd /vllm-workspace \
    && rm -rf vllm \
    && python3 -m pip install -e tests/vllm_test_utils \
-    && python3 -m pip install lm-eval[api]==0.4.4 \
+    && python3 -m pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] \
    && python3 -m pip install pytest-shard
 # -----------------------

--- a/docs/features/quantization/fp8.md
+++ b/docs/features/quantization/fp8.md
@@ -79,7 +79,7 @@ Since simple RTN does not require data for weight quantization and the activatio
 Install `vllm` and `lm-evaluation-harness` for evaluation:
 ```bash
-pip install vllm lm-eval==0.4.4
+pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
 ```
 Load and run the model in `vllm`:

--- a/docs/features/quantization/int4.md
+++ b/docs/features/quantization/int4.md
@@ -18,7 +18,7 @@ pip install llmcompressor
 Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
 ```bash
-pip install vllm lm-eval==0.4.4
+pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
 ```
 ## Quantization Process

--- a/docs/features/quantization/int8.md
+++ b/docs/features/quantization/int8.md
@@ -19,7 +19,7 @@ pip install llmcompressor
 Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
 ```bash
-pip install vllm lm-eval==0.4.4
+pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
 ```
 ## Quantization Process

--- a/docs/features/quantization/quark.md
+++ b/docs/features/quantization/quark.md
@@ -20,7 +20,7 @@ for more installation details.
 Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
 ```bash
-pip install vllm lm-eval==0.4.4
+pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
 ```
 ## Quantization Process

--- a/examples/offline_inference/spec_decode.py
+++ b/examples/offline_inference/spec_decode.py
@@ -5,6 +5,7 @@ from transformers import AutoTokenizer
 from vllm import LLM, SamplingParams
 from vllm.benchmarks.datasets import add_dataset_parser, get_samples
+from vllm.inputs import TokensPrompt
 from vllm.v1.metrics.reader import Counter, Vector
 try:
@@ -137,7 +138,8 @@ def main():
    sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len)
    if not args.custom_mm_prompts:
        outputs = llm.generate(
-            prompt_token_ids=prompt_ids, sampling_params=sampling_params
+            TokensPrompt(prompt_token_ids=prompt_ids),
+            sampling_params=sampling_params,
        )
    else:
        outputs = llm.chat(prompts, sampling_params=sampling_params)

--- a/examples/offline_inference/structured_outputs.py
+++ b/examples/offline_inference/structured_outputs.py
@@ -85,7 +85,7 @@ def format_output(title: str, output: str):
 def generate_output(prompt: str, sampling_params: SamplingParams, llm: LLM):
-    outputs = llm.generate(prompts=prompt, sampling_params=sampling_params)
+    outputs = llm.generate(prompt, sampling_params=sampling_params)
    return outputs[0].outputs[0].text

--- a/requirements/nightly_torch_test.txt
+++ b/requirements/nightly_torch_test.txt
@@ -27,7 +27,7 @@ mistral_common[image,audio] >= 1.8.2 # required for voxtral test
 num2words # required for smolvlm test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
-lm-eval[api]==0.4.8 # required for model evaluation test
+lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
 mteb>=1.38.11, <2 # required for mteb test
 transformers==4.52.4
 tokenizers==0.21.1

--- a/requirements/test.in
+++ b/requirements/test.in
@@ -32,7 +32,8 @@ num2words # required for smolvlm test
 open_clip_torch==2.32.0 # Required for nemotron_vl test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
-lm-eval[api]==0.4.8 # required for model evaluation test
+# TODO: Use lm-eval[api]==0.4.10 once released
+lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
 mteb[bm25s]>=1.38.11, <2 # required for mteb test
 transformers==4.55.2
 tokenizers==0.21.1

--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -408,7 +408,7 @@ lightning-utilities==0.14.3
    #   torchmetrics
 llvmlite==0.44.0
    # via numba
-lm-eval==0.4.8
+lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
    # via -r requirements/test.in
 lxml==5.3.0
    # via

--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
@@ -18,7 +18,6 @@ def text_llm():
              enforce_eager=True,
              seed=0)
-    with llm.deprecate_legacy_api():
    yield weakref.proxy(llm)
    del llm
@@ -88,7 +87,6 @@ def vision_llm():
        seed=0,
    )
-    with llm.deprecate_legacy_api():
    yield weakref.proxy(llm)
    del llm
@@ -158,7 +156,6 @@ def thinking_llm():
        seed=0,
    )
-    with llm.deprecate_legacy_api():
    yield weakref.proxy(llm)
    del llm

--- a/tests/entrypoints/llm/test_classify.py
+++ b/tests/entrypoints/llm/test_classify.py
@@ -35,7 +35,6 @@ def llm():
              enforce_eager=True,
              seed=0)
-    with llm.deprecate_legacy_api():
    yield weakref.proxy(llm)
    del llm

--- a/tests/entrypoints/llm/test_embedding.py
+++ b/tests/entrypoints/llm/test_embedding.py
@@ -26,7 +26,6 @@ def llm():
              enforce_eager=True,
              seed=0)
-    with llm.deprecate_legacy_api():
    yield weakref.proxy(llm)
    del llm

--- a/tests/entrypoints/llm/test_encode.py
+++ b/tests/entrypoints/llm/test_encode.py
@@ -5,11 +5,9 @@ import weakref
 import pytest
-from vllm import LLM, PoolingParams, PoolingRequestOutput
+from vllm import LLM, PoolingParams
 from vllm.distributed import cleanup_dist_env_and_memory
-from ...models.utils import check_embeddings_close
 MODEL_NAME = "intfloat/multilingual-e5-small"
 PROMPTS = [
@@ -48,7 +46,6 @@ def llm():
              enforce_eager=True,
              seed=0)
-    with llm.deprecate_legacy_api():
    yield weakref.proxy(llm)
    del llm
@@ -56,49 +53,6 @@ def llm():
    cleanup_dist_env_and_memory()
-def assert_outputs_match(o1: list[PoolingRequestOutput],
-                         o2: list[PoolingRequestOutput]):
-    check_embeddings_close(
-        embeddings_0_lst=[o.outputs.data for o in o1],
-        embeddings_1_lst=[o.outputs.data for o in o2],
-        name_0="hf",
-        name_1="vllm",
-        tol=1e-2,
-    )
-@pytest.mark.skip_global_cleanup
-@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
-def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
-                                                    prompt_token_ids):
-    pooling_params = PoolingParams()
-    with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
-        v1_output = llm.encode(prompt_token_ids=prompt_token_ids,
-                               pooling_params=pooling_params)
-    v2_output = llm.encode({"prompt_token_ids": prompt_token_ids},
-                           pooling_params=pooling_params)
-    assert_outputs_match(v1_output, v2_output)
-@pytest.mark.skip_global_cleanup
-def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
-    pooling_params = PoolingParams()
-    with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
-        v1_output = llm.encode(prompt_token_ids=TOKEN_IDS,
-                               pooling_params=pooling_params)
-    v2_output = llm.encode(
-        [{
-            "prompt_token_ids": p
-        } for p in TOKEN_IDS],
-        pooling_params=pooling_params,
-    )
-    assert_outputs_match(v1_output, v2_output)
 @pytest.mark.skip_global_cleanup
 def test_multiple_pooling_params(llm: LLM):
    pooling_params = [

--- a/tests/entrypoints/llm/test_generate.py
+++ b/tests/entrypoints/llm/test_generate.py
@@ -5,7 +5,7 @@ import weakref
 import pytest
-from vllm import LLM, RequestOutput, SamplingParams
+from vllm import LLM, SamplingParams
 from vllm.distributed import cleanup_dist_env_and_memory
 MODEL_NAME = "distilbert/distilgpt2"
@@ -41,7 +41,6 @@ def llm():
              gpu_memory_utilization=0.10,
              enforce_eager=True)
-    with llm.deprecate_legacy_api():
    yield weakref.proxy(llm)
    del llm
@@ -49,42 +48,6 @@ def llm():
    cleanup_dist_env_and_memory()
-def assert_outputs_equal(o1: list[RequestOutput], o2: list[RequestOutput]):
-    assert [o.outputs for o in o1] == [o.outputs for o in o2]
-@pytest.mark.skip_global_cleanup
-@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
-def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
-                                                    prompt_token_ids):
-    sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
-    with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
-        v1_output = llm.generate(prompt_token_ids=prompt_token_ids,
-                                 sampling_params=sampling_params)
-    v2_output = llm.generate({"prompt_token_ids": prompt_token_ids},
-                             sampling_params=sampling_params)
-    assert_outputs_equal(v1_output, v2_output)
-@pytest.mark.skip_global_cleanup
-def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
-    sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
-    with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
-        v1_output = llm.generate(prompt_token_ids=TOKEN_IDS,
-                                 sampling_params=sampling_params)
-    v2_output = llm.generate(
-        [{
-            "prompt_token_ids": p
-        } for p in TOKEN_IDS],
-        sampling_params=sampling_params,
-    )
-    assert_outputs_equal(v1_output, v2_output)
 @pytest.mark.skip_global_cleanup
 def test_multiple_sampling_params(llm: LLM):
    sampling_params = [

--- a/tests/entrypoints/llm/test_generate_multiple_loras.py
+++ b/tests/entrypoints/llm/test_generate_multiple_loras.py
@@ -48,7 +48,6 @@ def llm(request, monkeypatch_module):
              max_num_seqs=128,
              enforce_eager=True)
-    with llm.deprecate_legacy_api():
    yield weakref.proxy(llm)
    del llm

--- a/tests/entrypoints/llm/test_reward.py
+++ b/tests/entrypoints/llm/test_reward.py
@@ -36,7 +36,6 @@ def llm():
              trust_remote_code=True,
              seed=0)
-    with llm.deprecate_legacy_api():
    yield weakref.proxy(llm)
    del llm

--- a/tests/entrypoints/llm/test_score.py
+++ b/tests/entrypoints/llm/test_score.py
@@ -33,7 +33,6 @@ def llm():
              enforce_eager=True,
              seed=0)
-    with llm.deprecate_legacy_api():
    yield weakref.proxy(llm)
    del llm