Unverified Commit 8896eb72 authored by Cyrus Leung, committed by GitHub
Browse files

[Deprecation] Remove `prompt_token_ids` arg fallback in `LLM.generate` and `LLM.encode` (#18800)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 19fe1a05
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers. # We can use this script to compute baseline accuracy on GSM for transformers.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.4 # pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
usage() { usage() {
echo`` echo``
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support. # We use this for fp8, which HF does not support.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.4 # pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
usage() { usage() {
echo`` echo``
......
...@@ -71,7 +71,7 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace ...@@ -71,7 +71,7 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
RUN cd /vllm-workspace \ RUN cd /vllm-workspace \
&& rm -rf vllm \ && rm -rf vllm \
&& python3 -m pip install -e tests/vllm_test_utils \ && python3 -m pip install -e tests/vllm_test_utils \
&& python3 -m pip install lm-eval[api]==0.4.4 \ && python3 -m pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] \
&& python3 -m pip install pytest-shard && python3 -m pip install pytest-shard
# ----------------------- # -----------------------
......
...@@ -79,7 +79,7 @@ Since simple RTN does not require data for weight quantization and the activatio ...@@ -79,7 +79,7 @@ Since simple RTN does not require data for weight quantization and the activatio
Install `vllm` and `lm-evaluation-harness` for evaluation: Install `vllm` and `lm-evaluation-harness` for evaluation:
```bash ```bash
pip install vllm lm-eval==0.4.4 pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
``` ```
Load and run the model in `vllm`: Load and run the model in `vllm`:
......
...@@ -18,7 +18,7 @@ pip install llmcompressor ...@@ -18,7 +18,7 @@ pip install llmcompressor
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```bash ```bash
pip install vllm lm-eval==0.4.4 pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
``` ```
## Quantization Process ## Quantization Process
......
...@@ -19,7 +19,7 @@ pip install llmcompressor ...@@ -19,7 +19,7 @@ pip install llmcompressor
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```bash ```bash
pip install vllm lm-eval==0.4.4 pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
``` ```
## Quantization Process ## Quantization Process
......
...@@ -20,7 +20,7 @@ for more installation details. ...@@ -20,7 +20,7 @@ for more installation details.
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```bash ```bash
pip install vllm lm-eval==0.4.4 pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
``` ```
## Quantization Process ## Quantization Process
......
...@@ -5,6 +5,7 @@ from transformers import AutoTokenizer ...@@ -5,6 +5,7 @@ from transformers import AutoTokenizer
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.benchmarks.datasets import add_dataset_parser, get_samples from vllm.benchmarks.datasets import add_dataset_parser, get_samples
from vllm.inputs import TokensPrompt
from vllm.v1.metrics.reader import Counter, Vector from vllm.v1.metrics.reader import Counter, Vector
try: try:
...@@ -137,7 +138,8 @@ def main(): ...@@ -137,7 +138,8 @@ def main():
sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len) sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len)
if not args.custom_mm_prompts: if not args.custom_mm_prompts:
outputs = llm.generate( outputs = llm.generate(
prompt_token_ids=prompt_ids, sampling_params=sampling_params TokensPrompt(prompt_token_ids=prompt_ids),
sampling_params=sampling_params,
) )
else: else:
outputs = llm.chat(prompts, sampling_params=sampling_params) outputs = llm.chat(prompts, sampling_params=sampling_params)
......
...@@ -85,7 +85,7 @@ def format_output(title: str, output: str): ...@@ -85,7 +85,7 @@ def format_output(title: str, output: str):
def generate_output(prompt: str, sampling_params: SamplingParams, llm: LLM): def generate_output(prompt: str, sampling_params: SamplingParams, llm: LLM):
outputs = llm.generate(prompts=prompt, sampling_params=sampling_params) outputs = llm.generate(prompt, sampling_params=sampling_params)
return outputs[0].outputs[0].text return outputs[0].outputs[0].text
......
...@@ -27,7 +27,7 @@ mistral_common[image,audio] >= 1.8.2 # required for voxtral test ...@@ -27,7 +27,7 @@ mistral_common[image,audio] >= 1.8.2 # required for voxtral test
num2words # required for smolvlm test num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.8 # required for model evaluation test lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
mteb>=1.38.11, <2 # required for mteb test mteb>=1.38.11, <2 # required for mteb test
transformers==4.52.4 transformers==4.52.4
tokenizers==0.21.1 tokenizers==0.21.1
......
...@@ -32,7 +32,8 @@ num2words # required for smolvlm test ...@@ -32,7 +32,8 @@ num2words # required for smolvlm test
open_clip_torch==2.32.0 # Required for nemotron_vl test open_clip_torch==2.32.0 # Required for nemotron_vl test
opencv-python-headless >= 4.11.0 # required for video test opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.8 # required for model evaluation test # TODO: Use lm-eval[api]==0.4.10 once released
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
mteb[bm25s]>=1.38.11, <2 # required for mteb test mteb[bm25s]>=1.38.11, <2 # required for mteb test
transformers==4.55.2 transformers==4.55.2
tokenizers==0.21.1 tokenizers==0.21.1
......
...@@ -408,7 +408,7 @@ lightning-utilities==0.14.3 ...@@ -408,7 +408,7 @@ lightning-utilities==0.14.3
# torchmetrics # torchmetrics
llvmlite==0.44.0 llvmlite==0.44.0
# via numba # via numba
lm-eval==0.4.8 lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
# via -r requirements/test.in # via -r requirements/test.in
lxml==5.3.0 lxml==5.3.0
# via # via
......
...@@ -18,7 +18,6 @@ def text_llm(): ...@@ -18,7 +18,6 @@ def text_llm():
enforce_eager=True, enforce_eager=True,
seed=0) seed=0)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm) yield weakref.proxy(llm)
del llm del llm
...@@ -88,7 +87,6 @@ def vision_llm(): ...@@ -88,7 +87,6 @@ def vision_llm():
seed=0, seed=0,
) )
with llm.deprecate_legacy_api():
yield weakref.proxy(llm) yield weakref.proxy(llm)
del llm del llm
...@@ -158,7 +156,6 @@ def thinking_llm(): ...@@ -158,7 +156,6 @@ def thinking_llm():
seed=0, seed=0,
) )
with llm.deprecate_legacy_api():
yield weakref.proxy(llm) yield weakref.proxy(llm)
del llm del llm
......
...@@ -35,7 +35,6 @@ def llm(): ...@@ -35,7 +35,6 @@ def llm():
enforce_eager=True, enforce_eager=True,
seed=0) seed=0)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm) yield weakref.proxy(llm)
del llm del llm
......
...@@ -26,7 +26,6 @@ def llm(): ...@@ -26,7 +26,6 @@ def llm():
enforce_eager=True, enforce_eager=True,
seed=0) seed=0)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm) yield weakref.proxy(llm)
del llm del llm
......
...@@ -5,11 +5,9 @@ import weakref ...@@ -5,11 +5,9 @@ import weakref
import pytest import pytest
from vllm import LLM, PoolingParams, PoolingRequestOutput from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory from vllm.distributed import cleanup_dist_env_and_memory
from ...models.utils import check_embeddings_close
MODEL_NAME = "intfloat/multilingual-e5-small" MODEL_NAME = "intfloat/multilingual-e5-small"
PROMPTS = [ PROMPTS = [
...@@ -48,7 +46,6 @@ def llm(): ...@@ -48,7 +46,6 @@ def llm():
enforce_eager=True, enforce_eager=True,
seed=0) seed=0)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm) yield weakref.proxy(llm)
del llm del llm
...@@ -56,49 +53,6 @@ def llm(): ...@@ -56,49 +53,6 @@ def llm():
cleanup_dist_env_and_memory() cleanup_dist_env_and_memory()
def assert_outputs_match(o1: list[PoolingRequestOutput],
o2: list[PoolingRequestOutput]):
check_embeddings_close(
embeddings_0_lst=[o.outputs.data for o in o1],
embeddings_1_lst=[o.outputs.data for o in o2],
name_0="hf",
name_1="vllm",
tol=1e-2,
)
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
prompt_token_ids):
pooling_params = PoolingParams()
with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
v1_output = llm.encode(prompt_token_ids=prompt_token_ids,
pooling_params=pooling_params)
v2_output = llm.encode({"prompt_token_ids": prompt_token_ids},
pooling_params=pooling_params)
assert_outputs_match(v1_output, v2_output)
@pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
pooling_params = PoolingParams()
with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
v1_output = llm.encode(prompt_token_ids=TOKEN_IDS,
pooling_params=pooling_params)
v2_output = llm.encode(
[{
"prompt_token_ids": p
} for p in TOKEN_IDS],
pooling_params=pooling_params,
)
assert_outputs_match(v1_output, v2_output)
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
def test_multiple_pooling_params(llm: LLM): def test_multiple_pooling_params(llm: LLM):
pooling_params = [ pooling_params = [
......
...@@ -5,7 +5,7 @@ import weakref ...@@ -5,7 +5,7 @@ import weakref
import pytest import pytest
from vllm import LLM, RequestOutput, SamplingParams from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory from vllm.distributed import cleanup_dist_env_and_memory
MODEL_NAME = "distilbert/distilgpt2" MODEL_NAME = "distilbert/distilgpt2"
...@@ -41,7 +41,6 @@ def llm(): ...@@ -41,7 +41,6 @@ def llm():
gpu_memory_utilization=0.10, gpu_memory_utilization=0.10,
enforce_eager=True) enforce_eager=True)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm) yield weakref.proxy(llm)
del llm del llm
...@@ -49,42 +48,6 @@ def llm(): ...@@ -49,42 +48,6 @@ def llm():
cleanup_dist_env_and_memory() cleanup_dist_env_and_memory()
def assert_outputs_equal(o1: list[RequestOutput], o2: list[RequestOutput]):
assert [o.outputs for o in o1] == [o.outputs for o in o2]
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
prompt_token_ids):
sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
v1_output = llm.generate(prompt_token_ids=prompt_token_ids,
sampling_params=sampling_params)
v2_output = llm.generate({"prompt_token_ids": prompt_token_ids},
sampling_params=sampling_params)
assert_outputs_equal(v1_output, v2_output)
@pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
v1_output = llm.generate(prompt_token_ids=TOKEN_IDS,
sampling_params=sampling_params)
v2_output = llm.generate(
[{
"prompt_token_ids": p
} for p in TOKEN_IDS],
sampling_params=sampling_params,
)
assert_outputs_equal(v1_output, v2_output)
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
def test_multiple_sampling_params(llm: LLM): def test_multiple_sampling_params(llm: LLM):
sampling_params = [ sampling_params = [
......
...@@ -48,7 +48,6 @@ def llm(request, monkeypatch_module): ...@@ -48,7 +48,6 @@ def llm(request, monkeypatch_module):
max_num_seqs=128, max_num_seqs=128,
enforce_eager=True) enforce_eager=True)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm) yield weakref.proxy(llm)
del llm del llm
......
...@@ -36,7 +36,6 @@ def llm(): ...@@ -36,7 +36,6 @@ def llm():
trust_remote_code=True, trust_remote_code=True,
seed=0) seed=0)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm) yield weakref.proxy(llm)
del llm del llm
......
...@@ -33,7 +33,6 @@ def llm(): ...@@ -33,7 +33,6 @@ def llm():
enforce_eager=True, enforce_eager=True,
seed=0) seed=0)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm) yield weakref.proxy(llm)
del llm del llm
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment