Unverified Commit 8896eb72 authored by Cyrus Leung's avatar Cyrus Leung Committed by GitHub
Browse files

[Deprecation] Remove `prompt_token_ids` arg fallback in `LLM.generate` and `LLM.embed` (#18800)


Signed-off-by: default avatarDarkLight1337 <tlleungac@connect.ust.hk>
parent 19fe1a05
......@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.4
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
usage() {
echo``
......
......@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.4
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
usage() {
echo``
......
......@@ -71,7 +71,7 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
RUN cd /vllm-workspace \
&& rm -rf vllm \
&& python3 -m pip install -e tests/vllm_test_utils \
&& python3 -m pip install lm-eval[api]==0.4.4 \
&& python3 -m pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] \
&& python3 -m pip install pytest-shard
# -----------------------
......
......@@ -79,7 +79,7 @@ Since simple RTN does not require data for weight quantization and the activatio
Install `vllm` and `lm-evaluation-harness` for evaluation:
```bash
pip install vllm lm-eval==0.4.4
pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
```
Load and run the model in `vllm`:
......
......@@ -18,7 +18,7 @@ pip install llmcompressor
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```bash
pip install vllm lm-eval==0.4.4
pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
```
## Quantization Process
......
......@@ -19,7 +19,7 @@ pip install llmcompressor
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```bash
pip install vllm lm-eval==0.4.4
pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
```
## Quantization Process
......
......@@ -20,7 +20,7 @@ for more installation details.
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```bash
pip install vllm lm-eval==0.4.4
pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
```
## Quantization Process
......
......@@ -5,6 +5,7 @@ from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.benchmarks.datasets import add_dataset_parser, get_samples
from vllm.inputs import TokensPrompt
from vllm.v1.metrics.reader import Counter, Vector
try:
......@@ -137,7 +138,8 @@ def main():
sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len)
if not args.custom_mm_prompts:
outputs = llm.generate(
prompt_token_ids=prompt_ids, sampling_params=sampling_params
TokensPrompt(prompt_token_ids=prompt_ids),
sampling_params=sampling_params,
)
else:
outputs = llm.chat(prompts, sampling_params=sampling_params)
......
......@@ -85,7 +85,7 @@ def format_output(title: str, output: str):
def generate_output(prompt: str, sampling_params: SamplingParams, llm: LLM):
outputs = llm.generate(prompts=prompt, sampling_params=sampling_params)
outputs = llm.generate(prompt, sampling_params=sampling_params)
return outputs[0].outputs[0].text
......
......@@ -27,7 +27,7 @@ mistral_common[image,audio] >= 1.8.2 # required for voxtral test
num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.8 # required for model evaluation test
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
mteb>=1.38.11, <2 # required for mteb test
transformers==4.52.4
tokenizers==0.21.1
......
......@@ -32,7 +32,8 @@ num2words # required for smolvlm test
open_clip_torch==2.32.0 # Required for nemotron_vl test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.8 # required for model evaluation test
# TODO: Use lm-eval[api]==0.4.10 once released
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
mteb[bm25s]>=1.38.11, <2 # required for mteb test
transformers==4.55.2
tokenizers==0.21.1
......
......@@ -408,7 +408,7 @@ lightning-utilities==0.14.3
# torchmetrics
llvmlite==0.44.0
# via numba
lm-eval==0.4.8
lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
# via -r requirements/test.in
lxml==5.3.0
# via
......
......@@ -18,10 +18,9 @@ def text_llm():
enforce_eager=True,
seed=0)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)
del llm
del llm
cleanup_dist_env_and_memory()
......@@ -88,10 +87,9 @@ def vision_llm():
seed=0,
)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)
del llm
del llm
cleanup_dist_env_and_memory()
......@@ -158,10 +156,9 @@ def thinking_llm():
seed=0,
)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)
del llm
del llm
cleanup_dist_env_and_memory()
......
......@@ -35,10 +35,9 @@ def llm():
enforce_eager=True,
seed=0)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)
del llm
del llm
cleanup_dist_env_and_memory()
......
......@@ -26,10 +26,9 @@ def llm():
enforce_eager=True,
seed=0)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)
del llm
del llm
cleanup_dist_env_and_memory()
......
......@@ -5,11 +5,9 @@ import weakref
import pytest
from vllm import LLM, PoolingParams, PoolingRequestOutput
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
from ...models.utils import check_embeddings_close
MODEL_NAME = "intfloat/multilingual-e5-small"
PROMPTS = [
......@@ -48,57 +46,13 @@ def llm():
enforce_eager=True,
seed=0)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)
del llm
del llm
cleanup_dist_env_and_memory()
def assert_outputs_match(o1: list[PoolingRequestOutput],
o2: list[PoolingRequestOutput]):
check_embeddings_close(
embeddings_0_lst=[o.outputs.data for o in o1],
embeddings_1_lst=[o.outputs.data for o in o2],
name_0="hf",
name_1="vllm",
tol=1e-2,
)
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
prompt_token_ids):
pooling_params = PoolingParams()
with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
v1_output = llm.encode(prompt_token_ids=prompt_token_ids,
pooling_params=pooling_params)
v2_output = llm.encode({"prompt_token_ids": prompt_token_ids},
pooling_params=pooling_params)
assert_outputs_match(v1_output, v2_output)
@pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
pooling_params = PoolingParams()
with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
v1_output = llm.encode(prompt_token_ids=TOKEN_IDS,
pooling_params=pooling_params)
v2_output = llm.encode(
[{
"prompt_token_ids": p
} for p in TOKEN_IDS],
pooling_params=pooling_params,
)
assert_outputs_match(v1_output, v2_output)
@pytest.mark.skip_global_cleanup
def test_multiple_pooling_params(llm: LLM):
pooling_params = [
......
......@@ -5,7 +5,7 @@ import weakref
import pytest
from vllm import LLM, RequestOutput, SamplingParams
from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory
MODEL_NAME = "distilbert/distilgpt2"
......@@ -41,50 +41,13 @@ def llm():
gpu_memory_utilization=0.10,
enforce_eager=True)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)
del llm
del llm
cleanup_dist_env_and_memory()
def assert_outputs_equal(o1: list[RequestOutput], o2: list[RequestOutput]):
assert [o.outputs for o in o1] == [o.outputs for o in o2]
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
prompt_token_ids):
sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
v1_output = llm.generate(prompt_token_ids=prompt_token_ids,
sampling_params=sampling_params)
v2_output = llm.generate({"prompt_token_ids": prompt_token_ids},
sampling_params=sampling_params)
assert_outputs_equal(v1_output, v2_output)
@pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
v1_output = llm.generate(prompt_token_ids=TOKEN_IDS,
sampling_params=sampling_params)
v2_output = llm.generate(
[{
"prompt_token_ids": p
} for p in TOKEN_IDS],
sampling_params=sampling_params,
)
assert_outputs_equal(v1_output, v2_output)
@pytest.mark.skip_global_cleanup
def test_multiple_sampling_params(llm: LLM):
sampling_params = [
......
......@@ -48,10 +48,9 @@ def llm(request, monkeypatch_module):
max_num_seqs=128,
enforce_eager=True)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)
del llm
del llm
cleanup_dist_env_and_memory()
......
......@@ -36,10 +36,9 @@ def llm():
trust_remote_code=True,
seed=0)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)
del llm
del llm
cleanup_dist_env_and_memory()
......
......@@ -33,10 +33,9 @@ def llm():
enforce_eager=True,
seed=0)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)
del llm
del llm
cleanup_dist_env_and_memory()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment