Unverified commit 1e4ecca1, authored by Cyrus Leung and committed by GitHub
Browse files

[V0 Deprecation] Remove `VLLM_USE_V1` from tests (#26341)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent c0a7b89d
...@@ -103,7 +103,6 @@ def test_guided_decoding_deprecated(): ...@@ -103,7 +103,6 @@ def test_guided_decoding_deprecated():
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE, PARAMS_MODELS_BACKENDS_TOKENIZER_MODE,
) )
def test_structured_output( def test_structured_output(
monkeypatch: pytest.MonkeyPatch,
sample_json_schema: dict[str, Any], sample_json_schema: dict[str, Any],
unsupported_json_schema: dict[str, Any], unsupported_json_schema: dict[str, Any],
sample_sql_ebnf: str, sample_sql_ebnf: str,
...@@ -115,8 +114,6 @@ def test_structured_output( ...@@ -115,8 +114,6 @@ def test_structured_output(
model_name: str, model_name: str,
speculative_config: dict[str, Any], speculative_config: dict[str, Any],
): ):
monkeypatch.setenv("VLLM_USE_V1", "1")
if current_platform.is_tpu() and speculative_config: if current_platform.is_tpu() and speculative_config:
pytest.skip("TPU does not support speculative decoding") pytest.skip("TPU does not support speculative decoding")
...@@ -620,15 +617,12 @@ Make the response as short as possible. ...@@ -620,15 +617,12 @@ Make the response as short as possible.
], ],
) )
def test_structured_output_with_reasoning_matrices( def test_structured_output_with_reasoning_matrices(
monkeypatch: pytest.MonkeyPatch,
backend: str, backend: str,
tokenizer_mode: TokenizerMode, tokenizer_mode: TokenizerMode,
reasoning_parser: str, reasoning_parser: str,
model_name: str, model_name: str,
speculative_config: dict[str, Any] | None, speculative_config: dict[str, Any] | None,
): ):
monkeypatch.setenv("VLLM_USE_V1", "1")
if current_platform.is_tpu() and speculative_config: if current_platform.is_tpu() and speculative_config:
pytest.skip("TPU does not support speculative decoding") pytest.skip("TPU does not support speculative decoding")
...@@ -691,13 +685,10 @@ def test_structured_output_with_reasoning_matrices( ...@@ -691,13 +685,10 @@ def test_structured_output_with_reasoning_matrices(
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE) @pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE)
def test_structured_output_auto_mode( def test_structured_output_auto_mode(
monkeypatch: pytest.MonkeyPatch,
unsupported_json_schema: dict[str, Any], unsupported_json_schema: dict[str, Any],
model_name: str, model_name: str,
tokenizer_mode: str, tokenizer_mode: str,
): ):
monkeypatch.setenv("VLLM_USE_V1", "1")
llm = LLM( llm = LLM(
model=model_name, model=model_name,
max_model_len=1024, max_model_len=1024,
...@@ -739,9 +730,7 @@ def test_structured_output_auto_mode( ...@@ -739,9 +730,7 @@ def test_structured_output_auto_mode(
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch): def test_guidance_no_additional_properties():
monkeypatch.setenv("VLLM_USE_V1", "1")
llm = LLM( llm = LLM(
model="Qwen/Qwen2.5-1.5B-Instruct", model="Qwen/Qwen2.5-1.5B-Instruct",
max_model_len=1024, max_model_len=1024,
...@@ -801,12 +790,9 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch): ...@@ -801,12 +790,9 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
@pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"]) @pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"])
def test_structured_output_batched_with_non_structured_outputs_requests( def test_structured_output_batched_with_non_structured_outputs_requests(
monkeypatch: pytest.MonkeyPatch,
sample_json_schema: dict[str, Any], sample_json_schema: dict[str, Any],
backend: str, backend: str,
): ):
monkeypatch.setenv("VLLM_USE_V1", "1")
# Don't use eager execution on TPUs because we want to test for no # Don't use eager execution on TPUs because we want to test for no
# recompilation at runtime # recompilation at runtime
enforce_eager = bool(not current_platform.is_tpu()) enforce_eager = bool(not current_platform.is_tpu())
......
...@@ -53,7 +53,6 @@ cleanup() { ...@@ -53,7 +53,6 @@ cleanup() {
launch_baseline() { launch_baseline() {
BASELINE_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME}; BASELINE_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME};
VLLM_LOGGING_LEVEL=DEBUG \ VLLM_LOGGING_LEVEL=DEBUG \
VLLM_USE_V1=1 \
PJRT_DEVICE=TPU \ PJRT_DEVICE=TPU \
VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \ VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
...@@ -73,7 +72,6 @@ launch_pd() { ...@@ -73,7 +72,6 @@ launch_pd() {
UCX_TLS=tcp \ UCX_TLS=tcp \
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
VLLM_LOGGING_LEVEL=DEBUG \ VLLM_LOGGING_LEVEL=DEBUG \
VLLM_USE_V1=1 \
VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \ VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \
VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \ VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \
PJRT_DEVICE=TPU \ PJRT_DEVICE=TPU \
...@@ -93,7 +91,6 @@ launch_pd() { ...@@ -93,7 +91,6 @@ launch_pd() {
UCX_TLS=tcp \ UCX_TLS=tcp \
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
VLLM_LOGGING_LEVEL=DEBUG \ VLLM_LOGGING_LEVEL=DEBUG \
VLLM_USE_V1=1 \
PJRT_DEVICE=TPU \ PJRT_DEVICE=TPU \
VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \ VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
......
...@@ -55,7 +55,6 @@ launch_pd() { ...@@ -55,7 +55,6 @@ launch_pd() {
UCX_TLS=tcp \ UCX_TLS=tcp \
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
VLLM_LOGGING_LEVEL=DEBUG \ VLLM_LOGGING_LEVEL=DEBUG \
VLLM_USE_V1=1 \
VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \ VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \
VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \ VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \
PJRT_DEVICE=TPU \ PJRT_DEVICE=TPU \
...@@ -75,7 +74,6 @@ launch_pd() { ...@@ -75,7 +74,6 @@ launch_pd() {
UCX_TLS=tcp \ UCX_TLS=tcp \
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
VLLM_LOGGING_LEVEL=DEBUG \ VLLM_LOGGING_LEVEL=DEBUG \
VLLM_USE_V1=1 \
PJRT_DEVICE=TPU \ PJRT_DEVICE=TPU \
VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \ VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest import pytest
import ray import ray
...@@ -10,15 +9,6 @@ from vllm.sampling_params import SamplingParams ...@@ -10,15 +9,6 @@ from vllm.sampling_params import SamplingParams
from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
from vllm.v1.metrics.ray_wrappers import RayPrometheusMetric, RayPrometheusStatLogger from vllm.v1.metrics.ray_wrappers import RayPrometheusMetric, RayPrometheusStatLogger
@pytest.fixture(scope="function", autouse=True)
def use_v1_only(monkeypatch):
"""
The change relies on V1 APIs, so set VLLM_USE_V1=1.
"""
monkeypatch.setenv("VLLM_USE_V1", "1")
MODELS = [ MODELS = [
"distilbert/distilgpt2", "distilbert/distilgpt2",
] ]
...@@ -39,10 +29,6 @@ def test_engine_log_metrics_ray( ...@@ -39,10 +29,6 @@ def test_engine_log_metrics_ray(
@ray.remote(num_gpus=1) @ray.remote(num_gpus=1)
class EngineTestActor: class EngineTestActor:
async def run(self): async def run(self):
# Set environment variable inside the Ray actor since environment
# variables from pytest fixtures don't propagate to Ray actors
os.environ["VLLM_USE_V1"] = "1"
engine_args = AsyncEngineArgs( engine_args = AsyncEngineArgs(
model=model, dtype=dtype, disable_log_stats=False, enforce_eager=True model=model, dtype=dtype, disable_log_stats=False, enforce_eager=True
) )
......
...@@ -280,7 +280,6 @@ def test_get_logprobs_and_prompt_logprobs( ...@@ -280,7 +280,6 @@ def test_get_logprobs_and_prompt_logprobs(
batch_logprobs_composition: BatchLogprobsComposition, batch_logprobs_composition: BatchLogprobsComposition,
temperature: float, temperature: float,
example_prompts: list[str], example_prompts: list[str],
monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
"""Test V1 Engine logprobs & prompt logprobs """Test V1 Engine logprobs & prompt logprobs
...@@ -308,220 +307,204 @@ def test_get_logprobs_and_prompt_logprobs( ...@@ -308,220 +307,204 @@ def test_get_logprobs_and_prompt_logprobs(
temperature: "temperature" sampling parameter temperature: "temperature" sampling parameter
example_prompts: example prompt fixture example_prompts: example prompt fixture
""" """
with monkeypatch.context() as m: do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching
m.setenv("VLLM_USE_V1", "1") if do_apc and (temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT):
do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching # Skip some test-cases to save time.
if do_apc and ( pytest.skip()
temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT test_prompts = example_prompts
):
# Skip some test-cases to save time. max_tokens = 5
pytest.skip() hf_outputs = hf_model.generate_greedy(
test_prompts = example_prompts test_prompts,
max_tokens=max_tokens,
max_tokens = 5 )
hf_outputs = hf_model.generate_greedy( hf_logprobs = hf_model.generate_greedy_logprobs(
test_prompts, test_prompts,
max_tokens=max_tokens,
)
# Batch has mixed sample params
# (different logprobs/prompt logprobs combos)
logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition)
# Ensure that each test prompt has a logprob config for testing
logprob_prompt_logprob_list = _repeat_logprob_config(
test_prompts, logprob_prompt_logprob_list
)
# Generate SamplingParams
vllm_sampling_params = [
SamplingParams(
max_tokens=max_tokens, max_tokens=max_tokens,
logprobs=num_lp,
prompt_logprobs=num_plp,
temperature=temperature,
seed=1984,
) )
hf_logprobs = hf_model.generate_greedy_logprobs( for num_lp, num_plp in logprob_prompt_logprob_list
test_prompts, ]
for _ in range(2 if do_apc else 1):
_run_and_validate(
vllm_model=vllm_model,
test_prompts=test_prompts,
vllm_sampling_params=vllm_sampling_params,
hf_logprobs=hf_logprobs,
hf_outputs=hf_outputs,
logprob_prompt_logprob_list=logprob_prompt_logprob_list,
temperature=temperature,
max_tokens=max_tokens, max_tokens=max_tokens,
do_apc=do_apc,
) )
# Batch has mixed sample params
# (different logprobs/prompt logprobs combos)
logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition)
# Ensure that each test prompt has a logprob config for testing def test_max_logprobs():
logprob_prompt_logprob_list = _repeat_logprob_config(
test_prompts, logprob_prompt_logprob_list
)
# Generate SamplingParams
vllm_sampling_params = [
SamplingParams(
max_tokens=max_tokens,
logprobs=num_lp,
prompt_logprobs=num_plp,
temperature=temperature,
seed=1984,
)
for num_lp, num_plp in logprob_prompt_logprob_list
]
for _ in range(2 if do_apc else 1):
_run_and_validate(
vllm_model=vllm_model,
test_prompts=test_prompts,
vllm_sampling_params=vllm_sampling_params,
hf_logprobs=hf_logprobs,
hf_outputs=hf_outputs,
logprob_prompt_logprob_list=logprob_prompt_logprob_list,
temperature=temperature,
max_tokens=max_tokens,
do_apc=do_apc,
)
def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
"""vLLM v1 engine should fail a request with `logprobs > max_logprobs` """vLLM v1 engine should fail a request with `logprobs > max_logprobs`
Should also fail for `prompt_logprobs > max_logprobs` Should also fail for `prompt_logprobs > max_logprobs`
APC should not matter as this test checks basic request validation. APC should not matter as this test checks basic request validation.
""" """
with monkeypatch.context() as m: runner = VllmRunner(
m.setenv("VLLM_USE_V1", "1") "facebook/opt-125m",
max_logprobs=1,
runner = VllmRunner( enable_prefix_caching=False,
"facebook/opt-125m", # 2 other llms alive during whole session
max_logprobs=1, gpu_memory_utilization=0.15,
enable_prefix_caching=False, max_model_len=256,
# 2 other llms alive during whole session )
gpu_memory_utilization=0.15, vllm_sampling_params = SamplingParams(logprobs=1)
max_model_len=256, # should pass
) runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
vllm_sampling_params = SamplingParams(logprobs=1)
# should pass
runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
bad_sampling_params = SamplingParams(logprobs=2) bad_sampling_params = SamplingParams(logprobs=2)
with pytest.raises(ValueError): with pytest.raises(ValueError):
runner.generate(["Hello world"], sampling_params=bad_sampling_params) runner.generate(["Hello world"], sampling_params=bad_sampling_params)
def test_none_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPatch): def test_none_logprobs(vllm_model, example_prompts):
"""Engine should return `logprobs` and `prompt_logprobs` as `None` """Engine should return `logprobs` and `prompt_logprobs` as `None`
Args: Args:
vllm_model: vLLM model fixture vllm_model: vLLM model fixture
example_prompts: list of example prompts (test fixture) example_prompts: list of example prompts (test fixture)
""" """
with monkeypatch.context() as m: max_tokens = 5
m.setenv("VLLM_USE_V1", "1")
max_tokens = 5
sampling_params_logprobs_none = SamplingParams( sampling_params_logprobs_none = SamplingParams(
max_tokens=max_tokens, max_tokens=max_tokens,
logprobs=None, logprobs=None,
prompt_logprobs=None, prompt_logprobs=None,
temperature=0.0, temperature=0.0,
) )
results_logprobs_none = vllm_model.llm.generate( results_logprobs_none = vllm_model.llm.generate(
example_prompts, example_prompts,
sampling_params=sampling_params_logprobs_none, sampling_params=sampling_params_logprobs_none,
) )
for i in range(len(results_logprobs_none)): for i in range(len(results_logprobs_none)):
# Check sample logprobs are None # Check sample logprobs are None
assert results_logprobs_none[i].outputs[0].logprobs is None assert results_logprobs_none[i].outputs[0].logprobs is None
assert results_logprobs_none[i].outputs[0].cumulative_logprob is None assert results_logprobs_none[i].outputs[0].cumulative_logprob is None
# Check prompt logprobs are None # Check prompt logprobs are None
assert results_logprobs_none[i].prompt_logprobs is None assert results_logprobs_none[i].prompt_logprobs is None
def test_zero_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPatch): def test_zero_logprobs(vllm_model, example_prompts):
"""Engine should return sampled token and prompt token logprobs """Engine should return sampled token and prompt token logprobs
Args: Args:
vllm_model: vLLM model fixture vllm_model: vLLM model fixture
example_prompts: list of example prompts (test fixture) example_prompts: list of example prompts (test fixture)
""" """
with monkeypatch.context() as m: max_tokens = 5
m.setenv("VLLM_USE_V1", "1")
max_tokens = 5
sampling_params_logprobs_zero = SamplingParams( sampling_params_logprobs_zero = SamplingParams(
max_tokens=max_tokens, logprobs=0, prompt_logprobs=0, temperature=0.0 max_tokens=max_tokens, logprobs=0, prompt_logprobs=0, temperature=0.0
) )
results_logprobs_zero = vllm_model.llm.generate( results_logprobs_zero = vllm_model.llm.generate(
example_prompts, sampling_params=sampling_params_logprobs_zero example_prompts, sampling_params=sampling_params_logprobs_zero
) )
for i in range(len(results_logprobs_zero)): for i in range(len(results_logprobs_zero)):
# Check that there is one sample logprob dict for each # Check that there is one sample logprob dict for each
# sample token # sample token
logprobs = results_logprobs_zero[i].outputs[0].logprobs logprobs = results_logprobs_zero[i].outputs[0].logprobs
prompt_logprobs = results_logprobs_zero[i].prompt_logprobs prompt_logprobs = results_logprobs_zero[i].prompt_logprobs
sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids
prompt_token_ids = results_logprobs_zero[i].prompt_token_ids prompt_token_ids = results_logprobs_zero[i].prompt_token_ids
assert logprobs is not None assert logprobs is not None
assert len(sampled_token_ids) == len(logprobs) assert len(sampled_token_ids) == len(logprobs)
assert results_logprobs_zero[i].outputs[0].cumulative_logprob is not None assert results_logprobs_zero[i].outputs[0].cumulative_logprob is not None
# Check that there is one prompt logprob dict for each # Check that there is one prompt logprob dict for each
# prompt token # prompt token
assert prompt_logprobs is not None assert prompt_logprobs is not None
assert len(prompt_token_ids) == len(prompt_logprobs) assert len(prompt_token_ids) == len(prompt_logprobs)
def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch): def test_all_logprobs(example_prompts):
"""Engine should return all vocabulary logprobs and prompt logprobs """Engine should return all vocabulary logprobs and prompt logprobs
Args: Args:
example_prompts: list of example prompts (test fixture) example_prompts: list of example prompts (test fixture)
""" """
with monkeypatch.context() as m: runner = VllmRunner(
m.setenv("VLLM_USE_V1", "1") "facebook/opt-125m",
runner = VllmRunner( max_logprobs=-1,
"facebook/opt-125m", enable_prefix_caching=False,
max_logprobs=-1, # 2 other llms alive during whole session
enable_prefix_caching=False, gpu_memory_utilization=0.15,
# 2 other llms alive during whole session max_model_len=256,
gpu_memory_utilization=0.15, )
max_model_len=256,
)
sampling_params_logprobs_all = SamplingParams( sampling_params_logprobs_all = SamplingParams(
max_tokens=5, logprobs=-1, prompt_logprobs=-1 max_tokens=5, logprobs=-1, prompt_logprobs=-1
) )
results_logprobs_all = runner.llm.generate( results_logprobs_all = runner.llm.generate(
example_prompts, sampling_params=sampling_params_logprobs_all example_prompts, sampling_params=sampling_params_logprobs_all
) )
vocab_size = runner.llm.llm_engine.get_model_config().get_vocab_size() vocab_size = runner.llm.llm_engine.get_model_config().get_vocab_size()
for i in range(len(results_logprobs_all)): for i in range(len(results_logprobs_all)):
logprobs = results_logprobs_all[i].outputs[0].logprobs logprobs = results_logprobs_all[i].outputs[0].logprobs
prompt_logprobs = results_logprobs_all[i].prompt_logprobs prompt_logprobs = results_logprobs_all[i].prompt_logprobs
assert logprobs is not None assert logprobs is not None
for logprob in logprobs: for logprob in logprobs:
assert len(logprob) == vocab_size assert len(logprob) == vocab_size
assert prompt_logprobs is not None assert prompt_logprobs is not None
assert prompt_logprobs[0] is None assert prompt_logprobs[0] is None
for prompt_logprob in prompt_logprobs[1:]: for prompt_logprob in prompt_logprobs[1:]:
assert len(prompt_logprob) == vocab_size assert len(prompt_logprob) == vocab_size
@pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode)) @pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode))
def test_logprobs_mode(logprobs_mode: LogprobsMode, monkeypatch: pytest.MonkeyPatch): def test_logprobs_mode(logprobs_mode: LogprobsMode):
"""Test with LLM engine with different logprobs_mode. """Test with LLM engine with different logprobs_mode.
For logprobs, we should have non-positive values. For logprobs, we should have non-positive values.
For logits, we should expect at least one positive values. For logits, we should expect at least one positive values.
""" """
from vllm import LLM from vllm import LLM
with monkeypatch.context() as m: llm = LLM(
m.setenv("VLLM_USE_V1", "1") "facebook/opt-125m",
max_logprobs=5,
llm = LLM( enable_prefix_caching=False,
"facebook/opt-125m", # 2 other llms alive during whole session
max_logprobs=5, gpu_memory_utilization=0.05,
enable_prefix_caching=False, max_model_len=16,
# 2 other llms alive during whole session logprobs_mode=logprobs_mode,
gpu_memory_utilization=0.05, )
max_model_len=16, vllm_sampling_params = SamplingParams(logprobs=1)
logprobs_mode=logprobs_mode, results = llm.generate(["Hello world"], sampling_params=vllm_sampling_params)
)
vllm_sampling_params = SamplingParams(logprobs=1) total_token_with_logprobs = 0
results = llm.generate(["Hello world"], sampling_params=vllm_sampling_params) positive_values = 0
for output in results[0].outputs:
total_token_with_logprobs = 0 for logprobs in output.logprobs:
positive_values = 0 for token_id in logprobs:
for output in results[0].outputs: logprob = logprobs[token_id]
for logprobs in output.logprobs: if logprobs_mode in ("raw_logprobs", "processed_logprobs"):
for token_id in logprobs: assert logprob.logprob <= 0
logprob = logprobs[token_id] if logprob.logprob > 0:
if logprobs_mode in ("raw_logprobs", "processed_logprobs"): positive_values = positive_values + 1
assert logprob.logprob <= 0 total_token_with_logprobs = total_token_with_logprobs + 1
if logprob.logprob > 0: assert total_token_with_logprobs >= len(results[0].outputs)
positive_values = positive_values + 1 if logprobs_mode in ("raw_logits", "processed_logits"):
total_token_with_logprobs = total_token_with_logprobs + 1 assert positive_values > 0
assert total_token_with_logprobs >= len(results[0].outputs) del llm
if logprobs_mode in ("raw_logits", "processed_logits"):
assert positive_values > 0
del llm
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest import pytest
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
if os.getenv("VLLM_USE_V1", "0") != "1":
pytest.skip("Test package requires V1", allow_module_level=True)
MODEL = "meta-llama/Llama-3.2-1B" MODEL = "meta-llama/Llama-3.2-1B"
PROMPT = "Hello my name is Robert and I" PROMPT = "Hello my name is Robert and I"
...@@ -173,14 +169,6 @@ def test_allowed_token_ids(llm): ...@@ -173,14 +169,6 @@ def test_allowed_token_ids(llm):
_ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[10000000])) _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[10000000]))
def test_priority(llm):
"""Check that we reject requests with priority."""
# Reject all allowed token ids
with pytest.raises(ValueError):
_ = llm.generate(PROMPT, priority=[1])
def test_seed(llm): def test_seed(llm):
"""Check that seed impacts randomness.""" """Check that seed impacts randomness."""
......
...@@ -38,7 +38,6 @@ def test_eagle_max_len( ...@@ -38,7 +38,6 @@ def test_eagle_max_len(
monkeypatch: pytest.MonkeyPatch, num_speculative_tokens: int, attn_backend: str monkeypatch: pytest.MonkeyPatch, num_speculative_tokens: int, attn_backend: str
): ):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm(): if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
......
...@@ -42,7 +42,6 @@ MAX_NUM_REQS = [16, 1024] ...@@ -42,7 +42,6 @@ MAX_NUM_REQS = [16, 1024]
@pytest.mark.parametrize("max_num_seqs", MAX_NUM_REQS) @pytest.mark.parametrize("max_num_seqs", MAX_NUM_REQS)
def test_basic( def test_basic(
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
model: str, model: str,
max_tokens: int, max_tokens: int,
tensor_parallel_size: int, tensor_parallel_size: int,
...@@ -55,23 +54,20 @@ def test_basic( ...@@ -55,23 +54,20 @@ def test_basic(
) )
example_prompts = [prompt] example_prompts = [prompt]
with monkeypatch.context() as m: with vllm_runner(
m.setenv("VLLM_USE_V1", "1") model,
# Note: max_num_batched_tokens == 1024 is needed here to
# actually test chunked prompt
max_num_batched_tokens=1024,
max_model_len=8192,
gpu_memory_utilization=0.7,
max_num_seqs=max_num_seqs,
tensor_parallel_size=tensor_parallel_size,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
output = vllm_outputs[0][1]
with vllm_runner( assert "1024" in output or "0, 1" in output
model,
# Note: max_num_batched_tokens == 1024 is needed here to
# actually test chunked prompt
max_num_batched_tokens=1024,
max_model_len=8192,
gpu_memory_utilization=0.7,
max_num_seqs=max_num_seqs,
tensor_parallel_size=tensor_parallel_size,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
output = vllm_outputs[0][1]
assert "1024" in output or "0, 1" in output
@pytest.mark.skip(reason="Temporarily disabled due to timeout") @pytest.mark.skip(reason="Temporarily disabled due to timeout")
...@@ -82,7 +78,6 @@ def test_basic( ...@@ -82,7 +78,6 @@ def test_basic(
@pytest.mark.parametrize("max_num_seqs", [16]) @pytest.mark.parametrize("max_num_seqs", [16])
def test_phi3( def test_phi3(
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
max_tokens: int, max_tokens: int,
max_num_seqs: int, max_num_seqs: int,
) -> None: ) -> None:
...@@ -99,18 +94,15 @@ def test_phi3( ...@@ -99,18 +94,15 @@ def test_phi3(
# test head dim = 96 # test head dim = 96
model = "microsoft/Phi-3-mini-128k-instruct" model = "microsoft/Phi-3-mini-128k-instruct"
with monkeypatch.context() as m: with vllm_runner(
m.setenv("VLLM_USE_V1", "1") model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs
) as vllm_model:
with vllm_runner( vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs # vllm_outputs is a list of tuples whose first element is the token id
) as vllm_model: # and the second element is the output (including the prompt).
vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens) for output, answer in zip(vllm_outputs, answers):
# vllm_outputs is a list of tuples whose first element is the token id generated_text = output[1]
# and the second element is the output (including the prompt). assert answer in generated_text
for output, answer in zip(vllm_outputs, answers):
generated_text = output[1]
assert answer in generated_text
TP_SIZE_8 = 8 TP_SIZE_8 = 8
...@@ -123,7 +115,6 @@ TP_SIZE_8 = 8 ...@@ -123,7 +115,6 @@ TP_SIZE_8 = 8
) )
def test_gemma3_27b_with_text_input_and_tp( def test_gemma3_27b_with_text_input_and_tp(
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
model = "google/gemma-3-27b-it" model = "google/gemma-3-27b-it"
max_tokens = 16 max_tokens = 16
...@@ -140,21 +131,18 @@ def test_gemma3_27b_with_text_input_and_tp( ...@@ -140,21 +131,18 @@ def test_gemma3_27b_with_text_input_and_tp(
" but in rising every time we fall.", " but in rising every time we fall.",
] ]
with monkeypatch.context() as m: with vllm_runner(
m.setenv("VLLM_USE_V1", "1") model,
max_num_batched_tokens=256,
with vllm_runner( max_num_seqs=max_num_seqs,
model, tensor_parallel_size=tensor_parallel_size,
max_num_batched_tokens=256, ) as vllm_model:
max_num_seqs=max_num_seqs, vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
tensor_parallel_size=tensor_parallel_size, # vllm_outputs is a list of tuples whose first element is the token id
) as vllm_model: # and the second element is the output (including the prompt).
vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens) for output, answer in zip(vllm_outputs, answers):
# vllm_outputs is a list of tuples whose first element is the token id generated_text = output[1]
# and the second element is the output (including the prompt). assert answer in generated_text
for output, answer in zip(vllm_outputs, answers):
generated_text = output[1]
assert answer in generated_text
@pytest.mark.skipif( @pytest.mark.skipif(
...@@ -162,7 +150,6 @@ def test_gemma3_27b_with_text_input_and_tp( ...@@ -162,7 +150,6 @@ def test_gemma3_27b_with_text_input_and_tp(
) )
def test_w8a8_quantization( def test_w8a8_quantization(
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
model = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8" model = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
max_tokens = 5 max_tokens = 5
...@@ -176,18 +163,15 @@ def test_w8a8_quantization( ...@@ -176,18 +163,15 @@ def test_w8a8_quantization(
) )
example_prompts = [prompt] example_prompts = [prompt]
with monkeypatch.context() as m: with vllm_runner(
m.setenv("VLLM_USE_V1", "1") model,
max_num_batched_tokens=64,
with vllm_runner( max_model_len=4096,
model, gpu_memory_utilization=0.7,
max_num_batched_tokens=64, max_num_seqs=max_num_seqs,
max_model_len=4096, tensor_parallel_size=tensor_parallel_size,
gpu_memory_utilization=0.7, ) as vllm_model:
max_num_seqs=max_num_seqs, vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
tensor_parallel_size=tensor_parallel_size, output = vllm_outputs[0][1]
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) assert "1024" in output or "0, 1" in output
output = vllm_outputs[0][1]
assert "1024" in output or "0, 1" in output
...@@ -86,7 +86,6 @@ GPU_UTIL = 0.9 ...@@ -86,7 +86,6 @@ GPU_UTIL = 0.9
@pytest.mark.parametrize("params", TEST_PARAMS) @pytest.mark.parametrize("params", TEST_PARAMS)
def test_perf( def test_perf(
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
params: TestParams, params: TestParams,
) -> None: ) -> None:
tokenizer = get_tokenizer( tokenizer = get_tokenizer(
...@@ -107,48 +106,45 @@ def test_perf( ...@@ -107,48 +106,45 @@ def test_perf(
) )
) )
with monkeypatch.context() as m: sampling_params = SamplingParams(
m.setenv("VLLM_USE_V1", "1") max_tokens=params.decode_len, temperature=1.0, min_p=0.0
)
sampling_params = SamplingParams( with vllm_runner(
max_tokens=params.decode_len, temperature=1.0, min_p=0.0 params.model,
max_num_batched_tokens=MAX_MODEL_LEN,
max_model_len=MAX_MODEL_LEN,
max_num_seqs=MAX_NUM_SEQS,
gpu_memory_utilization=GPU_UTIL,
enforce_eager=False,
tensor_parallel_size=1,
) as vllm_model:
print(" -- Warmup / Compile")
for i in range(NUM_WARMUPS):
_ = vllm_model.generate(prompts, sampling_params)
print(" -- Benchmarking... ")
times = []
for i in range(NUM_RUNS):
start_time = time.time()
_ = vllm_model.generate(prompts, sampling_params)
times.append(time.time() - start_time)
avg_time = sum(times) / len(times)
print(" -- avg_time = {}".format(avg_time))
print(
" -- expected_avg_time = {} with err_tol = {}".format(
params.expected_avg_time, params.err_tol
)
) )
diff = avg_time - params.expected_avg_time
with vllm_runner( ok = diff < params.err_tol
params.model, if diff < -params.err_tol:
max_num_batched_tokens=MAX_MODEL_LEN,
max_model_len=MAX_MODEL_LEN,
max_num_seqs=MAX_NUM_SEQS,
gpu_memory_utilization=GPU_UTIL,
enforce_eager=False,
tensor_parallel_size=1,
) as vllm_model:
print(" -- Warmup / Compile")
for i in range(NUM_WARMUPS):
_ = vllm_model.generate(prompts, sampling_params)
print(" -- Benchmarking... ")
times = []
for i in range(NUM_RUNS):
start_time = time.time()
_ = vllm_model.generate(prompts, sampling_params)
times.append(time.time() - start_time)
avg_time = sum(times) / len(times)
print(" -- avg_time = {}".format(avg_time))
print( print(
" -- expected_avg_time = {} with err_tol = {}".format( " !! WARNING !! Performance has improved by {}, "
params.expected_avg_time, params.err_tol "it may be necessary to fine-tune the "
) "expected_avg_time = {}".format(-diff, params.expected_avg_time)
) )
diff = avg_time - params.expected_avg_time
ok = diff < params.err_tol assert ok, " !! ERROR !! Regression detected"
if diff < -params.err_tol:
print(
" !! WARNING !! Performance has improved by {}, "
"it may be necessary to fine-tune the "
"expected_avg_time = {}".format(-diff, params.expected_avg_time)
)
assert ok, " !! ERROR !! Regression detected"
...@@ -82,7 +82,7 @@ def test_traces( ...@@ -82,7 +82,7 @@ def test_traces(
): ):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true") m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
m.setenv("VLLM_USE_V1", "1")
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=0.01, temperature=0.01,
top_p=0.1, top_p=0.1,
......
...@@ -77,7 +77,13 @@ class CPUModelRunner(GPUModelRunner): ...@@ -77,7 +77,13 @@ class CPUModelRunner(GPUModelRunner):
logger.info("Warming up model for the compilation...") logger.info("Warming up model for the compilation...")
# Only generate graph for the generic shape # Only generate graph for the generic shape
with _set_global_compilation_settings(self.vllm_config): with _set_global_compilation_settings(self.vllm_config):
self._dummy_run(max(16, self.max_num_reqs)) self._dummy_run(
min(
max(16, self.max_num_reqs),
self.scheduler_config.max_num_batched_tokens,
)
)
logger.info("Warming up done.") logger.info("Warming up done.")
def _init_device_properties(self) -> None: def _init_device_properties(self) -> None:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment