Unverified commit 1e4ecca1, authored by Cyrus Leung, committed by GitHub
Browse files

[V0 Deprecation] Remove `VLLM_USE_V1` from tests (#26341)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent c0a7b89d
...@@ -103,7 +103,6 @@ def test_guided_decoding_deprecated(): ...@@ -103,7 +103,6 @@ def test_guided_decoding_deprecated():
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE, PARAMS_MODELS_BACKENDS_TOKENIZER_MODE,
) )
def test_structured_output( def test_structured_output(
monkeypatch: pytest.MonkeyPatch,
sample_json_schema: dict[str, Any], sample_json_schema: dict[str, Any],
unsupported_json_schema: dict[str, Any], unsupported_json_schema: dict[str, Any],
sample_sql_ebnf: str, sample_sql_ebnf: str,
...@@ -115,8 +114,6 @@ def test_structured_output( ...@@ -115,8 +114,6 @@ def test_structured_output(
model_name: str, model_name: str,
speculative_config: dict[str, Any], speculative_config: dict[str, Any],
): ):
monkeypatch.setenv("VLLM_USE_V1", "1")
if current_platform.is_tpu() and speculative_config: if current_platform.is_tpu() and speculative_config:
pytest.skip("TPU does not support speculative decoding") pytest.skip("TPU does not support speculative decoding")
...@@ -620,15 +617,12 @@ Make the response as short as possible. ...@@ -620,15 +617,12 @@ Make the response as short as possible.
], ],
) )
def test_structured_output_with_reasoning_matrices( def test_structured_output_with_reasoning_matrices(
monkeypatch: pytest.MonkeyPatch,
backend: str, backend: str,
tokenizer_mode: TokenizerMode, tokenizer_mode: TokenizerMode,
reasoning_parser: str, reasoning_parser: str,
model_name: str, model_name: str,
speculative_config: dict[str, Any] | None, speculative_config: dict[str, Any] | None,
): ):
monkeypatch.setenv("VLLM_USE_V1", "1")
if current_platform.is_tpu() and speculative_config: if current_platform.is_tpu() and speculative_config:
pytest.skip("TPU does not support speculative decoding") pytest.skip("TPU does not support speculative decoding")
...@@ -691,13 +685,10 @@ def test_structured_output_with_reasoning_matrices( ...@@ -691,13 +685,10 @@ def test_structured_output_with_reasoning_matrices(
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE) @pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE)
def test_structured_output_auto_mode( def test_structured_output_auto_mode(
monkeypatch: pytest.MonkeyPatch,
unsupported_json_schema: dict[str, Any], unsupported_json_schema: dict[str, Any],
model_name: str, model_name: str,
tokenizer_mode: str, tokenizer_mode: str,
): ):
monkeypatch.setenv("VLLM_USE_V1", "1")
llm = LLM( llm = LLM(
model=model_name, model=model_name,
max_model_len=1024, max_model_len=1024,
...@@ -739,9 +730,7 @@ def test_structured_output_auto_mode( ...@@ -739,9 +730,7 @@ def test_structured_output_auto_mode(
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch): def test_guidance_no_additional_properties():
monkeypatch.setenv("VLLM_USE_V1", "1")
llm = LLM( llm = LLM(
model="Qwen/Qwen2.5-1.5B-Instruct", model="Qwen/Qwen2.5-1.5B-Instruct",
max_model_len=1024, max_model_len=1024,
...@@ -801,12 +790,9 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch): ...@@ -801,12 +790,9 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
@pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"]) @pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"])
def test_structured_output_batched_with_non_structured_outputs_requests( def test_structured_output_batched_with_non_structured_outputs_requests(
monkeypatch: pytest.MonkeyPatch,
sample_json_schema: dict[str, Any], sample_json_schema: dict[str, Any],
backend: str, backend: str,
): ):
monkeypatch.setenv("VLLM_USE_V1", "1")
# Don't use eager execution on TPUs because we want to test for no # Don't use eager execution on TPUs because we want to test for no
# recompilation at runtime # recompilation at runtime
enforce_eager = bool(not current_platform.is_tpu()) enforce_eager = bool(not current_platform.is_tpu())
......
...@@ -53,7 +53,6 @@ cleanup() { ...@@ -53,7 +53,6 @@ cleanup() {
launch_baseline() { launch_baseline() {
BASELINE_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME}; BASELINE_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME};
VLLM_LOGGING_LEVEL=DEBUG \ VLLM_LOGGING_LEVEL=DEBUG \
VLLM_USE_V1=1 \
PJRT_DEVICE=TPU \ PJRT_DEVICE=TPU \
VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \ VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
...@@ -73,7 +72,6 @@ launch_pd() { ...@@ -73,7 +72,6 @@ launch_pd() {
UCX_TLS=tcp \ UCX_TLS=tcp \
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
VLLM_LOGGING_LEVEL=DEBUG \ VLLM_LOGGING_LEVEL=DEBUG \
VLLM_USE_V1=1 \
VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \ VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \
VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \ VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \
PJRT_DEVICE=TPU \ PJRT_DEVICE=TPU \
...@@ -93,7 +91,6 @@ launch_pd() { ...@@ -93,7 +91,6 @@ launch_pd() {
UCX_TLS=tcp \ UCX_TLS=tcp \
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
VLLM_LOGGING_LEVEL=DEBUG \ VLLM_LOGGING_LEVEL=DEBUG \
VLLM_USE_V1=1 \
PJRT_DEVICE=TPU \ PJRT_DEVICE=TPU \
VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \ VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
......
...@@ -55,7 +55,6 @@ launch_pd() { ...@@ -55,7 +55,6 @@ launch_pd() {
UCX_TLS=tcp \ UCX_TLS=tcp \
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
VLLM_LOGGING_LEVEL=DEBUG \ VLLM_LOGGING_LEVEL=DEBUG \
VLLM_USE_V1=1 \
VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \ VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \
VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \ VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \
PJRT_DEVICE=TPU \ PJRT_DEVICE=TPU \
...@@ -75,7 +74,6 @@ launch_pd() { ...@@ -75,7 +74,6 @@ launch_pd() {
UCX_TLS=tcp \ UCX_TLS=tcp \
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
VLLM_LOGGING_LEVEL=DEBUG \ VLLM_LOGGING_LEVEL=DEBUG \
VLLM_USE_V1=1 \
PJRT_DEVICE=TPU \ PJRT_DEVICE=TPU \
VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \ VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest import pytest
import ray import ray
...@@ -10,15 +9,6 @@ from vllm.sampling_params import SamplingParams ...@@ -10,15 +9,6 @@ from vllm.sampling_params import SamplingParams
from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
from vllm.v1.metrics.ray_wrappers import RayPrometheusMetric, RayPrometheusStatLogger from vllm.v1.metrics.ray_wrappers import RayPrometheusMetric, RayPrometheusStatLogger
@pytest.fixture(scope="function", autouse=True)
def use_v1_only(monkeypatch):
"""
The change relies on V1 APIs, so set VLLM_USE_V1=1.
"""
monkeypatch.setenv("VLLM_USE_V1", "1")
MODELS = [ MODELS = [
"distilbert/distilgpt2", "distilbert/distilgpt2",
] ]
...@@ -39,10 +29,6 @@ def test_engine_log_metrics_ray( ...@@ -39,10 +29,6 @@ def test_engine_log_metrics_ray(
@ray.remote(num_gpus=1) @ray.remote(num_gpus=1)
class EngineTestActor: class EngineTestActor:
async def run(self): async def run(self):
# Set environment variable inside the Ray actor since environment
# variables from pytest fixtures don't propagate to Ray actors
os.environ["VLLM_USE_V1"] = "1"
engine_args = AsyncEngineArgs( engine_args = AsyncEngineArgs(
model=model, dtype=dtype, disable_log_stats=False, enforce_eager=True model=model, dtype=dtype, disable_log_stats=False, enforce_eager=True
) )
......
...@@ -280,7 +280,6 @@ def test_get_logprobs_and_prompt_logprobs( ...@@ -280,7 +280,6 @@ def test_get_logprobs_and_prompt_logprobs(
batch_logprobs_composition: BatchLogprobsComposition, batch_logprobs_composition: BatchLogprobsComposition,
temperature: float, temperature: float,
example_prompts: list[str], example_prompts: list[str],
monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
"""Test V1 Engine logprobs & prompt logprobs """Test V1 Engine logprobs & prompt logprobs
...@@ -308,12 +307,8 @@ def test_get_logprobs_and_prompt_logprobs( ...@@ -308,12 +307,8 @@ def test_get_logprobs_and_prompt_logprobs(
temperature: "temperature" sampling parameter temperature: "temperature" sampling parameter
example_prompts: example prompt fixture example_prompts: example prompt fixture
""" """
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching
if do_apc and ( if do_apc and (temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT):
temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT
):
# Skip some test-cases to save time. # Skip some test-cases to save time.
pytest.skip() pytest.skip()
test_prompts = example_prompts test_prompts = example_prompts
...@@ -361,14 +356,11 @@ def test_get_logprobs_and_prompt_logprobs( ...@@ -361,14 +356,11 @@ def test_get_logprobs_and_prompt_logprobs(
) )
def test_max_logprobs(monkeypatch: pytest.MonkeyPatch): def test_max_logprobs():
"""vLLM v1 engine should fail a request with `logprobs > max_logprobs` """vLLM v1 engine should fail a request with `logprobs > max_logprobs`
Should also fail for `prompt_logprobs > max_logprobs` Should also fail for `prompt_logprobs > max_logprobs`
APC should not matter as this test checks basic request validation. APC should not matter as this test checks basic request validation.
""" """
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
runner = VllmRunner( runner = VllmRunner(
"facebook/opt-125m", "facebook/opt-125m",
max_logprobs=1, max_logprobs=1,
...@@ -386,15 +378,13 @@ def test_max_logprobs(monkeypatch: pytest.MonkeyPatch): ...@@ -386,15 +378,13 @@ def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
runner.generate(["Hello world"], sampling_params=bad_sampling_params) runner.generate(["Hello world"], sampling_params=bad_sampling_params)
def test_none_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPatch): def test_none_logprobs(vllm_model, example_prompts):
"""Engine should return `logprobs` and `prompt_logprobs` as `None` """Engine should return `logprobs` and `prompt_logprobs` as `None`
Args: Args:
vllm_model: vLLM model fixture vllm_model: vLLM model fixture
example_prompts: list of example prompts (test fixture) example_prompts: list of example prompts (test fixture)
""" """
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
max_tokens = 5 max_tokens = 5
sampling_params_logprobs_none = SamplingParams( sampling_params_logprobs_none = SamplingParams(
...@@ -416,15 +406,13 @@ def test_none_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPa ...@@ -416,15 +406,13 @@ def test_none_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPa
assert results_logprobs_none[i].prompt_logprobs is None assert results_logprobs_none[i].prompt_logprobs is None
def test_zero_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPatch): def test_zero_logprobs(vllm_model, example_prompts):
"""Engine should return sampled token and prompt token logprobs """Engine should return sampled token and prompt token logprobs
Args: Args:
vllm_model: vLLM model fixture vllm_model: vLLM model fixture
example_prompts: list of example prompts (test fixture) example_prompts: list of example prompts (test fixture)
""" """
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
max_tokens = 5 max_tokens = 5
sampling_params_logprobs_zero = SamplingParams( sampling_params_logprobs_zero = SamplingParams(
...@@ -450,14 +438,12 @@ def test_zero_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPa ...@@ -450,14 +438,12 @@ def test_zero_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPa
assert len(prompt_token_ids) == len(prompt_logprobs) assert len(prompt_token_ids) == len(prompt_logprobs)
def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch): def test_all_logprobs(example_prompts):
"""Engine should return all vocabulary logprobs and prompt logprobs """Engine should return all vocabulary logprobs and prompt logprobs
Args: Args:
example_prompts: list of example prompts (test fixture) example_prompts: list of example prompts (test fixture)
""" """
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
runner = VllmRunner( runner = VllmRunner(
"facebook/opt-125m", "facebook/opt-125m",
max_logprobs=-1, max_logprobs=-1,
...@@ -488,16 +474,13 @@ def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch): ...@@ -488,16 +474,13 @@ def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch):
@pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode)) @pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode))
def test_logprobs_mode(logprobs_mode: LogprobsMode, monkeypatch: pytest.MonkeyPatch): def test_logprobs_mode(logprobs_mode: LogprobsMode):
"""Test with LLM engine with different logprobs_mode. """Test with LLM engine with different logprobs_mode.
For logprobs, we should have non-positive values. For logprobs, we should have non-positive values.
For logits, we should expect at least one positive values. For logits, we should expect at least one positive values.
""" """
from vllm import LLM from vllm import LLM
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
llm = LLM( llm = LLM(
"facebook/opt-125m", "facebook/opt-125m",
max_logprobs=5, max_logprobs=5,
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest import pytest
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
if os.getenv("VLLM_USE_V1", "0") != "1":
pytest.skip("Test package requires V1", allow_module_level=True)
MODEL = "meta-llama/Llama-3.2-1B" MODEL = "meta-llama/Llama-3.2-1B"
PROMPT = "Hello my name is Robert and I" PROMPT = "Hello my name is Robert and I"
...@@ -173,14 +169,6 @@ def test_allowed_token_ids(llm): ...@@ -173,14 +169,6 @@ def test_allowed_token_ids(llm):
_ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[10000000])) _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[10000000]))
def test_priority(llm):
"""Check that we reject requests with priority."""
# Reject all allowed token ids
with pytest.raises(ValueError):
_ = llm.generate(PROMPT, priority=[1])
def test_seed(llm): def test_seed(llm):
"""Check that seed impacts randomness.""" """Check that seed impacts randomness."""
......
...@@ -38,7 +38,6 @@ def test_eagle_max_len( ...@@ -38,7 +38,6 @@ def test_eagle_max_len(
monkeypatch: pytest.MonkeyPatch, num_speculative_tokens: int, attn_backend: str monkeypatch: pytest.MonkeyPatch, num_speculative_tokens: int, attn_backend: str
): ):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm(): if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
......
...@@ -42,7 +42,6 @@ MAX_NUM_REQS = [16, 1024] ...@@ -42,7 +42,6 @@ MAX_NUM_REQS = [16, 1024]
@pytest.mark.parametrize("max_num_seqs", MAX_NUM_REQS) @pytest.mark.parametrize("max_num_seqs", MAX_NUM_REQS)
def test_basic( def test_basic(
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
model: str, model: str,
max_tokens: int, max_tokens: int,
tensor_parallel_size: int, tensor_parallel_size: int,
...@@ -55,9 +54,6 @@ def test_basic( ...@@ -55,9 +54,6 @@ def test_basic(
) )
example_prompts = [prompt] example_prompts = [prompt]
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
with vllm_runner( with vllm_runner(
model, model,
# Note: max_num_batched_tokens == 1024 is needed here to # Note: max_num_batched_tokens == 1024 is needed here to
...@@ -82,7 +78,6 @@ def test_basic( ...@@ -82,7 +78,6 @@ def test_basic(
@pytest.mark.parametrize("max_num_seqs", [16]) @pytest.mark.parametrize("max_num_seqs", [16])
def test_phi3( def test_phi3(
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
max_tokens: int, max_tokens: int,
max_num_seqs: int, max_num_seqs: int,
) -> None: ) -> None:
...@@ -99,9 +94,6 @@ def test_phi3( ...@@ -99,9 +94,6 @@ def test_phi3(
# test head dim = 96 # test head dim = 96
model = "microsoft/Phi-3-mini-128k-instruct" model = "microsoft/Phi-3-mini-128k-instruct"
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
with vllm_runner( with vllm_runner(
model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs
) as vllm_model: ) as vllm_model:
...@@ -123,7 +115,6 @@ TP_SIZE_8 = 8 ...@@ -123,7 +115,6 @@ TP_SIZE_8 = 8
) )
def test_gemma3_27b_with_text_input_and_tp( def test_gemma3_27b_with_text_input_and_tp(
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
model = "google/gemma-3-27b-it" model = "google/gemma-3-27b-it"
max_tokens = 16 max_tokens = 16
...@@ -140,9 +131,6 @@ def test_gemma3_27b_with_text_input_and_tp( ...@@ -140,9 +131,6 @@ def test_gemma3_27b_with_text_input_and_tp(
" but in rising every time we fall.", " but in rising every time we fall.",
] ]
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
with vllm_runner( with vllm_runner(
model, model,
max_num_batched_tokens=256, max_num_batched_tokens=256,
...@@ -162,7 +150,6 @@ def test_gemma3_27b_with_text_input_and_tp( ...@@ -162,7 +150,6 @@ def test_gemma3_27b_with_text_input_and_tp(
) )
def test_w8a8_quantization( def test_w8a8_quantization(
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
model = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8" model = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
max_tokens = 5 max_tokens = 5
...@@ -176,9 +163,6 @@ def test_w8a8_quantization( ...@@ -176,9 +163,6 @@ def test_w8a8_quantization(
) )
example_prompts = [prompt] example_prompts = [prompt]
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
with vllm_runner( with vllm_runner(
model, model,
max_num_batched_tokens=64, max_num_batched_tokens=64,
......
...@@ -86,7 +86,6 @@ GPU_UTIL = 0.9 ...@@ -86,7 +86,6 @@ GPU_UTIL = 0.9
@pytest.mark.parametrize("params", TEST_PARAMS) @pytest.mark.parametrize("params", TEST_PARAMS)
def test_perf( def test_perf(
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
params: TestParams, params: TestParams,
) -> None: ) -> None:
tokenizer = get_tokenizer( tokenizer = get_tokenizer(
...@@ -107,9 +106,6 @@ def test_perf( ...@@ -107,9 +106,6 @@ def test_perf(
) )
) )
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
sampling_params = SamplingParams( sampling_params = SamplingParams(
max_tokens=params.decode_len, temperature=1.0, min_p=0.0 max_tokens=params.decode_len, temperature=1.0, min_p=0.0
) )
......
...@@ -82,7 +82,7 @@ def test_traces( ...@@ -82,7 +82,7 @@ def test_traces(
): ):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true") m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
m.setenv("VLLM_USE_V1", "1")
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=0.01, temperature=0.01,
top_p=0.1, top_p=0.1,
......
...@@ -77,7 +77,13 @@ class CPUModelRunner(GPUModelRunner): ...@@ -77,7 +77,13 @@ class CPUModelRunner(GPUModelRunner):
logger.info("Warming up model for the compilation...") logger.info("Warming up model for the compilation...")
# Only generate graph for the generic shape # Only generate graph for the generic shape
with _set_global_compilation_settings(self.vllm_config): with _set_global_compilation_settings(self.vllm_config):
self._dummy_run(max(16, self.max_num_reqs)) self._dummy_run(
min(
max(16, self.max_num_reqs),
self.scheduler_config.max_num_batched_tokens,
)
)
logger.info("Warming up done.") logger.info("Warming up done.")
def _init_device_properties(self) -> None: def _init_device_properties(self) -> None:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment