Unverified Commit d4d93db2 authored by Robert Shaw's avatar Robert Shaw Committed by GitHub
Browse files

[V1] V1 Enablement Oracle (#13726)


Signed-off-by: default avatarrshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: default avatarrshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: default avatarNicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: default avatarTyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: default avatarMichael Goin <michael@neuralmagic.com>
parent 8c0d15d5
...@@ -4,8 +4,8 @@ tasks: ...@@ -4,8 +4,8 @@ tasks:
- name: "gsm8k" - name: "gsm8k"
metrics: metrics:
- name: "exact_match,strict-match" - name: "exact_match,strict-match"
value: 0.233 value: 0.231
- name: "exact_match,flexible-extract" - name: "exact_match,flexible-extract"
value: 0.236 value: 0.22
limit: 1000 limit: 1000
num_fewshot: 5 num_fewshot: 5
...@@ -13,6 +13,7 @@ from pathlib import Path ...@@ -13,6 +13,7 @@ from pathlib import Path
import lm_eval import lm_eval
import numpy import numpy
import pytest
import yaml import yaml
RTOL = 0.05 RTOL = 0.05
...@@ -46,6 +47,10 @@ def test_lm_eval_correctness(): ...@@ -46,6 +47,10 @@ def test_lm_eval_correctness():
eval_config = yaml.safe_load( eval_config = yaml.safe_load(
Path(TEST_DATA_FILE).read_text(encoding="utf-8")) Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
if eval_config[
"model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform": #noqa: E501
pytest.skip("FBGEMM is currently failing on main.")
# Launch eval requests. # Launch eval requests.
results = launch_lm_eval(eval_config) results = launch_lm_eval(eval_config)
......
...@@ -117,10 +117,10 @@ steps: ...@@ -117,10 +117,10 @@ steps:
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/ - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
- pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/test_chat_utils.py
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Distributed Tests (4 GPUs) # 10min - label: Distributed Tests (4 GPUs) # 10min
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
...@@ -136,7 +136,7 @@ steps: ...@@ -136,7 +136,7 @@ steps:
- examples/offline_inference/rlhf_colocate.py - examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py - tests/examples/offline_inference/data_parallel.py
commands: commands:
- VLLM_USE_V1=1 python3 ../examples/offline_inference/data_parallel.py - python3 ../examples/offline_inference/data_parallel.py
- pytest -v -s distributed/test_utils.py - pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py - pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_pynccl.py
...@@ -197,16 +197,17 @@ steps: ...@@ -197,16 +197,17 @@ steps:
- tests/v1 - tests/v1
commands: commands:
# split the test to avoid interference # split the test to avoid interference
- VLLM_USE_V1=1 pytest -v -s v1/core - pytest -v -s v1/core
- VLLM_USE_V1=1 pytest -v -s v1/engine - pytest -v -s v1/engine
- VLLM_USE_V1=1 pytest -v -s v1/sample - pytest -v -s v1/sample
- VLLM_USE_V1=1 pytest -v -s v1/worker - pytest -v -s v1/worker
- VLLM_USE_V1=1 pytest -v -s v1/structured_output - pytest -v -s v1/structured_output
- VLLM_USE_V1=1 pytest -v -s v1/test_stats.py - pytest -v -s v1/test_stats.py
- VLLM_USE_V1=1 pytest -v -s v1/test_utils.py - pytest -v -s v1/test_utils.py
- pytest -v -s v1/test_oracle.py
# TODO: accuracy does not match, whether setting # TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100. # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- VLLM_USE_V1=1 pytest -v -s v1/e2e - pytest -v -s v1/e2e
# Integration test for streaming correctness (requires special branch). # Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
...@@ -226,12 +227,12 @@ steps: ...@@ -226,12 +227,12 @@ steps:
- python3 offline_inference/llm_engine_example.py - python3 offline_inference/llm_engine_example.py
- python3 offline_inference/vision_language.py - python3 offline_inference/vision_language.py
- python3 offline_inference/vision_language_multi_image.py - python3 offline_inference/vision_language_multi_image.py
- python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder.py - python3 offline_inference/encoder_decoder.py
- python3 offline_inference/basic/classify.py - python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py - python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py - python3 offline_inference/basic/score.py
- python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
- label: Prefix Caching Test # 9min - label: Prefix Caching Test # 9min
mirror_hardwares: [amd] mirror_hardwares: [amd]
...@@ -375,7 +376,8 @@ steps: ...@@ -375,7 +376,8 @@ steps:
commands: commands:
- pytest -v -s models/test_transformers.py - pytest -v -s models/test_transformers.py
- pytest -v -s models/test_registry.py - pytest -v -s models/test_registry.py
- pytest -v -s models/test_initialization.py # V1 Test: https://github.com/vllm-project/vllm/issues/14531
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py
- label: Language Models Test (Standard) # 32min - label: Language Models Test (Standard) # 32min
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
...@@ -518,8 +520,8 @@ steps: ...@@ -518,8 +520,8 @@ steps:
# this test fails consistently. # this test fails consistently.
# TODO: investigate and fix # TODO: investigate and fix
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
- label: Plugin Tests (2 GPUs) # 40min - label: Plugin Tests (2 GPUs) # 40min
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
......
# SPDX-License-Identifier: Apache-2.0
import pytest
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import subprocess import subprocess
import sys import sys
import time import time
...@@ -44,7 +45,10 @@ def api_server(tokenizer_pool_size: int, distributed_executor_backend: str): ...@@ -44,7 +45,10 @@ def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
distributed_executor_backend, distributed_executor_backend,
] ]
uvicorn_process = subprocess.Popen(commands) # API Server Test Requires V0.
my_env = os.environ.copy()
my_env["VLLM_USE_V1"] = "0"
uvicorn_process = subprocess.Popen(commands, env=my_env)
yield yield
uvicorn_process.terminate() uvicorn_process.terminate()
......
...@@ -151,6 +151,10 @@ def uid() -> str: ...@@ -151,6 +151,10 @@ def uid() -> str:
@pytest_asyncio.fixture(scope="module") @pytest_asyncio.fixture(scope="module")
async def async_engine(): async def async_engine():
# We cannot use monkeypatch since this is a module
# scoped fixture and monkeypatch is function scoped.
previous_value = os.getenv("VLLM_USE_V1", None)
os.environ["VLLM_USE_V1"] = "0"
engine = await asyncio.get_event_loop().run_in_executor(executor=None, engine = await asyncio.get_event_loop().run_in_executor(executor=None,
func=start_engine) func=start_engine)
try: try:
...@@ -161,6 +165,11 @@ async def async_engine(): ...@@ -161,6 +165,11 @@ async def async_engine():
await asyncio.sleep(0.1) await asyncio.sleep(0.1)
cleanup_dist_env_and_memory() cleanup_dist_env_and_memory()
if previous_value:
os.environ["VLLM_USE_V1"] = previous_value
else:
del os.environ["VLLM_USE_V1"]
@pytest.fixture() @pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool: def should_do_global_cleanup_after_test(request) -> bool:
......
...@@ -23,6 +23,15 @@ MODELS = [ ...@@ -23,6 +23,15 @@ MODELS = [
] ]
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the file.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("max_tokens", [32])
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import pytest
from ..utils import compare_two_settings from ..utils import compare_two_settings
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
monkeypatch.setenv('VLLM_USE_V1', '0')
def test_cpu_offload(): def test_cpu_offload():
compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [], compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [],
["--cpu-offload-gb", "1"]) ["--cpu-offload-gb", "1"])
...@@ -21,6 +21,15 @@ MODELS = [ ...@@ -21,6 +21,15 @@ MODELS = [
] ]
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
We should enable this for V1, but VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT,
so use VLLM_USE_V1=0 for all tests in the file.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
@pytest.fixture(scope="module", autouse=True) @pytest.fixture(scope="module", autouse=True)
def check_settings(): def check_settings():
assert ENABLE_ARTIFICIAL_PREEMPT is True, ( assert ENABLE_ARTIFICIAL_PREEMPT is True, (
......
# SPDX-License-Identifier: Apache-2.0
import pytest
# TEST V1: this should be removed. Right now V1 overrides
# all the torch compile logic. We should re-enable this
# as we add torch compile support back to V1.
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
...@@ -111,6 +111,26 @@ VIDEO_ASSETS = _VideoAssets() ...@@ -111,6 +111,26 @@ VIDEO_ASSETS = _VideoAssets()
"""Singleton instance of :class:`_VideoAssets`.""" """Singleton instance of :class:`_VideoAssets`."""
@pytest.fixture(scope="function", autouse=True)
def cleanup_VLLM_USE_V1(monkeypatch):
"""
The V1 oracle sets "VLLM_USE_V1" during loading. This means
that each invocation of a test change the env variable.
If we touch "VLLM_USE_V1" with monkeypatch, then any changes
made during the test run by vLLM will be cleaned up.
This fixture is used by every test.
"""
# If VLLM_USE_V1 is not set, set then delete. This will
# cause monkeypatch to clean up VLLM_USE_V1 upon exit
# if VLLM modifies the value of envs.VLLM_USE_V1.
if "VLLM_USE_V1" not in os.environ:
monkeypatch.setenv("VLLM_USE_V1", "")
monkeypatch.delenv("VLLM_USE_V1")
@pytest.fixture(params=[True, False]) @pytest.fixture(params=[True, False])
def run_with_both_engines(request, monkeypatch): def run_with_both_engines(request, monkeypatch):
# Automatically runs tests twice, once with V1 and once without # Automatically runs tests twice, once with V1 and once without
......
# SPDX-License-Identifier: Apache-2.0
import pytest
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
# SPDX-License-Identifier: Apache-2.0
import pytest
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
...@@ -6,6 +6,7 @@ from vllm.entrypoints.llm import LLM ...@@ -6,6 +6,7 @@ from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
@pytest.mark.skip_v1
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) @pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
def test_computed_prefix_blocks(model: str): def test_computed_prefix_blocks(model: str):
# This test checks if the engine generates completions both with and # This test checks if the engine generates completions both with and
......
...@@ -4,162 +4,138 @@ from typing import Any, Optional ...@@ -4,162 +4,138 @@ from typing import Any, Optional
import pytest import pytest
from vllm import CompletionOutput, LLMEngine, SamplingParams from vllm import LLM, SamplingParams, envs
MODEL = "meta-llama/llama-2-7b-hf" MODEL = "meta-llama/llama-2-7b-hf"
MAX_TOKENS = 200 MAX_TOKENS = 200
IS_ASYNC = False
def _test_stopping(llm: LLM,
@pytest.fixture(scope="session")
def vllm_model(vllm_runner):
with vllm_runner(MODEL) as vllm_model:
yield vllm_model
def _test_stopping(llm_engine: LLMEngine,
expected_output: str, expected_output: str,
expected_reason: Any, expected_reason: Any,
stop: Optional[list[str]] = None, stop: Optional[list[str]] = None,
stop_token_ids: Optional[list[int]] = None, stop_token_ids: Optional[list[int]] = None,
include_in_output: bool = False, include_in_output: bool = False) -> None:
use_async_output_proc: bool = False) -> None: output = llm.generate(
llm_engine.add_request( "A story about vLLM:\n",
"id", "A story about vLLM:\n",
SamplingParams( SamplingParams(
temperature=0.0, temperature=0.0,
max_tokens=MAX_TOKENS, max_tokens=MAX_TOKENS,
stop=stop, stop=stop,
stop_token_ids=stop_token_ids, stop_token_ids=stop_token_ids,
include_stop_str_in_output=include_in_output, include_stop_str_in_output=include_in_output,
), None) ))[0].outputs[0]
output: Optional[CompletionOutput] = None
output_text = ""
stop_reason = None
if use_async_output_proc:
llm_engine.step()
while llm_engine.has_unfinished_requests():
(request_output, ) = llm_engine.step()
(output, ) = request_output.outputs
# Ensure we don't backtrack
assert output.text.startswith(output_text)
output_text = output.text
stop_reason = output.stop_reason
assert output is not None assert output is not None
assert output_text == expected_output assert output.text == expected_output
assert stop_reason == expected_reason assert output.stop_reason == expected_reason
def _set_async_mode(llm_engine, is_async): def _set_async_mode(llm, is_async):
llm_engine.scheduler[0].use_async_output_proc = is_async llm.llm_engine.scheduler[0].use_async_output_proc = is_async
def _stop_basic(llm_engine, is_async): def _stop_basic(llm):
_test_stopping(llm_engine, _test_stopping(llm,
stop=["."], stop=["."],
include_in_output=False, include_in_output=False,
expected_output="VLLM is a 100% volunteer organization", expected_output="VLLM is a 100% volunteer organization",
expected_reason=".", expected_reason=".")
use_async_output_proc=is_async)
_test_stopping(llm_engine, _test_stopping(llm,
stop=["."], stop=["."],
include_in_output=True, include_in_output=True,
expected_output="VLLM is a 100% volunteer organization.", expected_output="VLLM is a 100% volunteer organization.",
expected_reason=".", expected_reason=".")
use_async_output_proc=is_async)
def _stop_multi_tokens(llm_engine, is_async): def _stop_multi_tokens(llm):
_test_stopping( _test_stopping(
llm_engine, llm,
stop=["group of peo", "short"], stop=["group of peo", "short"],
include_in_output=False, include_in_output=False,
expected_output="VLLM is a 100% volunteer organization. We are a ", expected_output="VLLM is a 100% volunteer organization. We are a ",
expected_reason="group of peo", expected_reason="group of peo")
use_async_output_proc=is_async)
_test_stopping( _test_stopping(
llm_engine, llm,
stop=["group of peo", "short"], stop=["group of peo", "short"],
include_in_output=True, include_in_output=True,
expected_output= expected_output=
"VLLM is a 100% volunteer organization. We are a group of peo", "VLLM is a 100% volunteer organization. We are a group of peo",
expected_reason="group of peo", expected_reason="group of peo")
use_async_output_proc=is_async)
def _stop_partial_token(llm_engine, is_async): def _stop_partial_token(llm):
_test_stopping(llm_engine, _test_stopping(llm,
stop=["gani"], stop=["gani"],
include_in_output=False, include_in_output=False,
expected_output="VLLM is a 100% volunteer or", expected_output="VLLM is a 100% volunteer or",
expected_reason="gani", expected_reason="gani")
use_async_output_proc=is_async)
_test_stopping(llm_engine, _test_stopping(llm,
stop=["gani"], stop=["gani"],
include_in_output=True, include_in_output=True,
expected_output="VLLM is a 100% volunteer organi", expected_output="VLLM is a 100% volunteer organi",
expected_reason="gani", expected_reason="gani")
use_async_output_proc=is_async)
def _stop_token_id(llm_engine, is_async): def _stop_token_id(llm):
# token id 13013 => " organization" # token id 13013 => " organization"
_test_stopping(llm_engine, _test_stopping(llm,
stop_token_ids=[13013], stop_token_ids=[13013],
include_in_output=False, include_in_output=False,
expected_output="VLLM is a 100% volunteer", expected_output="VLLM is a 100% volunteer",
expected_reason=13013, expected_reason=13013)
use_async_output_proc=is_async)
_test_stopping(llm_engine, _test_stopping(llm,
stop_token_ids=[13013], stop_token_ids=[13013],
include_in_output=True, include_in_output=True,
expected_output="VLLM is a 100% volunteer organization", expected_output="VLLM is a 100% volunteer organization",
expected_reason=13013, expected_reason=13013)
use_async_output_proc=is_async)
@pytest.mark.skip_global_cleanup
def test_stop_basic(vllm_model):
_set_async_mode(vllm_model.model.llm_engine, True)
_stop_basic(vllm_model.model.llm_engine, is_async=True)
_set_async_mode(vllm_model.model.llm_engine, False)
_stop_basic(vllm_model.model.llm_engine, is_async=False)
@pytest.mark.skip_global_cleanup
def test_stop_multi_tokens(vllm_model):
_set_async_mode(vllm_model.model.llm_engine, True)
_stop_multi_tokens(vllm_model.model.llm_engine, is_async=True)
_set_async_mode(vllm_model.model.llm_engine, False)
_stop_multi_tokens(vllm_model.model.llm_engine, is_async=False)
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
def test_stop_partial_token(vllm_model): def test_stop_strings():
_set_async_mode(vllm_model.model.llm_engine, True) # If V0, must set enforce_eager=False since we use
_stop_partial_token(vllm_model.model.llm_engine, is_async=True) # async output processing below.
vllm_model = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)
_set_async_mode(vllm_model.model.llm_engine, False)
_stop_partial_token(vllm_model.model.llm_engine, is_async=False) if envs.VLLM_USE_V1:
_stop_basic(vllm_model)
else:
@pytest.mark.skip_global_cleanup _set_async_mode(vllm_model, True)
def test_stop_token_id(vllm_model): _stop_basic(vllm_model)
_set_async_mode(vllm_model.model.llm_engine, True)
_stop_token_id(vllm_model.model.llm_engine, is_async=True) _set_async_mode(vllm_model, False)
_stop_basic(vllm_model)
_set_async_mode(vllm_model.model.llm_engine, False)
_stop_token_id(vllm_model.model.llm_engine, is_async=False) if envs.VLLM_USE_V1:
_stop_multi_tokens(vllm_model)
else:
_set_async_mode(vllm_model, True)
_stop_multi_tokens(vllm_model)
_set_async_mode(vllm_model, False)
_stop_multi_tokens(vllm_model)
if envs.VLLM_USE_V1:
_stop_partial_token(vllm_model)
else:
_set_async_mode(vllm_model, True)
_stop_partial_token(vllm_model)
_set_async_mode(vllm_model, False)
_stop_partial_token(vllm_model)
if envs.VLLM_USE_V1:
# FIXME: this does not respect include_in_output=False
# _stop_token_id(vllm_model)
pass
else:
_set_async_mode(vllm_model, True)
_stop_token_id(vllm_model)
_set_async_mode(vllm_model, False)
_stop_token_id(vllm_model)
...@@ -24,6 +24,18 @@ logger = init_logger("test_pipeline_parallel") ...@@ -24,6 +24,18 @@ logger = init_logger("test_pipeline_parallel")
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1" VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
For PP, we fall back to V0 by default. This means
that the TP baseline runs with V1 while the PP engine
runs with V0. This gives divergent results with dummy
weights. Once we enable V1 by default for PP, we can
remove this.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
class ParallelSetup(NamedTuple): class ParallelSetup(NamedTuple):
tp_size: int tp_size: int
pp_size: int pp_size: int
......
...@@ -21,6 +21,15 @@ LIST_ENC_DEC_SUPPORTED_BACKENDS = [ ...@@ -21,6 +21,15 @@ LIST_ENC_DEC_SUPPORTED_BACKENDS = [
] ]
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
def vllm_to_hf_output( def vllm_to_hf_output(
vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], vllm_output: tuple[list[int], str, Optional[SampleLogprobs]],
decoder_prompt_type: DecoderPromptType, decoder_prompt_type: DecoderPromptType,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment