Commit 469e903b authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.2' into v0.8.2-dev

parents 389ebcf7 25f560a6
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
import ctypes import ctypes
import importlib.util import importlib.util
import json
import logging import logging
import os import os
import re import re
...@@ -9,13 +10,11 @@ import subprocess ...@@ -9,13 +10,11 @@ import subprocess
import sys import sys
from pathlib import Path from pathlib import Path
from shutil import which from shutil import which
from typing import Dict, List
import torch import torch
from packaging.version import Version, parse from packaging.version import Version, parse
from setuptools import Extension, setup from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext from setuptools.command.build_ext import build_ext
from setuptools_scm import get_version
from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME
from typing import Optional, Union from typing import Optional, Union
...@@ -35,7 +34,7 @@ def load_module_from_path(module_name, path): ...@@ -35,7 +34,7 @@ def load_module_from_path(module_name, path):
return module return module
ROOT_DIR = os.path.dirname(__file__) ROOT_DIR = Path(__file__).parent
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# cannot import envs directly because it depends on vllm, # cannot import envs directly because it depends on vllm,
...@@ -62,7 +61,7 @@ elif (sys.platform.startswith("linux") and torch.version.cuda is None ...@@ -62,7 +61,7 @@ elif (sys.platform.startswith("linux") and torch.version.cuda is None
# fallback to cpu # fallback to cpu
VLLM_TARGET_DEVICE = "cpu" VLLM_TARGET_DEVICE = "cpu"
MAIN_CUDA_VERSION = "12.1" MAIN_CUDA_VERSION = "12.4"
def is_sccache_available() -> bool: def is_sccache_available() -> bool:
...@@ -77,6 +76,18 @@ def is_ninja_available() -> bool: ...@@ -77,6 +76,18 @@ def is_ninja_available() -> bool:
return which("ninja") is not None return which("ninja") is not None
def is_url_available(url: str) -> bool:
from urllib.request import urlopen
status = None
try:
with urlopen(url) as f:
status = f.status
except Exception:
return False
return status == 200
class CMakeExtension(Extension): class CMakeExtension(Extension):
def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None: def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None:
...@@ -86,7 +97,7 @@ class CMakeExtension(Extension): ...@@ -86,7 +97,7 @@ class CMakeExtension(Extension):
class cmake_build_ext(build_ext): class cmake_build_ext(build_ext):
# A dict of extension directories that have been configured. # A dict of extension directories that have been configured.
did_config: Dict[str, bool] = {} did_config: dict[str, bool] = {}
# #
# Determine number of compilation jobs and optionally nvcc compile threads. # Determine number of compilation jobs and optionally nvcc compile threads.
...@@ -278,16 +289,43 @@ class repackage_wheel(build_ext): ...@@ -278,16 +289,43 @@ class repackage_wheel(build_ext):
"""Extracts libraries and other files from an existing wheel.""" """Extracts libraries and other files from an existing wheel."""
def get_base_commit_in_main_branch(self) -> str: def get_base_commit_in_main_branch(self) -> str:
import subprocess # Force to use the nightly wheel. This is mainly used for CI testing.
if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL:
return "nightly"
try: try:
# Get the latest commit hash of the upstream main branch.
resp_json = subprocess.check_output([
"curl", "-s",
"https://api.github.com/repos/vllm-project/vllm/commits/main"
]).decode("utf-8")
upstream_main_commit = json.loads(resp_json)["sha"]
# Check if the upstream_main_commit exists in the local repo
try:
subprocess.check_output(
["git", "cat-file", "-e", f"{upstream_main_commit}"])
except subprocess.CalledProcessError:
# If not present, fetch it from the remote repository.
# Note that this does not update any local branches,
# but ensures that this commit ref and its history are
# available in our local repo.
subprocess.check_call([
"git", "fetch", "https://github.com/vllm-project/vllm",
"main"
])
# Then get the commit hash of the current branch that is the same as
# the upstream main commit.
current_branch = subprocess.check_output( current_branch = subprocess.check_output(
["git", "branch", "--show-current"]).decode("utf-8").strip() ["git", "branch", "--show-current"]).decode("utf-8").strip()
base_commit = subprocess.check_output( base_commit = subprocess.check_output([
["git", "merge-base", "main", "git", "merge-base", f"{upstream_main_commit}", current_branch
current_branch]).decode("utf-8").strip() ]).decode("utf-8").strip()
return base_commit return base_commit
except ValueError as err:
raise ValueError(err) from None
except Exception as err: except Exception as err:
logger.warning( logger.warning(
"Failed to get the base commit in the main branch. " "Failed to get the base commit in the main branch. "
...@@ -303,6 +341,10 @@ class repackage_wheel(build_ext): ...@@ -303,6 +341,10 @@ class repackage_wheel(build_ext):
if wheel_location is None: if wheel_location is None:
base_commit = self.get_base_commit_in_main_branch() base_commit = self.get_base_commit_in_main_branch()
wheel_location = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" wheel_location = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
# Fallback to nightly wheel if latest commit wheel is unavailable,
# in this rare case, the nightly release CI hasn't finished on main.
if not is_url_available(wheel_location):
wheel_location = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
import zipfile import zipfile
...@@ -336,6 +378,7 @@ class repackage_wheel(build_ext): ...@@ -336,6 +378,7 @@ class repackage_wheel(build_ext):
files_to_copy = [ files_to_copy = [
"vllm/_C.abi3.so", "vllm/_C.abi3.so",
"vllm/_moe_C.abi3.so", "vllm/_moe_C.abi3.so",
"vllm/_flashmla_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
"vllm/vllm_flash_attn/flash_attn_interface.py", "vllm/vllm_flash_attn/flash_attn_interface.py",
...@@ -413,10 +456,6 @@ def _is_cpu() -> bool: ...@@ -413,10 +456,6 @@ def _is_cpu() -> bool:
return VLLM_TARGET_DEVICE == "cpu" return VLLM_TARGET_DEVICE == "cpu"
def _is_openvino() -> bool:
return VLLM_TARGET_DEVICE == "openvino"
def _is_xpu() -> bool: def _is_xpu() -> bool:
return VLLM_TARGET_DEVICE == "xpu" return VLLM_TARGET_DEVICE == "xpu"
...@@ -486,10 +525,6 @@ def get_nvcc_cuda_version() -> Version: ...@@ -486,10 +525,6 @@ def get_nvcc_cuda_version() -> Version:
return nvcc_cuda_version return nvcc_cuda_version
def get_path(*filepath) -> str:
return os.path.join(ROOT_DIR, *filepath)
def get_sha(root: Union[str, Path]) -> str: def get_sha(root: Union[str, Path]) -> str:
try: try:
return subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=root).decode('ascii').strip() return subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=root).decode('ascii').strip()
...@@ -525,9 +560,9 @@ def get_version_add(sha: Optional[str] = None) -> str: ...@@ -525,9 +560,9 @@ def get_version_add(sha: Optional[str] = None) -> str:
new_version_content = f""" new_version_content = f"""
try: try:
__version__ = "0.7.3" __version__ = "0.8.2"
__version_tuple__ = (0, 7, 3) __version_tuple__ = (0, 8, 2)
__hcu_version__ = f'0.7.3+{version}' __hcu_version__ = f'0.8.2+{version}'
from vllm.version import __version__, __version_tuple__, __hcu_version__ from vllm.version import __version__, __version_tuple__, __hcu_version__
except Exception as e: except Exception as e:
...@@ -571,6 +606,7 @@ def get_gaudi_sw_version(): ...@@ -571,6 +606,7 @@ def get_gaudi_sw_version():
def get_vllm_version() -> str: def get_vllm_version() -> str:
if not _is_hip(): if not _is_hip():
from setuptools_scm import get_version
version = get_version(write_to="vllm/_version.py") version = get_version(write_to="vllm/_version.py")
sep = "+" if "+" not in version else "." # dev versions might contain + sep = "+" if "+" not in version else "." # dev versions might contain +
...@@ -605,8 +641,6 @@ def get_vllm_version() -> str: ...@@ -605,8 +641,6 @@ def get_vllm_version() -> str:
if gaudi_sw_version != MAIN_CUDA_VERSION: if gaudi_sw_version != MAIN_CUDA_VERSION:
gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3] gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3]
version += f"{sep}gaudi{gaudi_sw_version}" version += f"{sep}gaudi{gaudi_sw_version}"
elif _is_openvino():
version += f"{sep}openvino"
elif _is_tpu(): elif _is_tpu():
version += f"{sep}tpu" version += f"{sep}tpu"
elif _is_cpu(): elif _is_cpu():
...@@ -620,11 +654,12 @@ def get_vllm_version() -> str: ...@@ -620,11 +654,12 @@ def get_vllm_version() -> str:
return version return version
def get_requirements() -> List[str]: def get_requirements() -> list[str]:
"""Get Python package dependencies from requirements.txt.""" """Get Python package dependencies from requirements.txt."""
requirements_dir = ROOT_DIR / "requirements"
def _read_requirements(filename: str) -> List[str]: def _read_requirements(filename: str) -> list[str]:
with open(get_path(filename)) as f: with open(requirements_dir / filename) as f:
requirements = f.read().strip().split("\n") requirements = f.read().strip().split("\n")
resolved_requirements = [] resolved_requirements = []
for line in requirements: for line in requirements:
...@@ -637,37 +672,34 @@ def get_requirements() -> List[str]: ...@@ -637,37 +672,34 @@ def get_requirements() -> List[str]:
return resolved_requirements return resolved_requirements
if _no_device(): if _no_device():
requirements = _read_requirements("requirements-common.txt") requirements = _read_requirements("common.txt")
elif _is_cuda(): elif _is_cuda():
requirements = _read_requirements("requirements-cuda.txt") requirements = _read_requirements("cuda.txt")
cuda_major, cuda_minor = torch.version.cuda.split(".") cuda_major, cuda_minor = torch.version.cuda.split(".")
modified_requirements = [] modified_requirements = []
for req in requirements: for req in requirements:
if ("vllm-flash-attn" in req if ("vllm-flash-attn" in req and cuda_major != "12"):
and not (cuda_major == "12" and cuda_minor == "1")): # vllm-flash-attn is built only for CUDA 12.x.
# vllm-flash-attn is built only for CUDA 12.1.
# Skip for other versions. # Skip for other versions.
continue continue
modified_requirements.append(req) modified_requirements.append(req)
requirements = modified_requirements requirements = modified_requirements
elif _is_hip(): elif _is_hip():
requirements = _read_requirements("requirements-rocm.txt") requirements = _read_requirements("rocm.txt")
elif _is_neuron(): elif _is_neuron():
requirements = _read_requirements("requirements-neuron.txt") requirements = _read_requirements("neuron.txt")
elif _is_hpu(): elif _is_hpu():
requirements = _read_requirements("requirements-hpu.txt") requirements = _read_requirements("hpu.txt")
elif _is_openvino():
requirements = _read_requirements("requirements-openvino.txt")
elif _is_tpu(): elif _is_tpu():
requirements = _read_requirements("requirements-tpu.txt") requirements = _read_requirements("tpu.txt")
elif _is_cpu(): elif _is_cpu():
requirements = _read_requirements("requirements-cpu.txt") requirements = _read_requirements("cpu.txt")
elif _is_xpu(): elif _is_xpu():
requirements = _read_requirements("requirements-xpu.txt") requirements = _read_requirements("xpu.txt")
else: else:
raise ValueError( raise ValueError(
"Unsupported platform, please use CUDA, ROCm, Neuron, HPU, " "Unsupported platform, please use CUDA, ROCm, Neuron, HPU, "
"OpenVINO, or CPU.") "or CPU.")
return requirements return requirements
...@@ -685,6 +717,11 @@ if _is_cuda(): ...@@ -685,6 +717,11 @@ if _is_cuda():
# FA3 requires CUDA 12.0 or later # FA3 requires CUDA 12.0 or later
ext_modules.append( ext_modules.append(
CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C")) CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
# Optional since this doesn't get built (produce an .so file) when
# not targeting a hopper system
ext_modules.append(
CMakeExtension(name="vllm._flashmla_C", optional=True))
ext_modules.append(CMakeExtension(name="vllm.cumem_allocator")) ext_modules.append(CMakeExtension(name="vllm.cumem_allocator"))
if _build_custom_ops(): if _build_custom_ops():
...@@ -719,6 +756,7 @@ setup( ...@@ -719,6 +756,7 @@ setup(
install_requires=get_requirements(), install_requires=get_requirements(),
extras_require={ extras_require={
"tensorizer": ["tensorizer>=2.9.0"], "tensorizer": ["tensorizer>=2.9.0"],
"fastsafetensors": ["fastsafetensors >= 0.1.10"],
"runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"], "runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
"audio": ["librosa", "soundfile"], # Required for audio processing "audio": ["librosa", "soundfile"], # Required for audio processing
"video": ["decord"] # Required for video processing "video": ["decord"] # Required for video processing
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""vllm.entrypoints.api_server with some extra logging for testing.""" """vllm.entrypoints.api_server with some extra logging for testing."""
from typing import Any, Dict, Iterable from collections.abc import Iterable
from typing import Any
import uvicorn import uvicorn
from fastapi.responses import JSONResponse, Response from fastapi.responses import JSONResponse, Response
...@@ -24,7 +25,7 @@ class AsyncLLMEngineWithStats(AsyncLLMEngine): ...@@ -24,7 +25,7 @@ class AsyncLLMEngineWithStats(AsyncLLMEngine):
self._num_aborts += len(ids) self._num_aborts += len(ids)
await super()._engine_abort(ids) await super()._engine_abort(ids)
def testing_stats(self) -> Dict[str, Any]: def testing_stats(self) -> dict[str, Any]:
return {"num_aborted_requests": self._num_aborts} return {"num_aborted_requests": self._num_aborts}
......
# SPDX-License-Identifier: Apache-2.0
import pytest
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import subprocess import subprocess
import sys import sys
import time import time
...@@ -46,7 +47,10 @@ def api_server(tokenizer_pool_size: int, distributed_executor_backend: str): ...@@ -46,7 +47,10 @@ def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
distributed_executor_backend, distributed_executor_backend,
] ]
uvicorn_process = subprocess.Popen(commands) # API Server Test Requires V0.
my_env = os.environ.copy()
my_env["VLLM_USE_V1"] = "0"
uvicorn_process = subprocess.Popen(commands, env=my_env)
yield yield
uvicorn_process.terminate() uvicorn_process.terminate()
......
...@@ -6,7 +6,7 @@ import uuid ...@@ -6,7 +6,7 @@ import uuid
from asyncio import CancelledError from asyncio import CancelledError
from copy import copy from copy import copy
from dataclasses import dataclass from dataclasses import dataclass
from typing import List, Optional from typing import Optional
import pytest import pytest
import pytest_asyncio import pytest_asyncio
...@@ -152,6 +152,10 @@ def uid() -> str: ...@@ -152,6 +152,10 @@ def uid() -> str:
@pytest_asyncio.fixture(scope="module") @pytest_asyncio.fixture(scope="module")
async def async_engine(): async def async_engine():
# We cannot use monkeypatch since this is a module
# scoped fixture and monkeypatch is function scoped.
previous_value = os.getenv("VLLM_USE_V1", None)
os.environ["VLLM_USE_V1"] = "0"
engine = await asyncio.get_event_loop().run_in_executor(executor=None, engine = await asyncio.get_event_loop().run_in_executor(executor=None,
func=start_engine) func=start_engine)
try: try:
...@@ -162,6 +166,11 @@ async def async_engine(): ...@@ -162,6 +166,11 @@ async def async_engine():
await asyncio.sleep(0.1) await asyncio.sleep(0.1)
cleanup_dist_env_and_memory() cleanup_dist_env_and_memory()
if previous_value:
os.environ["VLLM_USE_V1"] = previous_value
else:
del os.environ["VLLM_USE_V1"]
@pytest.fixture() @pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool: def should_do_global_cleanup_after_test(request) -> bool:
...@@ -255,7 +264,7 @@ async def test_output_kinds(async_engine, stop): ...@@ -255,7 +264,7 @@ async def test_output_kinds(async_engine, stop):
params.output_kind = RequestOutputKind.DELTA params.output_kind = RequestOutputKind.DELTA
prompt_tokens = None prompt_tokens = None
output_tokens: List[int] = [] output_tokens: list[int] = []
output_text = "" output_text = ""
output_count = 0 output_count = 0
final_output = None final_output = None
......
...@@ -9,7 +9,6 @@ import weakref ...@@ -9,7 +9,6 @@ import weakref
import pytest import pytest
from vllm import LLM from vllm import LLM
from vllm.config import LoadFormat
from vllm.platforms import current_platform from vllm.platforms import current_platform
from ..conftest import VllmRunner from ..conftest import VllmRunner
...@@ -36,7 +35,7 @@ def v1(run_with_both_engines): ...@@ -36,7 +35,7 @@ def v1(run_with_both_engines):
def test_vllm_gc_ed(): def test_vllm_gc_ed():
"""Verify vllm instance is GC'ed when it is deleted""" """Verify vllm instance is GC'ed when it is deleted"""
llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"), load_format=LoadFormat.RUNAI_STREAMER) llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"))
weak_llm = weakref.ref(llm) weak_llm = weakref.ref(llm)
del llm del llm
# If there's any circular reference to vllm, this fails # If there's any circular reference to vllm, this fails
...@@ -45,12 +44,12 @@ def test_vllm_gc_ed(): ...@@ -45,12 +44,12 @@ def test_vllm_gc_ed():
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
# @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
@pytest.mark.parametrize("backend", ["FLASH_ATTN"]) @pytest.mark.parametrize("backend", ["FLASH_ATTN"])
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [False, True]) @pytest.mark.parametrize("enforce_eager", [False])
def test_models( def test_models(
monkeypatch: pytest.MonkeyPatch,
hf_runner, hf_runner,
model: str, model: str,
backend: str, backend: str,
...@@ -63,51 +62,54 @@ def test_models( ...@@ -63,51 +62,54 @@ def test_models(
pytest.skip("Flashinfer does not support ROCm/HIP.") pytest.skip("Flashinfer does not support ROCm/HIP.")
if backend in ("XFORMERS", if backend in ("XFORMERS",
"FLASHINFER") and model == "google/gemma-2-2b-it": "FLASHINFER") and model == os.path.join(models_path_prefix, "google/gemma-2-2b-it"):
pytest.skip( pytest.skip(
f"{backend} does not support gemma2 with full context length.") f"{backend} does not support gemma2 with full context length.")
os.environ["VLLM_ATTENTION_BACKEND"] = backend with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", backend)
# 5042 tokens for gemma2 # 5042 tokens for gemma2
# gemma2 has alternating sliding window size of 4096 # gemma2 has alternating sliding window size of 4096
# we need a prompt with more than 4096 tokens to test the sliding window # we need a prompt with more than 4096 tokens to test the sliding window
prompt = "The following numbers of the sequence " + ", ".join( prompt = "The following numbers of the sequence " + ", ".join(
str(i) for i in range(1024)) + " are:" str(i) for i in range(1024)) + " are:"
example_prompts = [prompt] example_prompts = [prompt]
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with VllmRunner(model, with VllmRunner(model,
max_model_len=8192, max_model_len=8192,
dtype=dtype, dtype=dtype,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
gpu_memory_utilization=0.7) as vllm_model: gpu_memory_utilization=0.7) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
# @multi_gpu_test(num_gpus=2) # @multi_gpu_test(num_gpus=2)
# @pytest.mark.parametrize( # @pytest.mark.parametrize(
# "model, distributed_executor_backend, attention_backend, " # "model, distributed_executor_backend, attention_backend, "
# "test_suite", [ # "test_suite", [
# (os.path.join(models_path_prefix, "distilbert/distilgpt2"), "ray", "", "L4"), # ("distilbert/distilgpt2", "ray", "", "L4"),
# (os.path.join(models_path_prefix, "distilbert/distilgpt2"), "mp", "", "L4"), # ("distilbert/distilgpt2", "mp", "", "L4"),
# (os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"), "ray", "", "L4"), # ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
# (os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"), "mp", "", "L4"), # ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
# (os.path.join(models_path_prefix, "distilbert/distilgpt2"), "ray", "", "A100"), # ("distilbert/distilgpt2", "ray", "", "A100"),
# (os.path.join(models_path_prefix, "distilbert/distilgpt2"), "mp", "", "A100"), # ("distilbert/distilgpt2", "mp", "", "A100"),
# (os.path.join(models_path_prefix, "distilbert/distilgpt2"), "mp", "FLASHINFER", "A100"), # ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),
# (os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "ray", "FLASHINFER", "A100"), # ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
# ]) # ])
# def test_models_distributed( # def test_models_distributed(
# monkeypatch: pytest.MonkeyPatch,
# hf_runner, # hf_runner,
# vllm_runner, # vllm_runner,
# example_prompts, # example_prompts,
...@@ -120,35 +122,41 @@ def test_models( ...@@ -120,35 +122,41 @@ def test_models(
# if test_suite != TARGET_TEST_SUITE: # if test_suite != TARGET_TEST_SUITE:
# pytest.skip(f"Skip test for {test_suite}") # pytest.skip(f"Skip test for {test_suite}")
# if model == os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct") and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa # with monkeypatch.context() as monkeypatch_context:
# # test ray adag # if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
# os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" # # test Ray Compiled Graph
# os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1" # monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
# monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
# if attention_backend:
# os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend # if attention_backend:
# monkeypatch_context.setenv(
# dtype = "half" # "VLLM_ATTENTION_BACKEND",
# max_tokens = 5 # attention_backend,
# )
# # NOTE: take care of the order. run vLLM first, and then run HF.
# # vLLM needs a fresh new process without cuda initialization. # dtype = "half"
# # if we run HF first, the cuda initialization will be done and it # max_tokens = 5
# # will hurt multiprocessing backend with fork method (the default method).
# with vllm_runner(model, # # NOTE: take care of the order. run vLLM first, and then run HF.
# dtype=dtype, # # vLLM needs a fresh new process without cuda initialization.
# tensor_parallel_size=2, # # if we run HF first, the cuda initialization will be done and it
# distributed_executor_backend=distributed_executor_backend # # will hurt multiprocessing backend with fork method
# ) as vllm_model: # # (the default method).
# vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) # with vllm_runner(
# model,
# with hf_runner(model, dtype=dtype) as hf_model: # dtype=dtype,
# hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) # tensor_parallel_size=2,
# distributed_executor_backend=distributed_executor_backend,
# check_outputs_equal( # ) as vllm_model:
# outputs_0_lst=hf_outputs, # vllm_outputs = vllm_model.generate_greedy(example_prompts,
# outputs_1_lst=vllm_outputs, # max_tokens)
# name_0="hf",
# name_1="vllm", # with hf_runner(model, dtype=dtype) as hf_model:
# ) # hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
# check_outputs_equal(
# outputs_0_lst=hf_outputs,
# outputs_1_lst=vllm_outputs,
# name_0="hf",
# name_1="vllm",
# )
...@@ -7,24 +7,41 @@ prefill requests are chunked. ...@@ -7,24 +7,41 @@ prefill requests are chunked.
Run `pytest tests/models/test_chunked_prefill.py`. Run `pytest tests/models/test_chunked_prefill.py`.
""" """
import os
from __future__ import annotations
from typing import TYPE_CHECKING
import pytest import pytest
from tests.kernels.utils import override_backend_env_variable
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_logprobs_close, check_outputs_equal from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test from ..utils import multi_gpu_test
import os import os
from ..utils import models_path_prefix from ..utils import models_path_prefix
if TYPE_CHECKING:
from .conftest import HfRunner, VllmRunner
MODELS = [ MODELS = [
os.path.join(models_path_prefix, "facebook/opt-125m"), os.path.join(models_path_prefix, "facebook/opt-125m"),
os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
] ]
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the file.
"""
with monkeypatch.context() as m:
m.setenv('VLLM_USE_V1', '0')
yield
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("max_tokens", [32])
...@@ -35,8 +52,8 @@ MODELS = [ ...@@ -35,8 +52,8 @@ MODELS = [
@pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
def test_models( def test_models(
hf_runner, hf_runner: HfRunner,
vllm_runner, vllm_runner: VllmRunner,
example_prompts, example_prompts,
model: str, model: str,
dtype: str, dtype: str,
...@@ -45,37 +62,39 @@ def test_models( ...@@ -45,37 +62,39 @@ def test_models(
enforce_eager: bool, enforce_eager: bool,
tensor_parallel_size: int, tensor_parallel_size: int,
attention_backend: str, attention_backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
""" """
Checks exact match decode between huggingface model and vllm runner with Checks exact match decode between huggingface model and vllm runner with
chunked prefill. chunked prefill.
""" """
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
max_num_seqs = chunked_prefill_token_size max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size max_num_batched_tokens = chunked_prefill_token_size
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
max_num_batched_tokens=max_num_batched_tokens, max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=True, enable_chunked_prefill=True,
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
@multi_gpu_test(num_gpus=2) @multi_gpu_test(num_gpus=2)
...@@ -83,57 +102,61 @@ def test_models( ...@@ -83,57 +102,61 @@ def test_models(
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
def test_models_distributed( def test_models_distributed(
hf_runner, hf_runner: HfRunner,
vllm_runner, vllm_runner: VllmRunner,
example_prompts, example_prompts,
model: str, model: str,
distributed_executor_backend: str, distributed_executor_backend: str,
attention_backend: str, attention_backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
if (model == "meta-llama/Llama-3.2-1B-Instruct" if (model == "meta-llama/Llama-3.2-1B-Instruct"
and distributed_executor_backend == "ray"): and distributed_executor_backend == "ray"):
# test ray adag # test Ray Compiled Graph
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1" m.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
dtype = "half" dtype = "half"
max_tokens = 5 max_tokens = 5
chunked_prefill_token_size = 16 chunked_prefill_token_size = 16
# Add a chunked prefill config. # Add a chunked prefill config.
max_num_seqs = min(chunked_prefill_token_size, 256) max_num_seqs = min(chunked_prefill_token_size, 256)
assert chunked_prefill_token_size != -1 assert chunked_prefill_token_size != -1
enable_chunked_prefill = True enable_chunked_prefill = True
max_num_batched_tokens = chunked_prefill_token_size max_num_batched_tokens = chunked_prefill_token_size
# NOTE: take care of the order. run vLLM first, and then run HF. # NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization. # vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it # if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method). # will hurt multiprocessing backend with
# fork method (the default method).
with vllm_runner(
model,
dtype=dtype,
tensor_parallel_size=2,
max_num_seqs=max_num_seqs,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal( with vllm_runner(
outputs_0_lst=hf_outputs, model,
outputs_1_lst=vllm_outputs, dtype=dtype,
name_0="hf", tensor_parallel_size=2,
name_1="vllm", max_num_seqs=max_num_seqs,
) enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(
example_prompts,
max_tokens,
)
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
# @pytest.mark.parametrize( # @pytest.mark.parametrize(
...@@ -151,7 +174,7 @@ def test_models_distributed( ...@@ -151,7 +174,7 @@ def test_models_distributed(
# # the async postprocessor # # the async postprocessor
# @pytest.mark.parametrize("disable_async_output_proc", [True]) # @pytest.mark.parametrize("disable_async_output_proc", [True])
# def test_models_with_fp8_kv_cache( # def test_models_with_fp8_kv_cache(
# vllm_runner, # vllm_runner: VllmRunner,
# example_prompts, # example_prompts,
# kv_cache_dtype: str, # kv_cache_dtype: str,
# model: str, # model: str,
...@@ -211,7 +234,7 @@ def test_models_distributed( ...@@ -211,7 +234,7 @@ def test_models_distributed(
@pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
def test_with_prefix_caching( def test_with_prefix_caching(
vllm_runner, vllm_runner: VllmRunner,
max_tokens: int, max_tokens: int,
enforce_eager: bool, enforce_eager: bool,
chunk_size: int, chunk_size: int,
...@@ -247,8 +270,10 @@ def test_with_prefix_caching( ...@@ -247,8 +270,10 @@ def test_with_prefix_caching(
) as vllm_model: ) as vllm_model:
outputs[enable] = [] outputs[enable] = []
for prompt in full_prompts: for prompt in full_prompts:
outputs[enable] += vllm_model.generate_greedy([prompt], outputs[enable] += vllm_model.generate_greedy(
max_tokens) [prompt],
max_tokens,
)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=outputs[False], outputs_0_lst=outputs[False],
...@@ -259,7 +284,7 @@ def test_with_prefix_caching( ...@@ -259,7 +284,7 @@ def test_with_prefix_caching(
@pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["bfloat16", "half"])
@pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16]) @pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
@pytest.mark.parametrize("enforce_eager", [False]) @pytest.mark.parametrize("enforce_eager", [False])
...@@ -267,8 +292,8 @@ def test_with_prefix_caching( ...@@ -267,8 +292,8 @@ def test_with_prefix_caching(
@pytest.mark.cpu_model @pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only") @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_models_cpu( def test_models_cpu(
hf_runner, hf_runner: HfRunner,
vllm_runner, vllm_runner: VllmRunner,
example_prompts, example_prompts,
model: str, model: str,
dtype: str, dtype: str,
...@@ -276,7 +301,7 @@ def test_models_cpu( ...@@ -276,7 +301,7 @@ def test_models_cpu(
chunked_prefill_token_size: int, chunked_prefill_token_size: int,
enforce_eager: bool, enforce_eager: bool,
attention_backend: str, attention_backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
test_models( test_models(
hf_runner, hf_runner,
...@@ -296,11 +321,11 @@ def test_models_cpu( ...@@ -296,11 +321,11 @@ def test_models_cpu(
@pytest.mark.parametrize("max_tokens", [16]) @pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("enforce_eager", [False]) @pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("chunk_size", [30, 32]) @pytest.mark.parametrize("chunk_size", [30, 32])
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["bfloat16", "half"])
@pytest.mark.cpu_model @pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only") @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_with_prefix_caching_cpu( def test_with_prefix_caching_cpu(
vllm_runner, vllm_runner: VllmRunner,
max_tokens: int, max_tokens: int,
enforce_eager: bool, enforce_eager: bool,
chunk_size: int, chunk_size: int,
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os import os
import pytest
from ..utils import compare_two_settings, models_path_prefix from ..utils import compare_two_settings, models_path_prefix
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Set ``VLLM_USE_V1=0`` in the environment for each test.

    The fixture is function-scoped and ``autouse``, so tests do not need to
    request it explicitly.
    """
    monkeypatch.setenv(name="VLLM_USE_V1", value="0")
def test_cpu_offload(): def test_cpu_offload():
compare_two_settings(os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), [], compare_two_settings(os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), [],
["--cpu-offload-gb", "1"]) ["--cpu-offload-gb", "1"])
...@@ -4,15 +4,13 @@ import pytest ...@@ -4,15 +4,13 @@ import pytest
import torch import torch
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.config import LoadFormat
from vllm.device_allocator.cumem import CuMemAllocator from vllm.device_allocator.cumem import CuMemAllocator
from vllm.utils import GiB_bytes from vllm.utils import GiB_bytes
from ..conftest import MODEL_WEIGHTS_S3_BUCKET from ..utils import create_new_process_for_each_test
from ..utils import fork_new_process_for_each_test
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_python_error(): def test_python_error():
""" """
Test if Python error occurs when there's low-level Test if Python error occurs when there's low-level
...@@ -38,7 +36,7 @@ def test_python_error(): ...@@ -38,7 +36,7 @@ def test_python_error():
allocator.wake_up() allocator.wake_up()
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_basic_cumem(): def test_basic_cumem():
# some tensors from default memory pool # some tensors from default memory pool
shape = (1024, 1024) shape = (1024, 1024)
...@@ -71,7 +69,7 @@ def test_basic_cumem(): ...@@ -71,7 +69,7 @@ def test_basic_cumem():
assert torch.allclose(output, torch.ones_like(output) * 3) assert torch.allclose(output, torch.ones_like(output) * 3)
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_cumem_with_cudagraph(): def test_cumem_with_cudagraph():
allocator = CuMemAllocator.get_instance() allocator = CuMemAllocator.get_instance()
with allocator.use_memory_pool(): with allocator.use_memory_pool():
...@@ -116,43 +114,47 @@ def test_cumem_with_cudagraph(): ...@@ -116,43 +114,47 @@ def test_cumem_with_cudagraph():
assert torch.allclose(y, x + 1) assert torch.allclose(y, x + 1)
@fork_new_process_for_each_test @create_new_process_for_each_test()
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model, use_v1", "model, use_v1",
[ [
# sleep mode with safetensors # sleep mode with safetensors
(f"{MODEL_WEIGHTS_S3_BUCKET}/meta-llama/Llama-3.2-1B", True), ("meta-llama/Llama-3.2-1B", True),
# sleep mode with pytorch checkpoint # sleep mode with pytorch checkpoint
("facebook/opt-125m", False), ("facebook/opt-125m", False),
]) ])
def test_end_to_end(model: str, use_v1: bool): def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
import os with monkeypatch.context() as m:
os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0" m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
free, total = torch.cuda.mem_get_info() free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free # in case other process is running used_bytes_baseline = total - free # in case other process is running
load_format = LoadFormat.AUTO llm = LLM(model, enable_sleep_mode=True)
if "Llama" in model: prompt = "How are you?"
load_format = LoadFormat.RUNAI_STREAMER sampling_params = SamplingParams(temperature=0, max_tokens=10)
llm = LLM(model, load_format=load_format, enable_sleep_mode=True) output = llm.generate(prompt, sampling_params)
prompt = "How are you?"
sampling_params = SamplingParams(temperature=0, max_tokens=10) # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
output = llm.generate(prompt, sampling_params) # which is difficult to measure in the test. therefore, we only
# test sleep level 1 here.
# the benefit of `llm.sleep(level=2)` is mainly CPU memory usage, llm.sleep(level=1)
# which is difficult to measure in the test. therefore, we only
# test sleep level 1 here. free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
llm.sleep(level=1) used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
# now the memory usage is mostly cudagraph memory pool,
free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info() # and it should be less than the model weights (1B model, 2GiB weights)
used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
# now the memory usage is mostly cudagraph memory pool, # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
# and it should be less than the model weights (1B model, 2GiB weights) # is captured but cannot be released from PyTorch due to a known bug,
assert used_bytes < 2 * GiB_bytes # therefore high memory usage after `llm.sleep` is called is expected.
# FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
llm.wake_up() # in V1.
output2 = llm.generate(prompt, sampling_params) if use_v1:
assert used_bytes < 7 * GiB_bytes
# cmp output else:
assert output[0].outputs[0].text == output2[0].outputs[0].text assert used_bytes < 2 * GiB_bytes
del os.environ["VLLM_USE_V1"] llm.wake_up()
output2 = llm.generate(prompt, sampling_params)
# cmp output
assert output[0].outputs[0].text == output2[0].outputs[0].text
...@@ -24,6 +24,15 @@ MODELS = [ ...@@ -24,6 +24,15 @@ MODELS = [
] ]
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Force ``VLLM_USE_V1=0`` for all tests in this file.

    These tests should eventually be enabled for V1 as well; the blocker
    named by the original note is VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT.
    NOTE(review): the original sentence was incomplete -- presumably
    artificial preemption is not yet supported under V1; confirm.
    """
    monkeypatch.setenv(name="VLLM_USE_V1", value="0")
@pytest.fixture(scope="module", autouse=True) @pytest.fixture(scope="module", autouse=True)
def check_settings(): def check_settings():
assert ENABLE_ARTIFICIAL_PREEMPT is True, ( assert ENABLE_ARTIFICIAL_PREEMPT is True, (
......
# SPDX-License-Identifier: Apache-2.0
import Cython.Compiler.Options
from Cython.Build import cythonize
from setuptools import setup
# NOTE(review): the module-wide option is switched on here, while the
# cythonize() call below passes annotate=False explicitly -- confirm which
# of the two is intended.
Cython.Compiler.Options.annotate = True

# Python modules to compile, listed in the original order.
infiles = [
    # engine and output processing
    "vllm/engine/llm_engine.py",
    "vllm/transformers_utils/detokenizer.py",
    "vllm/engine/output_processor/single_step.py",
    "vllm/outputs.py",
    "vllm/engine/output_processor/stop_checker.py",
    # scheduling and sequence bookkeeping
    "vllm/core/scheduler.py",
    "vllm/sequence.py",
    "vllm/core/block_manager.py",
    # sampling and shared utilities
    "vllm/model_executor/layers/sampler.py",
    "vllm/sampling_params.py",
    "vllm/utils.py",
]

setup(
    ext_modules=cythonize(
        infiles,
        annotate=False,
        force=True,
        compiler_directives={
            "language_level": "3",
            "infer_types": True,
        },
    ))
# example usage: python3 build_cython.py build_ext --inplace
...@@ -6,6 +6,7 @@ from typing import Callable, Union ...@@ -6,6 +6,7 @@ from typing import Callable, Union
from torch import fx from torch import fx
from vllm.compilation.inductor_pass import InductorPass from vllm.compilation.inductor_pass import InductorPass
from vllm.config import get_current_vllm_config
class TestBackend: class TestBackend:
...@@ -13,21 +14,27 @@ class TestBackend: ...@@ -13,21 +14,27 @@ class TestBackend:
This class provides a simple Inductor backend that can be used for testing. This class provides a simple Inductor backend that can be used for testing.
It takes a list of custom passes and runs them after Inductor's passes. It takes a list of custom passes and runs them after Inductor's passes.
It also saves the graph before and after the custom passes for inspection. It also saves the graph before and after the custom passes for inspection.
Inductor config can be modified directly by editing the inductor_config
property. This can be helpful for adding passes like the
'pre_grad_custom_pass' and the 'post_grad_custom_pre_pass'.
Inductor config is default-initialized from VllmConfig.CompilationConfig.
""" """
def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph], def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph],
None]]): None]]):
self.custom_passes = list(passes) self.custom_passes = list(passes)
from torch._inductor import config compile_config = get_current_vllm_config().compilation_config
self.current_config = config.shallow_copy_dict() self.inductor_config = compile_config.inductor_compile_config
self.current_config['force_disable_caches'] = True self.inductor_config['force_disable_caches'] = True
self.current_config['post_grad_custom_post_pass'] = self.post_pass self.inductor_config['post_grad_custom_post_pass'] = self.post_pass
def __call__(self, graph: fx.GraphModule, example_inputs): def __call__(self, graph: fx.GraphModule, example_inputs):
self.graph_pre_compile = deepcopy(graph)
from torch._inductor.compile_fx import compile_fx from torch._inductor.compile_fx import compile_fx
return compile_fx(graph, return compile_fx(graph,
example_inputs, example_inputs,
config_patches=self.current_config) config_patches=self.inductor_config)
def post_pass(self, graph: fx.Graph): def post_pass(self, graph: fx.Graph):
self.graph_pre_pass = deepcopy(graph) self.graph_pre_pass = deepcopy(graph)
......
# SPDX-License-Identifier: Apache-2.0
import pytest
# TEST V1: this should be removed. Right now V1 overrides
# all the torch compile logic. We should re-enable this
# as we add torch compile support back to V1.
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Run each test with ``VLLM_USE_V1=0``.

    This module is V0 only, so the variable is set for all tests in it.
    """
    monkeypatch.setenv(name="VLLM_USE_V1", value="0")
...@@ -8,7 +8,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are ...@@ -8,7 +8,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are
initialized randomly with a fixed seed. initialized randomly with a fixed seed.
""" """
from dataclasses import dataclass from dataclasses import dataclass
from typing import Any, List, Optional, Tuple from typing import Any, Optional
import torch import torch
from torch import nn from torch import nn
...@@ -56,7 +56,7 @@ class LlamaConfig: ...@@ -56,7 +56,7 @@ class LlamaConfig:
random_seed: int = 0 random_seed: int = 0
def compute_hash(self) -> str: def compute_hash(self) -> str:
factors: List[Any] = [] factors: list[Any] = []
for k, v in self.__dict__.items(): for k, v in self.__dict__.items():
if k == "random_seed": if k == "random_seed":
continue continue
...@@ -174,7 +174,7 @@ class LlamaDecoderLayer(nn.Module): ...@@ -174,7 +174,7 @@ class LlamaDecoderLayer(nn.Module):
positions: torch.Tensor, positions: torch.Tensor,
hidden_states: torch.Tensor, hidden_states: torch.Tensor,
residual: Optional[torch.Tensor], residual: Optional[torch.Tensor],
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
""" """
For tractable computation: For tractable computation:
- if residual is None, the outputs are: - if residual is None, the outputs are:
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import dataclasses import dataclasses
from typing import Dict, List, Optional
import pytest import pytest
import os import os
...@@ -15,7 +15,7 @@ from ..utils import compare_all_settings, models_path_prefix ...@@ -15,7 +15,7 @@ from ..utils import compare_all_settings, models_path_prefix
@dataclasses.dataclass @dataclasses.dataclass
class TestSetting: class TestSetting:
model: str model: str
model_args: List[str] model_args: list[str]
pp_size: int pp_size: int
tp_size: int tp_size: int
attn_backend: str attn_backend: str
...@@ -23,75 +23,76 @@ class TestSetting: ...@@ -23,75 +23,76 @@ class TestSetting:
fullgraph: bool fullgraph: bool
# representative settings for testing
# Each spec holds the TestSetting keyword arguments; "model" is given
# relative to models_path_prefix and joined onto it when the setting is built.
test_settings = [
    TestSetting(**{
        **spec,
        "model": os.path.join(models_path_prefix, spec["model"]),
    }) for spec in (
        # basic llama model
        dict(
            model="meta-llama/Llama-3.2-1B-Instruct",
            model_args=[],
            pp_size=2,
            tp_size=2,
            attn_backend="FLASHINFER",
            method="generate",
            fullgraph=True,
        ),
        # llama model with quantization
        dict(
            model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
            model_args=["--quantization", "gptq"],
            pp_size=1,
            tp_size=1,
            attn_backend="FLASH_ATTN",
            method="generate",
            fullgraph=True,
        ),
        # MoE model
        dict(
            model="ibm/PowerMoE-3b",
            model_args=[],
            pp_size=1,
            tp_size=2,
            attn_backend="FLASH_ATTN",
            method="generate",
            fullgraph=True,
        ),
        # embedding model
        dict(
            model="BAAI/bge-multilingual-gemma2",
            model_args=["--task", "embed"],
            pp_size=1,
            tp_size=1,
            attn_backend="FLASH_ATTN",
            method="encode",
            fullgraph=True,
        ),
        # encoder-based embedding model (BERT)
        dict(
            model="BAAI/bge-base-en-v1.5",
            model_args=["--task", "embed"],
            pp_size=1,
            tp_size=1,
            attn_backend="XFORMERS",
            method="encode",
            fullgraph=True,
        ),
        # vision language model
        dict(
            model="microsoft/Phi-3.5-vision-instruct",
            model_args=["--trust-remote-code", "--max-model-len", "2048"],
            pp_size=2,
            tp_size=1,
            attn_backend="FLASH_ATTN",
            method="generate_with_image",
            fullgraph=False,
        ),
    )
]
# we cannot afford testing the full Cartesian product # we cannot afford testing the full Cartesian product
# of all models and all levels # of all models and all levels
@pytest.mark.parametrize("test_setting", test_settings) @pytest.mark.parametrize(
def test_compile_correctness(test_setting: TestSetting): "test_setting",
[
# basic llama model
TestSetting(
model="meta-llama/Llama-3.2-1B-Instruct",
model_args=[],
pp_size=2,
tp_size=2,
attn_backend="FLASHINFER",
method="generate",
fullgraph=True,
),
# llama model with quantization
TestSetting(
model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
model_args=["--quantization", "gptq"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# MoE model
TestSetting(
model="ibm/PowerMoE-3b",
model_args=[],
pp_size=1,
tp_size=2,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# embedding model
TestSetting(
model="BAAI/bge-multilingual-gemma2",
model_args=["--task", "embed", "--dtype", "bfloat16"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="encode",
fullgraph=True,
),
# encoder-based embedding model (BERT)
TestSetting(
model="BAAI/bge-base-en-v1.5",
model_args=["--task", "embed"],
pp_size=1,
tp_size=1,
attn_backend="XFORMERS",
method="encode",
fullgraph=True,
),
# vision language model
TestSetting(
model="microsoft/Phi-3.5-vision-instruct",
model_args=["--trust-remote-code", "--max-model-len", "2048"],
pp_size=2,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate_with_image",
fullgraph=False,
),
])
def test_compile_correctness(
monkeypatch: pytest.MonkeyPatch,
test_setting: TestSetting,
):
# this test is run under multiple suites, with different GPUs. # this test is run under multiple suites, with different GPUs.
# make sure we only run the test with correct CUDA devices. # make sure we only run the test with correct CUDA devices.
# don't use "<", as it will duplicate the tests. # don't use "<", as it will duplicate the tests.
...@@ -104,41 +105,45 @@ def test_compile_correctness(test_setting: TestSetting): ...@@ -104,41 +105,45 @@ def test_compile_correctness(test_setting: TestSetting):
fullgraph = test_setting.fullgraph fullgraph = test_setting.fullgraph
if cuda_device_count_stateless() != pp_size * tp_size: if cuda_device_count_stateless() != pp_size * tp_size:
pytest.skip("Not correct CUDA devices for the test.") pytest.skip("Not correct CUDA devices for the test.")
import os
os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \
["-tp", str(tp_size)]
all_args: List[List[str]] = [] with monkeypatch.context() as m:
all_envs: List[Optional[Dict[str, str]]] = [] m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
final_args = [
"--enforce-eager", *model_args, "-pp",
str(pp_size), "-tp",
str(tp_size)
]
all_args: list[list[str]] = []
all_envs: list[dict[str, str] | None] = []
for level in [ for level in [
CompilationLevel.NO_COMPILATION, CompilationLevel.NO_COMPILATION,
CompilationLevel.PIECEWISE, CompilationLevel.PIECEWISE,
]: ]:
all_args.append(final_args + [f"-O{level}"]) all_args.append(final_args + [f"-O{level}"])
all_envs.append({}) all_envs.append({})
# inductor will change the output, so we only compare if the output # inductor will change the output, so we only compare if the output
# is close, not exactly the same. # is close, not exactly the same.
compare_all_settings( compare_all_settings(
model, model,
all_args, all_args,
all_envs, all_envs,
method=method if method != "generate" else "generate_close") method=method if method != "generate" else "generate_close")
all_envs.clear() all_envs.clear()
all_args.clear() all_args.clear()
for level in [ for level in [
CompilationLevel.NO_COMPILATION, CompilationLevel.NO_COMPILATION,
CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_AS_IS,
CompilationLevel.DYNAMO_ONCE, CompilationLevel.DYNAMO_ONCE,
]: ]:
all_args.append(final_args + [f"-O{level}"]) all_args.append(final_args + [f"-O{level}"])
all_envs.append({}) all_envs.append({})
if level != CompilationLevel.DYNAMO_ONCE and not fullgraph: if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
# "DYNAMO_ONCE" will always use fullgraph # "DYNAMO_ONCE" will always use fullgraph
all_envs[-1][ all_envs[-1][
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore
compare_all_settings(model, all_args * 3, all_envs, method=method) compare_all_settings(model, all_args * 3, all_envs, method=method)
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
from typing import Any
import pytest import pytest
import torch
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.config import CompilationLevel from vllm.config import CompilationLevel
from vllm.platforms import current_platform
from ..utils import create_new_process_for_each_test
@pytest.fixture(params=None, name="model_info")
def models_list_fixture(request):
TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
("facebook/opt-125m", {}),
("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
"quantization": "compressed-tensors"
}),
("meta-llama/Llama-3.2-1B-Instruct", {}),
]
if is_quant_method_supported("aqlm"):
TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
"quantization": "aqlm"
}))
# TODO: figure out why this fails.
if False and is_quant_method_supported("gguf"): # noqa: SIM223
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
"quantization": "gguf"
}))
if is_quant_method_supported("gptq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
"quantization": "gptq"
}))
from ..utils import fork_new_process_for_each_test if is_quant_method_supported("gptq_marlin"):
from .utils import TEST_MODELS, check_full_graph_support TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
"quantization": "gptq_marlin"
}))
if is_quant_method_supported("gptq_marlin_24"):
TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
"quantization": "gptq_marlin_24"
}))
if is_quant_method_supported("marlin"):
TEST_MODELS.append(
("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
"quantization": "marlin"
}))
if not current_platform.is_rocm() and is_quant_method_supported("awq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
"quantization": "AWQ"
}))
return TEST_MODELS
@pytest.mark.parametrize("model_info", TEST_MODELS)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"optimization_level", "optimization_level",
[CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE]) [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE],
@fork_new_process_for_each_test )
def test_full_graph(model_info, optimization_level): @pytest.mark.parametrize("model_info", "", indirect=True)
model = model_info[0] @create_new_process_for_each_test()
model_kwargs = model_info[1] def test_full_graph(
check_full_graph_support(model, monkeypatch: pytest.MonkeyPatch,
model_kwargs, model_info: tuple[str, dict[str, Any]],
optimization_level, optimization_level: int,
tp_size=1) ):
model, model_kwargs = model_info
with monkeypatch.context() as m:
# make sure these models can be captured in full graph mode
m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
print(f"MODEL={model}")
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0)
llm = LLM(
model=model,
enforce_eager=True,
tensor_parallel_size=1,
disable_custom_all_reduce=True,
compilation_config=optimization_level,
**model_kwargs,
)
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
...@@ -10,7 +10,7 @@ from vllm.compilation.fix_functionalization import FixFunctionalizationPass ...@@ -10,7 +10,7 @@ from vllm.compilation.fix_functionalization import FixFunctionalizationPass
from vllm.compilation.fusion import (FUSED_OPS, FusionPass, QuantKey, from vllm.compilation.fusion import (FUSED_OPS, FusionPass, QuantKey,
kFp8DynamicTokenSym, kFp8StaticTensorSym) kFp8DynamicTokenSym, kFp8StaticTensorSym)
from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
from vllm.compilation.reshapes import RedundantReshapesPass from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.config import CompilationConfig from vllm.config import CompilationConfig
from .backend import TestBackend from .backend import TestBackend
...@@ -52,11 +52,11 @@ def test_fix_functionalization(model: str, quant_key: QuantKey, ...@@ -52,11 +52,11 @@ def test_fix_functionalization(model: str, quant_key: QuantKey,
torch.set_default_device("cuda") torch.set_default_device("cuda")
config = CompilationConfig.PassConfig(enable_fusion=do_fusion, config = CompilationConfig.PassConfig(enable_fusion=do_fusion,
enable_reshape=True) enable_noop=True)
reshape_pass = RedundantReshapesPass(config) noop_pass = NoOpEliminationPass(config)
fusion_pass = FusionPass.instance(config) fusion_pass = FusionPass.instance(config)
passes = [reshape_pass, fusion_pass] if do_fusion else [reshape_pass] passes = [noop_pass, fusion_pass] if do_fusion else [noop_pass]
func_pass = FixFunctionalizationPass(config) func_pass = FixFunctionalizationPass(config)
backend_func = TestBackend(*passes, func_pass) backend_func = TestBackend(*passes, func_pass)
backend_no_func = TestBackend(*passes) backend_no_func = TestBackend(*passes)
......
...@@ -5,23 +5,25 @@ import torch ...@@ -5,23 +5,25 @@ import torch
from compressed_tensors.quantization import FP8_DTYPE from compressed_tensors.quantization import FP8_DTYPE
import vllm.envs as envs import vllm.envs as envs
import vllm.plugins
from vllm.compilation.fusion import (FUSED_OPS, QUANT_OPS, FusedRMSQuantKey, from vllm.compilation.fusion import (FUSED_OPS, QUANT_OPS, FusedRMSQuantKey,
FusionPass, QuantKey) FusionPass, QuantKey)
from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe
from vllm.compilation.reshapes import RedundantReshapesPass from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.config import CompilationConfig from vllm.config import CompilationConfig, CompilationLevel, VllmConfig
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
apply_fp8_linear) CUTLASS_FP8_SUPPORTED, Fp8LinearOp, maybe_create_device_identity)
from .backend import TestBackend from .backend import TestBackend
class TestModel(torch.nn.Module): class TestModel(torch.nn.Module):
def __init__(self, hidden_size: int, eps: float, static: bool, *args, def __init__(self, hidden_size: int, eps: float, static: bool,
**kwargs): cutlass_fp8_enabled: bool, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self.cutlass_fp8_enabled = cutlass_fp8_enabled
self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)] self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)]
self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)] self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
if static: if static:
...@@ -32,24 +34,20 @@ class TestModel(torch.nn.Module): ...@@ -32,24 +34,20 @@ class TestModel(torch.nn.Module):
torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t() torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
for _ in range(2) for _ in range(2)
] ]
self.fp8_linear = Fp8LinearOp(
cutlass_fp8_supported=cutlass_fp8_enabled,
use_per_token_if_dynamic=True)
def forward(self, x): def forward(self, x):
resid = torch.sqrt(x) resid = torch.sqrt(x)
y = self.norm[0](x) y = self.norm[0](x)
x2 = apply_fp8_linear(y, x2 = self.fp8_linear.apply(y, self.w[0], self.wscale[0], self.scale[0])
self.w[0],
self.wscale[0],
self.scale[0],
use_per_token_if_dynamic=True)
# make sure resid is used for replacement to work # make sure resid is used for replacement to work
y2, resid = self.norm[1](x2, resid) y2, resid = self.norm[1](x2, resid)
x3 = apply_fp8_linear(y2, x3 = self.fp8_linear.apply(y2, self.w[1], self.wscale[1],
self.w[1], self.scale[1])
self.wscale[1],
self.scale[1],
use_per_token_if_dynamic=True)
y3, resid = self.norm[2](x3, resid) # use resid here y3, resid = self.norm[2](x3, resid) # use resid here
return y3 return y3
...@@ -59,60 +57,67 @@ class TestModel(torch.nn.Module): ...@@ -59,60 +57,67 @@ class TestModel(torch.nn.Module):
@pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049]) @pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049])
@pytest.mark.parametrize("eps", [1e-5, 1e-6]) @pytest.mark.parametrize("eps", [1e-5, 1e-6])
@pytest.mark.parametrize("static", [True, False]) @pytest.mark.parametrize("static", [True, False])
@pytest.mark.parametrize("cutlass_fp8_enabled",
[True, False] if CUTLASS_FP8_SUPPORTED else [False])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda", @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda",
reason="Only test on CUDA") reason="Only test on CUDA")
def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static): def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
cutlass_fp8_enabled):
torch.set_default_device("cuda") torch.set_default_device("cuda")
torch.set_default_dtype(dtype) torch.set_default_dtype(dtype)
torch.manual_seed(1) torch.manual_seed(1)
maybe_create_device_identity() # needed for certain non-cutlass fp8 paths
# Reshape pass is needed for the fusion pass to work vllm_config = VllmConfig(compilation_config=CompilationConfig(
config = CompilationConfig.PassConfig(enable_fusion=True, level=CompilationLevel.PIECEWISE, custom_ops=["+rms_norm"]))
enable_reshape=True) with vllm.config.set_current_vllm_config(vllm_config):
reshape_pass = RedundantReshapesPass(config) # Reshape pass is needed for the fusion pass to work
fusion_pass = FusionPass.instance(config) config = CompilationConfig.PassConfig(enable_fusion=True,
enable_noop=True)
backend = TestBackend(reshape_pass, fusion_pass) noop_pass = NoOpEliminationPass(config)
model = TestModel(hidden_size, eps, static) fusion_pass = FusionPass.instance(config)
# First dimension dynamic backend = TestBackend(noop_pass, fusion_pass)
x = torch.rand(num_tokens, hidden_size) model = TestModel(hidden_size, eps, static, cutlass_fp8_enabled)
torch._dynamo.mark_dynamic(x, 0)
# First dimension dynamic
result = model(x) x = torch.rand(num_tokens, hidden_size)
torch._dynamo.mark_dynamic(x, 0)
model2 = torch.compile(model, backend=backend)
result2 = model2(x) result = model(x)
# Higher tol for dynamic, even higher for bfloat16 model2 = torch.compile(model, backend=backend)
if static: result2 = model2(x)
ATOL, RTOL = (1e-3, 1e-3)
elif dtype == torch.float16: # Higher tol for dynamic, even higher for bfloat16
ATOL, RTOL = (2e-3, 2e-3) if static:
else: ATOL, RTOL = (1e-3, 1e-3)
ATOL, RTOL = (1e-2, 1e-2) elif dtype == torch.float16:
ATOL, RTOL = (2e-3, 2e-3)
torch.testing.assert_close(result, result2, atol=ATOL, rtol=RTOL) else:
ATOL, RTOL = (1e-2, 1e-2)
# Check substitution worked
pre_nodes = backend.graph_pre_pass.nodes torch.testing.assert_close(result, result2, atol=ATOL, rtol=RTOL)
post_nodes = backend.graph_post_pass.nodes
# Check substitution worked
# static is per-tensor, dynamic is per-token pre_nodes = backend.graph_pre_pass.nodes
key = QuantKey(dtype=FP8_DTYPE, post_nodes = backend.graph_post_pass.nodes
static=static,
per_tensor=static, # static is per-tensor, dynamic is per-token
symmetric=True) key = QuantKey(dtype=FP8_DTYPE,
rms_quant = FUSED_OPS[FusedRMSQuantKey(key, False)] static=static,
add_rms_quant = FUSED_OPS[FusedRMSQuantKey(key, True)] per_tensor=static,
fp8_quant = QUANT_OPS[key] symmetric=True)
rms_quant = FUSED_OPS[FusedRMSQuantKey(key, False)]
# In pre-nodes, fp8 quant should be present and fused kernels should not add_rms_quant = FUSED_OPS[FusedRMSQuantKey(key, True)]
assert find_auto_fn_maybe(pre_nodes, rms_quant) is None fp8_quant = QUANT_OPS[key]
assert find_auto_fn_maybe(pre_nodes, add_rms_quant) is None
find_auto_fn(pre_nodes, fp8_quant) # In pre-nodes, fp8 quant should be there and fused kernels should not
assert find_auto_fn_maybe(pre_nodes, rms_quant) is None
# In post-nodes, fused kernels should be present and fp8 quant should not assert find_auto_fn_maybe(pre_nodes, add_rms_quant) is None
find_auto_fn(post_nodes, rms_quant) find_auto_fn(pre_nodes, fp8_quant)
find_auto_fn(post_nodes, add_rms_quant)
assert find_auto_fn_maybe(post_nodes, fp8_quant) is None # In post-nodes, fused kernels should be there and fp8 quant should not
find_auto_fn(post_nodes, rms_quant)
find_auto_fn(post_nodes, add_rms_quant)
assert find_auto_fn_maybe(post_nodes, fp8_quant) is None
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import copy
import pickle
import pytest import pytest
import torch import torch
from torch._inductor.codecache import BypassFxGraphCache
from vllm.compilation.config import CompilationConfig from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
from vllm.compilation.inductor_pass import (CallableInductorPass,
as_inductor_pass)
from vllm.compilation.pass_manager import PostGradPassManager from vllm.compilation.pass_manager import PostGradPassManager
from vllm.config import CompilationConfig
# dummy custom pass that doesn't inherit
def simple_callable(graph: torch.fx.Graph):
    """No-op pass: accept a graph and leave it untouched."""
    pass
# Should fail to add directly to the pass manager
def test_bad_callable():
    """Adding a bare function to the pass manager must raise AssertionError."""
    config = CompilationConfig().pass_config

    pass_manager = PostGradPassManager()
    pass_manager.configure(config)

    with pytest.raises(AssertionError):
        pass_manager.add(simple_callable)  # noqa, type wrong on purpose
# Pass that inherits from InductorPass
class ProperPass(InductorPass):
    """Minimal InductorPass subclass whose call does nothing to the graph."""

    def __call__(self, graph: torch.fx.graph.Graph) -> None:
        pass
@pytest.mark.parametrize(
    "callable",
    [
        ProperPass(),
        # Can also wrap callables in CallableInductorPass for compliance
        CallableInductorPass(simple_callable),
        CallableInductorPass(simple_callable,
                             InductorPass.hash_source(__file__))
    ],
)
def test_pass_manager_uuid(callable):
    """Check how ``PostGradPassManager.uuid()`` reacts to passes and config.

    Three properties are asserted: adding the same pass a second time
    changes the UUID, an identically built manager yields the same UUID,
    and flipping ``enable_fusion`` in the pass config yields a different
    one.
    """
    config = CompilationConfig().pass_config

    pass_manager = PostGradPassManager()
    pass_manager.configure(config)

    # Check that UUID is different if the same pass is added 2x
    pass_manager.add(callable)
    uuid1 = pass_manager.uuid()
    pass_manager.add(callable)
    uuid2 = pass_manager.uuid()
    assert uuid1 != uuid2

    # UUID should be the same as the original one,
    # as we constructed in the same way.
    pass_manager2 = PostGradPassManager()
    pass_manager2.configure(config)
    pass_manager2.add(callable)
    assert uuid1 == pass_manager2.uuid()

    # UUID should be different due to config change
    config2 = copy.deepcopy(config)
    config2.enable_fusion = not config2.enable_fusion
    pass_manager3 = PostGradPassManager()
    pass_manager3.configure(config2)
    pass_manager3.add(callable)
    assert uuid1 != pass_manager3.uuid()
...@@ -5,8 +5,8 @@ import os ...@@ -5,8 +5,8 @@ import os
import tempfile import tempfile
from collections import UserList from collections import UserList
from enum import Enum from enum import Enum
from typing import (Any, Callable, Dict, List, Optional, Tuple, Type,
TypedDict, TypeVar, Union) from typing import Any, Callable, Optional, TypedDict, TypeVar, Union
import pytest import pytest
import pytest_html import pytest_html
...@@ -17,8 +17,8 @@ import torch.nn as nn ...@@ -17,8 +17,8 @@ import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from PIL import Image from PIL import Image
from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding, from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
BatchFeature) BatchEncoding, BatchFeature)
from transformers.models.auto.auto_factory import _BaseAutoModelClass from transformers.models.auto.auto_factory import _BaseAutoModelClass
from tests.models.utils import (TokensTextLogprobs, from tests.models.utils import (TokensTextLogprobs,
...@@ -26,7 +26,7 @@ from tests.models.utils import (TokensTextLogprobs, ...@@ -26,7 +26,7 @@ from tests.models.utils import (TokensTextLogprobs,
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset from vllm.assets.video import VideoAsset
from vllm.config import LoadFormat, TaskOption, TokenizerPoolConfig from vllm.config import TaskOption, TokenizerPoolConfig, _get_and_verify_dtype
from vllm.connections import global_http_connection from vllm.connections import global_http_connection
from vllm.distributed import (cleanup_dist_env_and_memory, from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment, init_distributed_environment,
...@@ -37,8 +37,8 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, ...@@ -37,8 +37,8 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams from vllm.sampling_params import BeamSearchParams
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
identity, is_list_of) from vllm.utils import cuda_device_count_stateless, is_list_of
from .utils import models_path_prefix from .utils import models_path_prefix
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -50,78 +50,14 @@ _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt") ...@@ -50,78 +50,14 @@ _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
_M = TypeVar("_M") _M = TypeVar("_M")
MODELS_ON_S3 = [ _PromptMultiModalInput = Union[list[_M], list[list[_M]]]
"distilbert/distilgpt2",
"meta-llama/Llama-2-7b-hf",
"meta-llama/Meta-Llama-3-8B",
"meta-llama/Llama-3.2-1B",
"meta-llama/Llama-3.2-1B-Instruct",
"openai-community/gpt2",
"ArthurZ/Ilama-3.2-1B",
"llava-hf/llava-1.5-7b-hf",
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"ai21labs/Jamba-tiny-random",
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
"nm-testing/Phi-3-mini-128k-instruct-FP8",
"nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
"nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
"AMead10/Llama-3.2-1B-Instruct-AWQ",
"shuyuej/Llama-3.2-1B-Instruct-GPTQ",
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head",
"ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024",
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
"amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
"nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
"nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
"neuralmagic/Llama-3.2-1B-quantized.w8a8",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
"nm-testing/tinyllama-oneshot-w4a16-channel-v2",
"nm-testing/tinyllama-oneshot-w4a16-group128-v2",
"nm-testing/tinyllama-oneshot-w8a16-per-channel",
"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
"nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test",
"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
"nm-testing/llama2.c-stories42M-pruned2.4-compressed",
]
MODEL_WEIGHTS_S3_BUCKET = models_path_prefix
_PromptMultiModalInput = Union[List[_M], List[List[_M]]]
PromptImageInput = _PromptMultiModalInput[Image.Image] PromptImageInput = _PromptMultiModalInput[Image.Image]
PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]] PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]]
PromptVideoInput = _PromptMultiModalInput[np.ndarray] PromptVideoInput = _PromptMultiModalInput[np.ndarray]
def _read_prompts(filename: str) -> list[str]:
    """Return every line of *filename*, line endings included."""
    with open(filename) as f:
        prompts = f.readlines()
    return prompts
...@@ -144,7 +80,7 @@ class _ImageAssets(_ImageAssetsBase): ...@@ -144,7 +80,7 @@ class _ImageAssets(_ImageAssetsBase):
ImageAsset("cherry_blossom"), ImageAsset("cherry_blossom"),
]) ])
def prompts(self, prompts: _ImageAssetPrompts) -> List[str]: def prompts(self, prompts: _ImageAssetPrompts) -> list[str]:
""" """
Convenience method to define the prompt for each test image. Convenience method to define the prompt for each test image.
...@@ -169,7 +105,7 @@ class _VideoAssets(_VideoAssetsBase): ...@@ -169,7 +105,7 @@ class _VideoAssets(_VideoAssetsBase):
VideoAsset("sample_demo_1.mp4"), VideoAsset("sample_demo_1.mp4"),
]) ])
def prompts(self, prompts: _VideoAssetPrompts) -> List[str]: def prompts(self, prompts: _VideoAssetPrompts) -> list[str]:
return [prompts["sample_demo_1"]] return [prompts["sample_demo_1"]]
...@@ -179,6 +115,26 @@ VIDEO_ASSETS = _VideoAssets() ...@@ -179,6 +115,26 @@ VIDEO_ASSETS = _VideoAssets()
"""Singleton instance of :class:`_VideoAssets`.""" """Singleton instance of :class:`_VideoAssets`."""
@pytest.fixture(scope="function", autouse=True)
def cleanup_VLLM_USE_V1(monkeypatch):
    """
    The V1 oracle sets "VLLM_USE_V1" during loading. This means
    that each invocation of a test changes the env variable.

    If we touch "VLLM_USE_V1" with monkeypatch, then any changes
    made during the test run by vLLM will be cleaned up.

    This fixture is used by every test.
    """

    # If VLLM_USE_V1 is not set, set it and then delete it. This will
    # cause monkeypatch to clean up VLLM_USE_V1 upon exit
    # if vLLM modifies the value of envs.VLLM_USE_V1.
    if "VLLM_USE_V1" not in os.environ:
        monkeypatch.setenv("VLLM_USE_V1", "")
        monkeypatch.delenv("VLLM_USE_V1")
@pytest.fixture(params=[True, False]) @pytest.fixture(params=[True, False])
def run_with_both_engines(request, monkeypatch): def run_with_both_engines(request, monkeypatch):
# Automatically runs tests twice, once with V1 and once without # Automatically runs tests twice, once with V1 and once without
...@@ -242,7 +198,7 @@ def dynamo_reset(): ...@@ -242,7 +198,7 @@ def dynamo_reset():
@pytest.fixture @pytest.fixture
def example_prompts() -> List[str]: def example_prompts() -> list[str]:
prompts = [] prompts = []
for filename in _TEST_PROMPTS: for filename in _TEST_PROMPTS:
prompts += _read_prompts(filename) prompts += _read_prompts(filename)
...@@ -264,7 +220,7 @@ class DecoderPromptType(Enum): ...@@ -264,7 +220,7 @@ class DecoderPromptType(Enum):
@pytest.fixture @pytest.fixture
def example_encoder_decoder_prompts( def example_encoder_decoder_prompts(
) -> Dict[DecoderPromptType, List[ExplicitEncoderDecoderPrompt]]: ) -> dict[DecoderPromptType, list[ExplicitEncoderDecoderPrompt]]:
''' '''
Returns an encoder prompt list and a decoder prompt list, wherein each pair Returns an encoder prompt list and a decoder prompt list, wherein each pair
of same-index entries in both lists corresponds to an (encoder prompt, of same-index entries in both lists corresponds to an (encoder prompt,
...@@ -296,7 +252,7 @@ def example_encoder_decoder_prompts( ...@@ -296,7 +252,7 @@ def example_encoder_decoder_prompts(
@pytest.fixture @pytest.fixture
def example_long_prompts() -> List[str]: def example_long_prompts() -> list[str]:
prompts = [] prompts = []
for filename in _LONG_PROMPTS: for filename in _LONG_PROMPTS:
prompts += _read_prompts(filename) prompts += _read_prompts(filename)
...@@ -319,13 +275,17 @@ _R = TypeVar("_R") ...@@ -319,13 +275,17 @@ _R = TypeVar("_R")
class HfRunner: class HfRunner:
def get_default_device(self):
    """Return "cpu" when the current vLLM platform is CPU, else "cuda"."""
    # Imported lazily, as in the original block.
    from vllm.platforms import current_platform

    return ("cpu" if current_platform.is_cpu() else "cuda")
def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
if x is None or isinstance(x, (bool, )): if x is None or isinstance(x, (bool, )):
return x return x
if device is None: if device is None:
device = "cpu" if current_platform.is_cpu() else "cuda" device = self.device
if isinstance(x, dict): if isinstance(x, dict):
return {k: self.wrap_device(v, device) for k, v in x.items()} return {k: self.wrap_device(v, device) for k, v in x.items()}
...@@ -338,45 +298,59 @@ class HfRunner: ...@@ -338,45 +298,59 @@ class HfRunner:
def __init__( def __init__(
self, self,
model_name: str, model_name: str,
dtype: str = "half", dtype: str = "auto",
*, *,
model_kwargs: Optional[Dict[str, Any]] = None, model_kwargs: Optional[dict[str, Any]] = None,
is_sentence_transformer: bool = False, is_sentence_transformer: bool = False,
is_cross_encoder: bool = False, is_cross_encoder: bool = False,
skip_tokenizer_init: bool = False, skip_tokenizer_init: bool = False,
auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM, auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
postprocess_inputs: Callable[..., BatchEncoding] = identity,
) -> None: ) -> None:
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
self.model_name = model_name self.model_name = model_name
self.config = AutoConfig.from_pretrained(
model_name,
trust_remote_code=True,
)
self.device = self.get_default_device()
self.dtype = torch_dtype = _get_and_verify_dtype(self.config, dtype)
model_kwargs = model_kwargs if model_kwargs is not None else {}
model_kwargs.setdefault("torch_dtype", torch_dtype)
if is_sentence_transformer: if is_sentence_transformer:
# Lazy init required for AMD CI # Lazy init required for AMD CI
from sentence_transformers import SentenceTransformer from sentence_transformers import SentenceTransformer
self.model = self.wrap_device(
SentenceTransformer( self.model = SentenceTransformer(
model_name, model_name,
device="cpu", device=self.device,
trust_remote_code=True, model_kwargs=model_kwargs,
).to(dtype=torch_dtype)) trust_remote_code=True,
)
elif is_cross_encoder: elif is_cross_encoder:
# Lazy init required for AMD CI # Lazy init required for AMD CI
from sentence_transformers import CrossEncoder from sentence_transformers import CrossEncoder
self.model = CrossEncoder(model_name,
device="cpu", self.model = CrossEncoder(
trust_remote_code=True) model_name,
self.model.model = self.wrap_device(self.model.model)\ device=self.device,
.to(dtype=torch_dtype) automodel_args=model_kwargs,
trust_remote_code=True,
)
else: else:
model_kwargs = model_kwargs if model_kwargs is not None else {} model = auto_cls.from_pretrained(
self.model = self.wrap_device( model_name,
auto_cls.from_pretrained( trust_remote_code=True,
model_name, **model_kwargs,
torch_dtype=torch_dtype, )
trust_remote_code=True,
**model_kwargs, if (getattr(model, "quantization_method", None) != "bitsandbytes"
)) and len({p.device
for p in model.parameters()}) < 2):
model = model.to(self.device)
self.model = model
if not skip_tokenizer_init: if not skip_tokenizer_init:
self.tokenizer = AutoTokenizer.from_pretrained( self.tokenizer = AutoTokenizer.from_pretrained(
...@@ -396,16 +370,13 @@ class HfRunner: ...@@ -396,16 +370,13 @@ class HfRunner:
if skip_tokenizer_init: if skip_tokenizer_init:
self.tokenizer = self.processor.tokenizer self.tokenizer = self.processor.tokenizer
self.dtype = dtype
self.postprocess_inputs = postprocess_inputs
def get_inputs( def get_inputs(
self, self,
prompts: List[str], prompts: list[str],
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None, videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None, audios: Optional[PromptAudioInput] = None,
) -> List[BatchEncoding]: ) -> list[Union[BatchFeature, BatchEncoding]]:
if images is not None: if images is not None:
assert len(prompts) == len(images) assert len(prompts) == len(images)
...@@ -415,9 +386,9 @@ class HfRunner: ...@@ -415,9 +386,9 @@ class HfRunner:
if audios is not None: if audios is not None:
assert len(prompts) == len(audios) assert len(prompts) == len(audios)
all_inputs: List[BatchEncoding] = [] all_inputs: list[Union[BatchFeature, BatchEncoding]] = []
for i, prompt in enumerate(prompts): for i, prompt in enumerate(prompts):
processor_kwargs: Dict[str, Any] = { processor_kwargs: dict[str, Any] = {
"text": prompt, "text": prompt,
"return_tensors": "pt", "return_tensors": "pt",
} }
...@@ -431,13 +402,14 @@ class HfRunner: ...@@ -431,13 +402,14 @@ class HfRunner:
processor_kwargs["sampling_rate"] = sr processor_kwargs["sampling_rate"] = sr
inputs = self.processor(**processor_kwargs) inputs = self.processor(**processor_kwargs)
inputs = self.postprocess_inputs(inputs, dtype=self.dtype) if isinstance(inputs, BatchFeature):
inputs = inputs.to(dtype=self.dtype)
all_inputs.append(inputs) all_inputs.append(inputs)
return all_inputs return all_inputs
def classify(self, prompts: List[str]) -> List[str]: def classify(self, prompts: list[str]) -> list[str]:
# output is final logits # output is final logits
all_inputs = self.get_inputs(prompts) all_inputs = self.get_inputs(prompts)
outputs = [] outputs = []
...@@ -450,21 +422,21 @@ class HfRunner: ...@@ -450,21 +422,21 @@ class HfRunner:
def generate( def generate(
self, self,
prompts: List[str], prompts: list[str],
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None, videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None, audios: Optional[PromptAudioInput] = None,
**kwargs: Any, **kwargs: Any,
) -> List[Tuple[List[List[int]], List[str]]]: ) -> list[tuple[list[list[int]], list[str]]]:
all_inputs = self.get_inputs(prompts, all_inputs = self.get_inputs(prompts,
images=images, images=images,
videos=videos, videos=videos,
audios=audios) audios=audios)
outputs: List[Tuple[List[List[int]], List[str]]] = [] outputs: list[tuple[list[list[int]], list[str]]] = []
for inputs in all_inputs: for inputs in all_inputs:
output_ids = self.model.generate( output_ids = self.model.generate(
**self.wrap_device(inputs, device=self.model.device.type), **self.wrap_device(inputs),
use_cache=True, use_cache=True,
**kwargs, **kwargs,
) )
...@@ -479,13 +451,13 @@ class HfRunner: ...@@ -479,13 +451,13 @@ class HfRunner:
def generate_greedy( def generate_greedy(
self, self,
prompts: List[str], prompts: list[str],
max_tokens: int, max_tokens: int,
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None, videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None, audios: Optional[PromptAudioInput] = None,
**kwargs: Any, **kwargs: Any,
) -> List[Tuple[List[int], str]]: ) -> list[tuple[list[int], str]]:
outputs = self.generate(prompts, outputs = self.generate(prompts,
do_sample=False, do_sample=False,
max_new_tokens=max_tokens, max_new_tokens=max_tokens,
...@@ -499,10 +471,10 @@ class HfRunner: ...@@ -499,10 +471,10 @@ class HfRunner:
def generate_beam_search( def generate_beam_search(
self, self,
prompts: List[str], prompts: list[str],
beam_width: int, beam_width: int,
max_tokens: int, max_tokens: int,
) -> List[Tuple[List[List[int]], List[str]]]: ) -> list[tuple[list[list[int]], list[str]]]:
outputs = self.generate(prompts, outputs = self.generate(prompts,
do_sample=False, do_sample=False,
max_new_tokens=max_tokens, max_new_tokens=max_tokens,
...@@ -520,22 +492,22 @@ class HfRunner: ...@@ -520,22 +492,22 @@ class HfRunner:
def generate_greedy_logprobs( def generate_greedy_logprobs(
self, self,
prompts: List[str], prompts: list[str],
max_tokens: int, max_tokens: int,
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None, videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None, audios: Optional[PromptAudioInput] = None,
**kwargs: Any, **kwargs: Any,
) -> List[List[torch.Tensor]]: ) -> list[list[torch.Tensor]]:
all_inputs = self.get_inputs(prompts, all_inputs = self.get_inputs(prompts,
images=images, images=images,
videos=videos, videos=videos,
audios=audios) audios=audios)
all_logprobs: List[List[torch.Tensor]] = [] all_logprobs: list[list[torch.Tensor]] = []
for inputs in all_inputs: for inputs in all_inputs:
output = self.model.generate( output = self.model.generate(
**self.wrap_device(inputs, device=self.model.device.type), **self.wrap_device(inputs),
use_cache=True, use_cache=True,
do_sample=False, do_sample=False,
max_new_tokens=max_tokens, max_new_tokens=max_tokens,
...@@ -550,11 +522,11 @@ class HfRunner: ...@@ -550,11 +522,11 @@ class HfRunner:
def _hidden_states_to_seq_logprobs( def _hidden_states_to_seq_logprobs(
self, self,
hidden_states: Tuple[Tuple[torch.Tensor, ...], ...], hidden_states: tuple[tuple[torch.Tensor, ...], ...],
) -> List[torch.Tensor]: ) -> list[torch.Tensor]:
output_embeddings = self.model.get_output_embeddings() output_embeddings = self.model.get_output_embeddings()
seq_logprobs: List[torch.Tensor] = [] seq_logprobs: list[torch.Tensor] = []
for _, hidden_state in enumerate(hidden_states): for _, hidden_state in enumerate(hidden_states):
last_hidden_states = hidden_state[-1][0] last_hidden_states = hidden_state[-1][0]
logits = torch.matmul( logits = torch.matmul(
...@@ -570,14 +542,14 @@ class HfRunner: ...@@ -570,14 +542,14 @@ class HfRunner:
def _hidden_states_to_logprobs( def _hidden_states_to_logprobs(
self, self,
hidden_states: Tuple[Tuple[torch.Tensor, ...], ...], hidden_states: tuple[tuple[torch.Tensor, ...], ...],
num_logprobs: int, num_logprobs: int,
) -> Tuple[List[Dict[int, float]], int]: ) -> tuple[list[dict[int, float]], int]:
seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states) seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states)
output_len = len(hidden_states) output_len = len(hidden_states)
# convert to dict # convert to dict
seq_logprobs_lst: List[Dict[int, float]] = [] seq_logprobs_lst: list[dict[int, float]] = []
for tok_idx, tok_logprobs in enumerate(seq_logprobs): for tok_idx, tok_logprobs in enumerate(seq_logprobs):
# drop prompt logprobs # drop prompt logprobs
if tok_idx == 0: if tok_idx == 0:
...@@ -597,26 +569,26 @@ class HfRunner: ...@@ -597,26 +569,26 @@ class HfRunner:
def generate_greedy_logprobs_limit( def generate_greedy_logprobs_limit(
self, self,
prompts: List[str], prompts: list[str],
max_tokens: int, max_tokens: int,
num_logprobs: int, num_logprobs: int,
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None, audios: Optional[PromptAudioInput] = None,
videos: Optional[PromptVideoInput] = None, videos: Optional[PromptVideoInput] = None,
**kwargs: Any, **kwargs: Any,
) -> List[TokensTextLogprobs]: ) -> list[TokensTextLogprobs]:
all_inputs = self.get_inputs(prompts, all_inputs = self.get_inputs(prompts,
images=images, images=images,
videos=videos, videos=videos,
audios=audios) audios=audios)
all_logprobs: List[List[Dict[int, float]]] = [] all_logprobs: list[list[dict[int, float]]] = []
all_output_ids: List[List[int]] = [] all_output_ids: list[list[int]] = []
all_output_strs: List[str] = [] all_output_strs: list[str] = []
for inputs in all_inputs: for inputs in all_inputs:
output = self.model.generate( output = self.model.generate(
**self.wrap_device(inputs, device=self.model.device.type), **self.wrap_device(inputs),
use_cache=True, use_cache=True,
do_sample=False, do_sample=False,
max_new_tokens=max_tokens, max_new_tokens=max_tokens,
...@@ -644,51 +616,47 @@ class HfRunner: ...@@ -644,51 +616,47 @@ class HfRunner:
def generate_encoder_decoder_greedy_logprobs_limit( def generate_encoder_decoder_greedy_logprobs_limit(
self, self,
encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]], encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
max_tokens: int, max_tokens: int,
num_logprobs: int, num_logprobs: int,
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
**kwargs: Any, **kwargs: Any,
) -> List[TokensTextLogprobs]: ) -> list[TokensTextLogprobs]:
''' '''
Greedy logprobs generation for vLLM encoder/decoder models Greedy logprobs generation for vLLM encoder/decoder models
''' '''
all_logprobs: List[List[Dict[int, float]]] = [] all_logprobs: list[list[dict[int, float]]] = []
all_output_ids: List[List[int]] = [] all_output_ids: list[list[int]] = []
all_output_strs: List[str] = [] all_output_strs: list[str] = []
for i, (encoder_prompt, decoder_prompt) in enumerate( for i, (encoder_prompt, decoder_prompt) in enumerate(
to_enc_dec_tuple_list(encoder_decoder_prompts)): to_enc_dec_tuple_list(encoder_decoder_prompts)):
processor_kwargs: Dict[str, Any] = { processor_kwargs: dict[str, Any] = {
"text": encoder_prompt, "text": encoder_prompt,
"return_tensors": "pt", "return_tensors": "pt",
} }
if images is not None and images[i] is not None: if images is not None and images[i] is not None:
processor_kwargs["images"] = images[i] processor_kwargs["images"] = images[i]
encoder_input_ids = self.wrap_device( encoder_inputs = self.processor(**processor_kwargs)
self.processor(**processor_kwargs).input_ids, encoder_inputs = self.wrap_device(encoder_inputs)
device=self.model.device.type,
)
if decoder_prompt is None: if decoder_prompt is None:
decoder_input_ids = None decoder_input_ids = None
else: else:
decoder_input_ids = self.wrap_device( decoder_inputs = self.tokenizer(decoder_prompt,
self.tokenizer(decoder_prompt, return_tensors="pt")
return_tensors="pt").input_ids, decoder_input_ids = self.wrap_device(decoder_inputs.input_ids)
device=self.model.device.type,
)
output = self.model.generate( output = self.model.generate(
encoder_input_ids,
decoder_input_ids=decoder_input_ids, decoder_input_ids=decoder_input_ids,
use_cache=True, use_cache=True,
do_sample=False, do_sample=False,
max_new_tokens=max_tokens, max_new_tokens=max_tokens,
output_hidden_states=True, output_hidden_states=True,
return_dict_in_generate=True, return_dict_in_generate=True,
**encoder_inputs,
**kwargs, **kwargs,
) )
...@@ -708,10 +676,10 @@ class HfRunner: ...@@ -708,10 +676,10 @@ class HfRunner:
return [(output_ids, output_str, output_logprobs) return [(output_ids, output_str, output_logprobs)
for output_ids, output_str, output_logprobs in outputs] for output_ids, output_str, output_logprobs in outputs]
def encode(self, prompts: list[str]) -> list[list[torch.Tensor]]:
    """Delegate to ``self.model.encode`` and return its result unchanged."""
    return self.model.encode(prompts)
def predict(self, prompts: List[List[str]]) -> torch.Tensor: def predict(self, prompts: list[list[str]]) -> torch.Tensor:
return self.model.predict(prompts, convert_to_tensor=True) return self.model.predict(prompts, convert_to_tensor=True)
def __enter__(self): def __enter__(self):
...@@ -728,6 +696,18 @@ def hf_runner(): ...@@ -728,6 +696,18 @@ def hf_runner():
class VllmRunner: class VllmRunner:
"""
The default values of some arguments have been modified from
:class:`~vllm.LLM` as follows:
- `trust_remote_code`: Set to `True` instead of `False` for convenience.
- `seed`: Set to `0` instead of `None` for test reproducibility.
- `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
- `block_size`: Set to `16` instead of `None` to reduce memory usage.
- `enable_chunked_prefill`: Set to `False` instead of `None` for
test reproducibility.
- `enforce_eager`: Set to `False` instead of `None` to test CUDA graphs.
"""
def __init__( def __init__(
self, self,
...@@ -735,31 +715,26 @@ class VllmRunner: ...@@ -735,31 +715,26 @@ class VllmRunner:
task: TaskOption = "auto", task: TaskOption = "auto",
tokenizer_name: Optional[str] = None, tokenizer_name: Optional[str] = None,
tokenizer_mode: str = "auto", tokenizer_mode: str = "auto",
# Use smaller max model length, otherwise bigger model cannot run due trust_remote_code: bool = True,
# to kv cache size limit. seed: Optional[int] = 0,
max_model_len: int = 1024, max_model_len: int = 1024,
dtype: str = "half", dtype: str = "auto",
disable_log_stats: bool = True, disable_log_stats: bool = True,
tensor_parallel_size: int = 1, tensor_parallel_size: int = 1,
block_size: int = 16, block_size: int = 16,
enable_chunked_prefill: bool = False, enable_chunked_prefill: Optional[bool] = False,
swap_space: int = 4, swap_space: int = 4,
enforce_eager: Optional[bool] = False, enforce_eager: Optional[bool] = False,
load_format: Optional[LoadFormat] = None,
**kwargs, **kwargs,
) -> None: ) -> None:
if model_name in MODELS_ON_S3 and not load_format:
model_name = (f"{MODEL_WEIGHTS_S3_BUCKET}/{model_name}")
load_format = LoadFormat.RUNAI_STREAMER
if not load_format:
load_format = LoadFormat.AUTO
self.model = LLM( self.model = LLM(
model=model_name, model=model_name,
task=task, task=task,
tokenizer=tokenizer_name, tokenizer=tokenizer_name,
tokenizer_mode=tokenizer_mode, tokenizer_mode=tokenizer_mode,
trust_remote_code=True, trust_remote_code=trust_remote_code,
dtype=dtype, dtype=dtype,
seed=seed,
swap_space=swap_space, swap_space=swap_space,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
disable_log_stats=disable_log_stats, disable_log_stats=disable_log_stats,
...@@ -767,17 +742,16 @@ class VllmRunner: ...@@ -767,17 +742,16 @@ class VllmRunner:
max_model_len=max_model_len, max_model_len=max_model_len,
block_size=block_size, block_size=block_size,
enable_chunked_prefill=enable_chunked_prefill, enable_chunked_prefill=enable_chunked_prefill,
load_format=load_format,
**kwargs, **kwargs,
) )
def get_inputs( def get_inputs(
self, self,
prompts: List[str], prompts: list[str],
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None, videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None, audios: Optional[PromptAudioInput] = None,
) -> List[TextPrompt]: ) -> list[TextPrompt]:
if images is not None: if images is not None:
assert len(prompts) == len(images) assert len(prompts) == len(images)
...@@ -807,13 +781,13 @@ class VllmRunner: ...@@ -807,13 +781,13 @@ class VllmRunner:
def generate( def generate(
self, self,
prompts: List[str], prompts: list[str],
sampling_params: SamplingParams, sampling_params: SamplingParams,
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None, videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None, audios: Optional[PromptAudioInput] = None,
**kwargs: Any, **kwargs: Any,
) -> List[Tuple[List[List[int]], List[str]]]: ) -> list[tuple[list[list[int]], list[str]]]:
inputs = self.get_inputs(prompts, inputs = self.get_inputs(prompts,
images=images, images=images,
videos=videos, videos=videos,
...@@ -823,12 +797,12 @@ class VllmRunner: ...@@ -823,12 +797,12 @@ class VllmRunner:
sampling_params=sampling_params, sampling_params=sampling_params,
**kwargs) **kwargs)
outputs: List[Tuple[List[List[int]], List[str]]] = [] outputs: list[tuple[list[list[int]], list[str]]] = []
for req_output in req_outputs: for req_output in req_outputs:
prompt_str = req_output.prompt prompt_str = req_output.prompt
prompt_ids = req_output.prompt_token_ids prompt_ids = req_output.prompt_token_ids
req_sample_output_ids: List[List[int]] = [] req_sample_output_ids: list[list[int]] = []
req_sample_output_strs: List[str] = [] req_sample_output_strs: list[str] = []
for sample in req_output.outputs: for sample in req_output.outputs:
output_str = sample.text output_str = sample.text
output_ids = list(sample.token_ids) output_ids = list(sample.token_ids)
...@@ -839,9 +813,9 @@ class VllmRunner: ...@@ -839,9 +813,9 @@ class VllmRunner:
@staticmethod @staticmethod
def _final_steps_generate_w_logprobs( def _final_steps_generate_w_logprobs(
req_outputs: List[RequestOutput], req_outputs: list[RequestOutput],
) -> List[TokensTextLogprobsPromptLogprobs]: ) -> list[TokensTextLogprobsPromptLogprobs]:
outputs: List[TokensTextLogprobsPromptLogprobs] = [] outputs: list[TokensTextLogprobsPromptLogprobs] = []
for req_output in req_outputs: for req_output in req_outputs:
assert len(req_output.outputs) > 0 assert len(req_output.outputs) > 0
for sample in req_output.outputs: for sample in req_output.outputs:
...@@ -854,14 +828,14 @@ class VllmRunner: ...@@ -854,14 +828,14 @@ class VllmRunner:
def generate_w_logprobs( def generate_w_logprobs(
self, self,
prompts: List[str], prompts: list[str],
sampling_params: SamplingParams, sampling_params: SamplingParams,
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None, audios: Optional[PromptAudioInput] = None,
videos: Optional[PromptVideoInput] = None, videos: Optional[PromptVideoInput] = None,
**kwargs: Any, **kwargs: Any,
) -> Union[List[TokensTextLogprobs], ) -> Union[list[TokensTextLogprobs],
List[TokensTextLogprobsPromptLogprobs]]: list[TokensTextLogprobsPromptLogprobs]]:
inputs = self.get_inputs(prompts, inputs = self.get_inputs(prompts,
images=images, images=images,
videos=videos, videos=videos,
...@@ -880,10 +854,10 @@ class VllmRunner: ...@@ -880,10 +854,10 @@ class VllmRunner:
def generate_encoder_decoder_w_logprobs( def generate_encoder_decoder_w_logprobs(
self, self,
encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]], encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
sampling_params: SamplingParams, sampling_params: SamplingParams,
) -> Union[List[TokensTextLogprobs], ) -> Union[list[TokensTextLogprobs],
List[TokensTextLogprobsPromptLogprobs]]: list[TokensTextLogprobsPromptLogprobs]]:
''' '''
Logprobs generation for vLLM encoder/decoder models Logprobs generation for vLLM encoder/decoder models
''' '''
...@@ -900,13 +874,13 @@ class VllmRunner: ...@@ -900,13 +874,13 @@ class VllmRunner:
def generate_greedy( def generate_greedy(
self, self,
prompts: List[str], prompts: list[str],
max_tokens: int, max_tokens: int,
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None, videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None, audios: Optional[PromptAudioInput] = None,
**kwargs: Any, **kwargs: Any,
) -> List[Tuple[List[int], str]]: ) -> list[tuple[list[int], str]]:
greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
outputs = self.generate(prompts, outputs = self.generate(prompts,
greedy_params, greedy_params,
...@@ -919,18 +893,18 @@ class VllmRunner: ...@@ -919,18 +893,18 @@ class VllmRunner:
def generate_greedy_logprobs( def generate_greedy_logprobs(
self, self,
prompts: List[str], prompts: list[str],
max_tokens: int, max_tokens: int,
num_logprobs: int, num_logprobs: int,
num_prompt_logprobs: Optional[int] = None, num_prompt_logprobs: Optional[int] = None,
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None, audios: Optional[PromptAudioInput] = None,
videos: Optional[PromptVideoInput] = None, videos: Optional[PromptVideoInput] = None,
stop_token_ids: Optional[List[int]] = None, stop_token_ids: Optional[list[int]] = None,
stop: Optional[List[str]] = None, stop: Optional[list[str]] = None,
**kwargs: Any, **kwargs: Any,
) -> Union[List[TokensTextLogprobs], ) -> Union[list[TokensTextLogprobs],
List[TokensTextLogprobsPromptLogprobs]]: list[TokensTextLogprobsPromptLogprobs]]:
greedy_logprobs_params = SamplingParams( greedy_logprobs_params = SamplingParams(
temperature=0.0, temperature=0.0,
max_tokens=max_tokens, max_tokens=max_tokens,
...@@ -948,12 +922,12 @@ class VllmRunner: ...@@ -948,12 +922,12 @@ class VllmRunner:
def generate_encoder_decoder_greedy_logprobs( def generate_encoder_decoder_greedy_logprobs(
self, self,
encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]], encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
max_tokens: int, max_tokens: int,
num_logprobs: int, num_logprobs: int,
num_prompt_logprobs: Optional[int] = None, num_prompt_logprobs: Optional[int] = None,
) -> Union[List[TokensTextLogprobs], ) -> Union[list[TokensTextLogprobs],
List[TokensTextLogprobsPromptLogprobs]]: list[TokensTextLogprobsPromptLogprobs]]:
greedy_logprobs_params = SamplingParams( greedy_logprobs_params = SamplingParams(
temperature=0.0, temperature=0.0,
max_tokens=max_tokens, max_tokens=max_tokens,
...@@ -969,10 +943,10 @@ class VllmRunner: ...@@ -969,10 +943,10 @@ class VllmRunner:
def generate_beam_search( def generate_beam_search(
self, self,
prompts: Union[List[str], List[List[int]]], prompts: Union[list[str], list[list[int]]],
beam_width: int, beam_width: int,
max_tokens: int, max_tokens: int,
) -> List[Tuple[List[List[int]], List[str]]]: ) -> list[tuple[list[list[int]], list[str]]]:
if is_list_of(prompts, str, check="all"): if is_list_of(prompts, str, check="all"):
prompts = [TextPrompt(prompt=prompt) for prompt in prompts] prompts = [TextPrompt(prompt=prompt) for prompt in prompts]
else: else:
...@@ -989,17 +963,17 @@ class VllmRunner: ...@@ -989,17 +963,17 @@ class VllmRunner:
returned_outputs.append((token_ids, texts)) returned_outputs.append((token_ids, texts))
return returned_outputs return returned_outputs
def classify(self, prompts: List[str]) -> List[List[float]]: def classify(self, prompts: list[str]) -> list[list[float]]:
req_outputs = self.model.classify(prompts) req_outputs = self.model.classify(prompts)
return [req_output.outputs.probs for req_output in req_outputs] return [req_output.outputs.probs for req_output in req_outputs]
def encode( def encode(
self, self,
prompts: List[str], prompts: list[str],
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None, videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None, audios: Optional[PromptAudioInput] = None,
) -> List[List[float]]: ) -> list[list[float]]:
inputs = self.get_inputs(prompts, inputs = self.get_inputs(prompts,
images=images, images=images,
videos=videos, videos=videos,
...@@ -1010,9 +984,9 @@ class VllmRunner: ...@@ -1010,9 +984,9 @@ class VllmRunner:
def score( def score(
self, self,
text_1: Union[str, List[str]], text_1: Union[str, list[str]],
text_2: Union[str, List[str]], text_2: Union[str, list[str]],
) -> List[float]: ) -> list[float]:
req_outputs = self.model.score(text_1, text_2) req_outputs = self.model.score(text_1, text_2)
return [req_output.outputs.score for req_output in req_outputs] return [req_output.outputs.score for req_output in req_outputs]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment