Commit 469e903b authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.2' into v0.8.2-dev

parents 389ebcf7 25f560a6
......@@ -2,6 +2,7 @@
import ctypes
import importlib.util
import json
import logging
import os
import re
......@@ -9,13 +10,11 @@ import subprocess
import sys
from pathlib import Path
from shutil import which
from typing import Dict, List
import torch
from packaging.version import Version, parse
from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext
from setuptools_scm import get_version
from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME
from typing import Optional, Union
......@@ -35,7 +34,7 @@ def load_module_from_path(module_name, path):
return module
ROOT_DIR = os.path.dirname(__file__)
ROOT_DIR = Path(__file__).parent
logger = logging.getLogger(__name__)
# cannot import envs directly because it depends on vllm,
......@@ -62,7 +61,7 @@ elif (sys.platform.startswith("linux") and torch.version.cuda is None
# fallback to cpu
VLLM_TARGET_DEVICE = "cpu"
MAIN_CUDA_VERSION = "12.1"
MAIN_CUDA_VERSION = "12.4"
def is_sccache_available() -> bool:
......@@ -77,6 +76,18 @@ def is_ninja_available() -> bool:
return which("ninja") is not None
def is_url_available(url: str) -> bool:
from urllib.request import urlopen
status = None
try:
with urlopen(url) as f:
status = f.status
except Exception:
return False
return status == 200
class CMakeExtension(Extension):
def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None:
......@@ -86,7 +97,7 @@ class CMakeExtension(Extension):
class cmake_build_ext(build_ext):
# A dict of extension directories that have been configured.
did_config: Dict[str, bool] = {}
did_config: dict[str, bool] = {}
#
# Determine number of compilation jobs and optionally nvcc compile threads.
......@@ -278,16 +289,43 @@ class repackage_wheel(build_ext):
"""Extracts libraries and other files from an existing wheel."""
def get_base_commit_in_main_branch(self) -> str:
import subprocess
# Force to use the nightly wheel. This is mainly used for CI testing.
if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL:
return "nightly"
try:
# Get the latest commit hash of the upstream main branch.
resp_json = subprocess.check_output([
"curl", "-s",
"https://api.github.com/repos/vllm-project/vllm/commits/main"
]).decode("utf-8")
upstream_main_commit = json.loads(resp_json)["sha"]
# Check if the upstream_main_commit exists in the local repo
try:
subprocess.check_output(
["git", "cat-file", "-e", f"{upstream_main_commit}"])
except subprocess.CalledProcessError:
# If not present, fetch it from the remote repository.
# Note that this does not update any local branches,
# but ensures that this commit ref and its history are
# available in our local repo.
subprocess.check_call([
"git", "fetch", "https://github.com/vllm-project/vllm",
"main"
])
# Then get the commit hash of the current branch that is the same as
# the upstream main commit.
current_branch = subprocess.check_output(
["git", "branch", "--show-current"]).decode("utf-8").strip()
base_commit = subprocess.check_output(
["git", "merge-base", "main",
current_branch]).decode("utf-8").strip()
base_commit = subprocess.check_output([
"git", "merge-base", f"{upstream_main_commit}", current_branch
]).decode("utf-8").strip()
return base_commit
except ValueError as err:
raise ValueError(err) from None
except Exception as err:
logger.warning(
"Failed to get the base commit in the main branch. "
......@@ -303,6 +341,10 @@ class repackage_wheel(build_ext):
if wheel_location is None:
base_commit = self.get_base_commit_in_main_branch()
wheel_location = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
# Fallback to nightly wheel if latest commit wheel is unavailable,
# in this rare case, the nightly release CI hasn't finished on main.
if not is_url_available(wheel_location):
wheel_location = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
import zipfile
......@@ -336,6 +378,7 @@ class repackage_wheel(build_ext):
files_to_copy = [
"vllm/_C.abi3.so",
"vllm/_moe_C.abi3.so",
"vllm/_flashmla_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
"vllm/vllm_flash_attn/flash_attn_interface.py",
......@@ -413,10 +456,6 @@ def _is_cpu() -> bool:
return VLLM_TARGET_DEVICE == "cpu"
def _is_openvino() -> bool:
return VLLM_TARGET_DEVICE == "openvino"
def _is_xpu() -> bool:
return VLLM_TARGET_DEVICE == "xpu"
......@@ -486,10 +525,6 @@ def get_nvcc_cuda_version() -> Version:
return nvcc_cuda_version
def get_path(*filepath) -> str:
return os.path.join(ROOT_DIR, *filepath)
def get_sha(root: Union[str, Path]) -> str:
try:
return subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=root).decode('ascii').strip()
......@@ -525,9 +560,9 @@ def get_version_add(sha: Optional[str] = None) -> str:
new_version_content = f"""
try:
__version__ = "0.7.3"
__version_tuple__ = (0, 7, 3)
__hcu_version__ = f'0.7.3+{version}'
__version__ = "0.8.2"
__version_tuple__ = (0, 8, 2)
__hcu_version__ = f'0.8.2+{version}'
from vllm.version import __version__, __version_tuple__, __hcu_version__
except Exception as e:
......@@ -571,6 +606,7 @@ def get_gaudi_sw_version():
def get_vllm_version() -> str:
if not _is_hip():
from setuptools_scm import get_version
version = get_version(write_to="vllm/_version.py")
sep = "+" if "+" not in version else "." # dev versions might contain +
......@@ -605,8 +641,6 @@ def get_vllm_version() -> str:
if gaudi_sw_version != MAIN_CUDA_VERSION:
gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3]
version += f"{sep}gaudi{gaudi_sw_version}"
elif _is_openvino():
version += f"{sep}openvino"
elif _is_tpu():
version += f"{sep}tpu"
elif _is_cpu():
......@@ -620,11 +654,12 @@ def get_vllm_version() -> str:
return version
def get_requirements() -> List[str]:
def get_requirements() -> list[str]:
"""Get Python package dependencies from requirements.txt."""
requirements_dir = ROOT_DIR / "requirements"
def _read_requirements(filename: str) -> List[str]:
with open(get_path(filename)) as f:
def _read_requirements(filename: str) -> list[str]:
with open(requirements_dir / filename) as f:
requirements = f.read().strip().split("\n")
resolved_requirements = []
for line in requirements:
......@@ -637,37 +672,34 @@ def get_requirements() -> List[str]:
return resolved_requirements
if _no_device():
requirements = _read_requirements("requirements-common.txt")
requirements = _read_requirements("common.txt")
elif _is_cuda():
requirements = _read_requirements("requirements-cuda.txt")
requirements = _read_requirements("cuda.txt")
cuda_major, cuda_minor = torch.version.cuda.split(".")
modified_requirements = []
for req in requirements:
if ("vllm-flash-attn" in req
and not (cuda_major == "12" and cuda_minor == "1")):
# vllm-flash-attn is built only for CUDA 12.1.
if ("vllm-flash-attn" in req and cuda_major != "12"):
# vllm-flash-attn is built only for CUDA 12.x.
# Skip for other versions.
continue
modified_requirements.append(req)
requirements = modified_requirements
elif _is_hip():
requirements = _read_requirements("requirements-rocm.txt")
requirements = _read_requirements("rocm.txt")
elif _is_neuron():
requirements = _read_requirements("requirements-neuron.txt")
requirements = _read_requirements("neuron.txt")
elif _is_hpu():
requirements = _read_requirements("requirements-hpu.txt")
elif _is_openvino():
requirements = _read_requirements("requirements-openvino.txt")
requirements = _read_requirements("hpu.txt")
elif _is_tpu():
requirements = _read_requirements("requirements-tpu.txt")
requirements = _read_requirements("tpu.txt")
elif _is_cpu():
requirements = _read_requirements("requirements-cpu.txt")
requirements = _read_requirements("cpu.txt")
elif _is_xpu():
requirements = _read_requirements("requirements-xpu.txt")
requirements = _read_requirements("xpu.txt")
else:
raise ValueError(
"Unsupported platform, please use CUDA, ROCm, Neuron, HPU, "
"OpenVINO, or CPU.")
"or CPU.")
return requirements
......@@ -685,6 +717,11 @@ if _is_cuda():
# FA3 requires CUDA 12.0 or later
ext_modules.append(
CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
# Optional since this doesn't get built (produce an .so file) when
# not targeting a hopper system
ext_modules.append(
CMakeExtension(name="vllm._flashmla_C", optional=True))
ext_modules.append(CMakeExtension(name="vllm.cumem_allocator"))
if _build_custom_ops():
......@@ -719,6 +756,7 @@ setup(
install_requires=get_requirements(),
extras_require={
"tensorizer": ["tensorizer>=2.9.0"],
"fastsafetensors": ["fastsafetensors >= 0.1.10"],
"runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
"audio": ["librosa", "soundfile"], # Required for audio processing
"video": ["decord"] # Required for video processing
......
# SPDX-License-Identifier: Apache-2.0
"""vllm.entrypoints.api_server with some extra logging for testing."""
from typing import Any, Dict, Iterable
from collections.abc import Iterable
from typing import Any
import uvicorn
from fastapi.responses import JSONResponse, Response
......@@ -24,7 +25,7 @@ class AsyncLLMEngineWithStats(AsyncLLMEngine):
self._num_aborts += len(ids)
await super()._engine_abort(ids)
def testing_stats(self) -> Dict[str, Any]:
def testing_stats(self) -> dict[str, Any]:
return {"num_aborted_requests": self._num_aborts}
......
# SPDX-License-Identifier: Apache-2.0
import pytest
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
# SPDX-License-Identifier: Apache-2.0
import os
import subprocess
import sys
import time
......@@ -46,7 +47,10 @@ def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
distributed_executor_backend,
]
uvicorn_process = subprocess.Popen(commands)
# API Server Test Requires V0.
my_env = os.environ.copy()
my_env["VLLM_USE_V1"] = "0"
uvicorn_process = subprocess.Popen(commands, env=my_env)
yield
uvicorn_process.terminate()
......
......@@ -6,7 +6,7 @@ import uuid
from asyncio import CancelledError
from copy import copy
from dataclasses import dataclass
from typing import List, Optional
from typing import Optional
import pytest
import pytest_asyncio
......@@ -152,6 +152,10 @@ def uid() -> str:
@pytest_asyncio.fixture(scope="module")
async def async_engine():
# We cannot use monkeypatch since this is a module
# scoped fixture and monkeypatch is function scoped.
previous_value = os.getenv("VLLM_USE_V1", None)
os.environ["VLLM_USE_V1"] = "0"
engine = await asyncio.get_event_loop().run_in_executor(executor=None,
func=start_engine)
try:
......@@ -162,6 +166,11 @@ async def async_engine():
await asyncio.sleep(0.1)
cleanup_dist_env_and_memory()
if previous_value:
os.environ["VLLM_USE_V1"] = previous_value
else:
del os.environ["VLLM_USE_V1"]
@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
......@@ -255,7 +264,7 @@ async def test_output_kinds(async_engine, stop):
params.output_kind = RequestOutputKind.DELTA
prompt_tokens = None
output_tokens: List[int] = []
output_tokens: list[int] = []
output_text = ""
output_count = 0
final_output = None
......
......@@ -9,7 +9,6 @@ import weakref
import pytest
from vllm import LLM
from vllm.config import LoadFormat
from vllm.platforms import current_platform
from ..conftest import VllmRunner
......@@ -36,7 +35,7 @@ def v1(run_with_both_engines):
def test_vllm_gc_ed():
"""Verify vllm instance is GC'ed when it is deleted"""
llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"), load_format=LoadFormat.RUNAI_STREAMER)
llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"))
weak_llm = weakref.ref(llm)
del llm
# If there's any circular reference to vllm, this fails
......@@ -45,12 +44,12 @@ def test_vllm_gc_ed():
@pytest.mark.parametrize("model", MODELS)
# @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [False, True])
@pytest.mark.parametrize("enforce_eager", [False])
def test_models(
monkeypatch: pytest.MonkeyPatch,
hf_runner,
model: str,
backend: str,
......@@ -63,51 +62,54 @@ def test_models(
pytest.skip("Flashinfer does not support ROCm/HIP.")
if backend in ("XFORMERS",
"FLASHINFER") and model == "google/gemma-2-2b-it":
"FLASHINFER") and model == os.path.join(models_path_prefix, "google/gemma-2-2b-it"):
pytest.skip(
f"{backend} does not support gemma2 with full context length.")
os.environ["VLLM_ATTENTION_BACKEND"] = backend
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", backend)
# 5042 tokens for gemma2
# gemma2 has alternating sliding window size of 4096
# we need a prompt with more than 4096 tokens to test the sliding window
prompt = "The following numbers of the sequence " + ", ".join(
str(i) for i in range(1024)) + " are:"
example_prompts = [prompt]
# 5042 tokens for gemma2
# gemma2 has alternating sliding window size of 4096
# we need a prompt with more than 4096 tokens to test the sliding window
prompt = "The following numbers of the sequence " + ", ".join(
str(i) for i in range(1024)) + " are:"
example_prompts = [prompt]
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with VllmRunner(model,
max_model_len=8192,
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
with VllmRunner(model,
max_model_len=8192,
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
# @multi_gpu_test(num_gpus=2)
# @pytest.mark.parametrize(
# "model, distributed_executor_backend, attention_backend, "
# "test_suite", [
# (os.path.join(models_path_prefix, "distilbert/distilgpt2"), "ray", "", "L4"),
# (os.path.join(models_path_prefix, "distilbert/distilgpt2"), "mp", "", "L4"),
# (os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"), "ray", "", "L4"),
# (os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"), "mp", "", "L4"),
# (os.path.join(models_path_prefix, "distilbert/distilgpt2"), "ray", "", "A100"),
# (os.path.join(models_path_prefix, "distilbert/distilgpt2"), "mp", "", "A100"),
# (os.path.join(models_path_prefix, "distilbert/distilgpt2"), "mp", "FLASHINFER", "A100"),
# (os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "ray", "FLASHINFER", "A100"),
# ("distilbert/distilgpt2", "ray", "", "L4"),
# ("distilbert/distilgpt2", "mp", "", "L4"),
# ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
# ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
# ("distilbert/distilgpt2", "ray", "", "A100"),
# ("distilbert/distilgpt2", "mp", "", "A100"),
# ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),
# ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
# ])
# def test_models_distributed(
# monkeypatch: pytest.MonkeyPatch,
# hf_runner,
# vllm_runner,
# example_prompts,
......@@ -120,35 +122,41 @@ def test_models(
# if test_suite != TARGET_TEST_SUITE:
# pytest.skip(f"Skip test for {test_suite}")
# if model == os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct") and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
# # test ray adag
# os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
# os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
# if attention_backend:
# os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
# dtype = "half"
# max_tokens = 5
# # NOTE: take care of the order. run vLLM first, and then run HF.
# # vLLM needs a fresh new process without cuda initialization.
# # if we run HF first, the cuda initialization will be done and it
# # will hurt multiprocessing backend with fork method (the default method).
# with vllm_runner(model,
# dtype=dtype,
# tensor_parallel_size=2,
# distributed_executor_backend=distributed_executor_backend
# ) as vllm_model:
# vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
# with hf_runner(model, dtype=dtype) as hf_model:
# hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
# check_outputs_equal(
# outputs_0_lst=hf_outputs,
# outputs_1_lst=vllm_outputs,
# name_0="hf",
# name_1="vllm",
# )
# with monkeypatch.context() as monkeypatch_context:
# if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
# # test Ray Compiled Graph
# monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
# monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
# if attention_backend:
# monkeypatch_context.setenv(
# "VLLM_ATTENTION_BACKEND",
# attention_backend,
# )
# dtype = "half"
# max_tokens = 5
# # NOTE: take care of the order. run vLLM first, and then run HF.
# # vLLM needs a fresh new process without cuda initialization.
# # if we run HF first, the cuda initialization will be done and it
# # will hurt multiprocessing backend with fork method
# # (the default method).
# with vllm_runner(
# model,
# dtype=dtype,
# tensor_parallel_size=2,
# distributed_executor_backend=distributed_executor_backend,
# ) as vllm_model:
# vllm_outputs = vllm_model.generate_greedy(example_prompts,
# max_tokens)
# with hf_runner(model, dtype=dtype) as hf_model:
# hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
# check_outputs_equal(
# outputs_0_lst=hf_outputs,
# outputs_1_lst=vllm_outputs,
# name_0="hf",
# name_1="vllm",
# )
......@@ -7,24 +7,41 @@ prefill requests are chunked.
Run `pytest tests/models/test_chunked_prefill.py`.
"""
import os
from __future__ import annotations
from typing import TYPE_CHECKING
import pytest
from tests.kernels.utils import override_backend_env_variable
from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test
import os
from ..utils import models_path_prefix
if TYPE_CHECKING:
from .conftest import HfRunner, VllmRunner
MODELS = [
os.path.join(models_path_prefix, "facebook/opt-125m"),
os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
]
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the file.
"""
with monkeypatch.context() as m:
m.setenv('VLLM_USE_V1', '0')
yield
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
......@@ -35,8 +52,8 @@ MODELS = [
@pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
def test_models(
hf_runner,
vllm_runner,
hf_runner: HfRunner,
vllm_runner: VllmRunner,
example_prompts,
model: str,
dtype: str,
......@@ -45,37 +62,39 @@ def test_models(
enforce_eager: bool,
tensor_parallel_size: int,
attention_backend: str,
monkeypatch,
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""
Checks exact match decode between huggingface model and vllm runner with
chunked prefill.
"""
override_backend_env_variable(monkeypatch, attention_backend)
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size
max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with vllm_runner(
model,
dtype=dtype,
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=True,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
with vllm_runner(
model,
dtype=dtype,
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=True,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@multi_gpu_test(num_gpus=2)
......@@ -83,57 +102,61 @@ def test_models(
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
def test_models_distributed(
hf_runner,
vllm_runner,
hf_runner: HfRunner,
vllm_runner: VllmRunner,
example_prompts,
model: str,
distributed_executor_backend: str,
attention_backend: str,
monkeypatch,
monkeypatch: pytest.MonkeyPatch,
) -> None:
override_backend_env_variable(monkeypatch, attention_backend)
if (model == "meta-llama/Llama-3.2-1B-Instruct"
and distributed_executor_backend == "ray"):
# test ray adag
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
dtype = "half"
max_tokens = 5
chunked_prefill_token_size = 16
# Add a chunked prefill config.
max_num_seqs = min(chunked_prefill_token_size, 256)
assert chunked_prefill_token_size != -1
enable_chunked_prefill = True
max_num_batched_tokens = chunked_prefill_token_size
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(
model,
dtype=dtype,
tensor_parallel_size=2,
max_num_seqs=max_num_seqs,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
if (model == "meta-llama/Llama-3.2-1B-Instruct"
and distributed_executor_backend == "ray"):
# test Ray Compiled Graph
m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
m.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
dtype = "half"
max_tokens = 5
chunked_prefill_token_size = 16
# Add a chunked prefill config.
max_num_seqs = min(chunked_prefill_token_size, 256)
assert chunked_prefill_token_size != -1
enable_chunked_prefill = True
max_num_batched_tokens = chunked_prefill_token_size
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with
# fork method (the default method).
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
with vllm_runner(
model,
dtype=dtype,
tensor_parallel_size=2,
max_num_seqs=max_num_seqs,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(
example_prompts,
max_tokens,
)
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
# @pytest.mark.parametrize(
......@@ -151,7 +174,7 @@ def test_models_distributed(
# # the async postprocessor
# @pytest.mark.parametrize("disable_async_output_proc", [True])
# def test_models_with_fp8_kv_cache(
# vllm_runner,
# vllm_runner: VllmRunner,
# example_prompts,
# kv_cache_dtype: str,
# model: str,
......@@ -211,7 +234,7 @@ def test_models_distributed(
@pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("dtype", ["half"])
def test_with_prefix_caching(
vllm_runner,
vllm_runner: VllmRunner,
max_tokens: int,
enforce_eager: bool,
chunk_size: int,
......@@ -247,8 +270,10 @@ def test_with_prefix_caching(
) as vllm_model:
outputs[enable] = []
for prompt in full_prompts:
outputs[enable] += vllm_model.generate_greedy([prompt],
max_tokens)
outputs[enable] += vllm_model.generate_greedy(
[prompt],
max_tokens,
)
check_outputs_equal(
outputs_0_lst=outputs[False],
......@@ -259,7 +284,7 @@ def test_with_prefix_caching(
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("dtype", ["bfloat16", "half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
@pytest.mark.parametrize("enforce_eager", [False])
......@@ -267,8 +292,8 @@ def test_with_prefix_caching(
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_models_cpu(
hf_runner,
vllm_runner,
hf_runner: HfRunner,
vllm_runner: VllmRunner,
example_prompts,
model: str,
dtype: str,
......@@ -276,7 +301,7 @@ def test_models_cpu(
chunked_prefill_token_size: int,
enforce_eager: bool,
attention_backend: str,
monkeypatch,
monkeypatch: pytest.MonkeyPatch,
) -> None:
test_models(
hf_runner,
......@@ -296,11 +321,11 @@ def test_models_cpu(
@pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("chunk_size", [30, 32])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("dtype", ["bfloat16", "half"])
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_with_prefix_caching_cpu(
vllm_runner,
vllm_runner: VllmRunner,
max_tokens: int,
enforce_eager: bool,
chunk_size: int,
......
# SPDX-License-Identifier: Apache-2.0
import os
import pytest
from ..utils import compare_two_settings, models_path_prefix
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
monkeypatch.setenv('VLLM_USE_V1', '0')
def test_cpu_offload():
compare_two_settings(os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), [],
["--cpu-offload-gb", "1"])
......@@ -4,15 +4,13 @@ import pytest
import torch
from vllm import LLM, SamplingParams
from vllm.config import LoadFormat
from vllm.device_allocator.cumem import CuMemAllocator
from vllm.utils import GiB_bytes
from ..conftest import MODEL_WEIGHTS_S3_BUCKET
from ..utils import fork_new_process_for_each_test
from ..utils import create_new_process_for_each_test
@fork_new_process_for_each_test
@create_new_process_for_each_test()
def test_python_error():
"""
Test if Python error occurs when there's low-level
......@@ -38,7 +36,7 @@ def test_python_error():
allocator.wake_up()
@fork_new_process_for_each_test
@create_new_process_for_each_test()
def test_basic_cumem():
# some tensors from default memory pool
shape = (1024, 1024)
......@@ -71,7 +69,7 @@ def test_basic_cumem():
assert torch.allclose(output, torch.ones_like(output) * 3)
@fork_new_process_for_each_test
@create_new_process_for_each_test()
def test_cumem_with_cudagraph():
allocator = CuMemAllocator.get_instance()
with allocator.use_memory_pool():
......@@ -116,43 +114,47 @@ def test_cumem_with_cudagraph():
assert torch.allclose(y, x + 1)
@fork_new_process_for_each_test
@create_new_process_for_each_test()
@pytest.mark.parametrize(
"model, use_v1",
[
# sleep mode with safetensors
(f"{MODEL_WEIGHTS_S3_BUCKET}/meta-llama/Llama-3.2-1B", True),
("meta-llama/Llama-3.2-1B", True),
# sleep mode with pytorch checkpoint
("facebook/opt-125m", False),
])
def test_end_to_end(model: str, use_v1: bool):
import os
os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free # in case other process is running
load_format = LoadFormat.AUTO
if "Llama" in model:
load_format = LoadFormat.RUNAI_STREAMER
llm = LLM(model, load_format=load_format, enable_sleep_mode=True)
prompt = "How are you?"
sampling_params = SamplingParams(temperature=0, max_tokens=10)
output = llm.generate(prompt, sampling_params)
# the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
# which is difficult to measure in the test. therefore, we only
# test sleep level 1 here.
llm.sleep(level=1)
free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
# now the memory usage is mostly cudagraph memory pool,
# and it should be less than the model weights (1B model, 2GiB weights)
assert used_bytes < 2 * GiB_bytes
llm.wake_up()
output2 = llm.generate(prompt, sampling_params)
# cmp output
assert output[0].outputs[0].text == output2[0].outputs[0].text
del os.environ["VLLM_USE_V1"]
def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free # in case other process is running
llm = LLM(model, enable_sleep_mode=True)
prompt = "How are you?"
sampling_params = SamplingParams(temperature=0, max_tokens=10)
output = llm.generate(prompt, sampling_params)
# the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
# which is difficult to measure in the test. therefore, we only
# test sleep level 1 here.
llm.sleep(level=1)
free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
# now the memory usage is mostly cudagraph memory pool,
# and it should be less than the model weights (1B model, 2GiB weights)
# NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
# is captured but cannot be releasesd from PyTorch due to a known bug,
# therefore high memory usage after `llm.sleep` is called is expected.
# FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
# in V1.
if use_v1:
assert used_bytes < 7 * GiB_bytes
else:
assert used_bytes < 2 * GiB_bytes
llm.wake_up()
output2 = llm.generate(prompt, sampling_params)
# cmp output
assert output[0].outputs[0].text == output2[0].outputs[0].text
......@@ -24,6 +24,15 @@ MODELS = [
]
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
We should enable this for V1, but VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT,
so use VLLM_USE_V1=0 for all tests in the file.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
@pytest.fixture(scope="module", autouse=True)
def check_settings():
assert ENABLE_ARTIFICIAL_PREEMPT is True, (
......
# SPDX-License-Identifier: Apache-2.0
import Cython.Compiler.Options
from Cython.Build import cythonize
from setuptools import setup
Cython.Compiler.Options.annotate = True
infiles = []
infiles += [
"vllm/engine/llm_engine.py",
"vllm/transformers_utils/detokenizer.py",
"vllm/engine/output_processor/single_step.py",
"vllm/outputs.py",
"vllm/engine/output_processor/stop_checker.py",
]
infiles += [
"vllm/core/scheduler.py",
"vllm/sequence.py",
"vllm/core/block_manager.py",
]
infiles += [
"vllm/model_executor/layers/sampler.py",
"vllm/sampling_params.py",
"vllm/utils.py",
]
setup(ext_modules=cythonize(infiles,
annotate=False,
force=True,
compiler_directives={
'language_level': "3",
'infer_types': True
}))
# example usage: python3 build_cython.py build_ext --inplace
......@@ -6,6 +6,7 @@ from typing import Callable, Union
from torch import fx
from vllm.compilation.inductor_pass import InductorPass
from vllm.config import get_current_vllm_config
class TestBackend:
......@@ -13,21 +14,27 @@ class TestBackend:
This class provides a simple Inductor backend that can be used for testing.
It takes a list of custom passes and runs them after Inductor's passes.
It also saves the graph before and after the custom passes for inspection.
Inductor config can be modified directly by editing the inductor_config
property. This can be helpful for adding passes like the
'pre_grad_custom_pass' and the 'post_grad_custom_pre_pass'.
Inductor config is default-initialized from VllmConfig.CompilationConfig.
"""
def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph],
None]]):
self.custom_passes = list(passes)
from torch._inductor import config
self.current_config = config.shallow_copy_dict()
self.current_config['force_disable_caches'] = True
self.current_config['post_grad_custom_post_pass'] = self.post_pass
compile_config = get_current_vllm_config().compilation_config
self.inductor_config = compile_config.inductor_compile_config
self.inductor_config['force_disable_caches'] = True
self.inductor_config['post_grad_custom_post_pass'] = self.post_pass
def __call__(self, graph: fx.GraphModule, example_inputs):
self.graph_pre_compile = deepcopy(graph)
from torch._inductor.compile_fx import compile_fx
return compile_fx(graph,
example_inputs,
config_patches=self.current_config)
config_patches=self.inductor_config)
def post_pass(self, graph: fx.Graph):
self.graph_pre_pass = deepcopy(graph)
......
# SPDX-License-Identifier: Apache-2.0
import pytest
# TEST V1: this should be removed. Right now V1 overrides
# all the torch compile logic. We should re-enable this
# as we add torch compile support back to V1.
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
......@@ -8,7 +8,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are
initialized randomly with a fixed seed.
"""
from dataclasses import dataclass
from typing import Any, List, Optional, Tuple
from typing import Any, Optional
import torch
from torch import nn
......@@ -56,7 +56,7 @@ class LlamaConfig:
random_seed: int = 0
def compute_hash(self) -> str:
factors: List[Any] = []
factors: list[Any] = []
for k, v in self.__dict__.items():
if k == "random_seed":
continue
......@@ -174,7 +174,7 @@ class LlamaDecoderLayer(nn.Module):
positions: torch.Tensor,
hidden_states: torch.Tensor,
residual: Optional[torch.Tensor],
) -> Tuple[torch.Tensor, torch.Tensor]:
) -> tuple[torch.Tensor, torch.Tensor]:
"""
For tractable computation:
- if residual is None, the outputs are:
......
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import dataclasses
from typing import Dict, List, Optional
import pytest
import os
......@@ -15,7 +15,7 @@ from ..utils import compare_all_settings, models_path_prefix
@dataclasses.dataclass
class TestSetting:
model: str
model_args: List[str]
model_args: list[str]
pp_size: int
tp_size: int
attn_backend: str
......@@ -23,75 +23,76 @@ class TestSetting:
fullgraph: bool
# representative settings for testing
test_settings = [
# basic llama model
TestSetting(
model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
model_args=[],
pp_size=2,
tp_size=2,
attn_backend="FLASHINFER",
method="generate",
fullgraph=True,
),
# llama model with quantization
TestSetting(
model=os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"),
model_args=["--quantization", "gptq"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# MoE model
TestSetting(
model=os.path.join(models_path_prefix, "ibm/PowerMoE-3b"),
model_args=[],
pp_size=1,
tp_size=2,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# embedding model
TestSetting(
model=os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2"),
model_args=["--task", "embed"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="encode",
fullgraph=True,
),
# encoder-based embedding model (BERT)
TestSetting(
model=os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"),
model_args=["--task", "embed"],
pp_size=1,
tp_size=1,
attn_backend="XFORMERS",
method="encode",
fullgraph=True,
),
# vision language model
TestSetting(
model=os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct"),
model_args=["--trust-remote-code", "--max-model-len", "2048"],
pp_size=2,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate_with_image",
fullgraph=False,
),
]
# we cannot afford testing the full Catesian product
# of all models and all levels
@pytest.mark.parametrize("test_setting", test_settings)
def test_compile_correctness(test_setting: TestSetting):
@pytest.mark.parametrize(
"test_setting",
[
# basic llama model
TestSetting(
model="meta-llama/Llama-3.2-1B-Instruct",
model_args=[],
pp_size=2,
tp_size=2,
attn_backend="FLASHINFER",
method="generate",
fullgraph=True,
),
# llama model with quantization
TestSetting(
model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
model_args=["--quantization", "gptq"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# MoE model
TestSetting(
model="ibm/PowerMoE-3b",
model_args=[],
pp_size=1,
tp_size=2,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# embedding model
TestSetting(
model="BAAI/bge-multilingual-gemma2",
model_args=["--task", "embed", "--dtype", "bfloat16"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="encode",
fullgraph=True,
),
# encoder-based embedding model (BERT)
TestSetting(
model="BAAI/bge-base-en-v1.5",
model_args=["--task", "embed"],
pp_size=1,
tp_size=1,
attn_backend="XFORMERS",
method="encode",
fullgraph=True,
),
# vision language model
TestSetting(
model="microsoft/Phi-3.5-vision-instruct",
model_args=["--trust-remote-code", "--max-model-len", "2048"],
pp_size=2,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate_with_image",
fullgraph=False,
),
])
def test_compile_correctness(
monkeypatch: pytest.MonkeyPatch,
test_setting: TestSetting,
):
# this test is run under multiple suits, with different GPUs.
# make sure we only run the test with correct CUDA devices.
# don't use "<", as it will duplicate the tests.
......@@ -104,41 +105,45 @@ def test_compile_correctness(test_setting: TestSetting):
fullgraph = test_setting.fullgraph
if cuda_device_count_stateless() != pp_size * tp_size:
pytest.skip("Not correct CUDA devices for the test.")
import os
os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \
["-tp", str(tp_size)]
all_args: List[List[str]] = []
all_envs: List[Optional[Dict[str, str]]] = []
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
final_args = [
"--enforce-eager", *model_args, "-pp",
str(pp_size), "-tp",
str(tp_size)
]
all_args: list[list[str]] = []
all_envs: list[dict[str, str] | None] = []
for level in [
CompilationLevel.NO_COMPILATION,
CompilationLevel.PIECEWISE,
]:
all_args.append(final_args + [f"-O{level}"])
all_envs.append({})
for level in [
CompilationLevel.NO_COMPILATION,
CompilationLevel.PIECEWISE,
]:
all_args.append(final_args + [f"-O{level}"])
all_envs.append({})
# inductor will change the output, so we only compare if the output
# is close, not exactly the same.
compare_all_settings(
model,
all_args,
all_envs,
method=method if method != "generate" else "generate_close")
all_envs.clear()
all_args.clear()
# inductor will change the output, so we only compare if the output
# is close, not exactly the same.
compare_all_settings(
model,
all_args,
all_envs,
method=method if method != "generate" else "generate_close")
all_envs.clear()
all_args.clear()
for level in [
CompilationLevel.NO_COMPILATION,
CompilationLevel.DYNAMO_AS_IS,
CompilationLevel.DYNAMO_ONCE,
]:
all_args.append(final_args + [f"-O{level}"])
all_envs.append({})
if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
# "DYNAMO_ONCE" will always use fullgraph
all_envs[-1][
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore
for level in [
CompilationLevel.NO_COMPILATION,
CompilationLevel.DYNAMO_AS_IS,
CompilationLevel.DYNAMO_ONCE,
]:
all_args.append(final_args + [f"-O{level}"])
all_envs.append({})
if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
# "DYNAMO_ONCE" will always use fullgraph
all_envs[-1][
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore
compare_all_settings(model, all_args * 3, all_envs, method=method)
compare_all_settings(model, all_args * 3, all_envs, method=method)
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
from typing import Any
import pytest
import torch
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.config import CompilationLevel
from vllm.platforms import current_platform
from ..utils import create_new_process_for_each_test
@pytest.fixture(params=None, name="model_info")
def models_list_fixture(request):
TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
("facebook/opt-125m", {}),
("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
"quantization": "compressed-tensors"
}),
("meta-llama/Llama-3.2-1B-Instruct", {}),
]
if is_quant_method_supported("aqlm"):
TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
"quantization": "aqlm"
}))
# TODO: figure out why this fails.
if False and is_quant_method_supported("gguf"): # noqa: SIM223
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
"quantization": "gguf"
}))
if is_quant_method_supported("gptq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
"quantization": "gptq"
}))
from ..utils import fork_new_process_for_each_test
from .utils import TEST_MODELS, check_full_graph_support
if is_quant_method_supported("gptq_marlin"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
"quantization": "gptq_marlin"
}))
if is_quant_method_supported("gptq_marlin_24"):
TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
"quantization": "gptq_marlin_24"
}))
if is_quant_method_supported("marlin"):
TEST_MODELS.append(
("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
"quantization": "marlin"
}))
if not current_platform.is_rocm() and is_quant_method_supported("awq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
"quantization": "AWQ"
}))
return TEST_MODELS
@pytest.mark.parametrize("model_info", TEST_MODELS)
@pytest.mark.parametrize(
"optimization_level",
[CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE])
@fork_new_process_for_each_test
def test_full_graph(model_info, optimization_level):
model = model_info[0]
model_kwargs = model_info[1]
check_full_graph_support(model,
model_kwargs,
optimization_level,
tp_size=1)
[CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE],
)
@pytest.mark.parametrize("model_info", "", indirect=True)
@create_new_process_for_each_test()
def test_full_graph(
monkeypatch: pytest.MonkeyPatch,
model_info: tuple[str, dict[str, Any]],
optimization_level: int,
):
model, model_kwargs = model_info
with monkeypatch.context() as m:
# make sure these models can be captured in full graph mode
m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
print(f"MODEL={model}")
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0)
llm = LLM(
model=model,
enforce_eager=True,
tensor_parallel_size=1,
disable_custom_all_reduce=True,
compilation_config=optimization_level,
**model_kwargs,
)
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
......@@ -10,7 +10,7 @@ from vllm.compilation.fix_functionalization import FixFunctionalizationPass
from vllm.compilation.fusion import (FUSED_OPS, FusionPass, QuantKey,
kFp8DynamicTokenSym, kFp8StaticTensorSym)
from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
from vllm.compilation.reshapes import RedundantReshapesPass
from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.config import CompilationConfig
from .backend import TestBackend
......@@ -52,11 +52,11 @@ def test_fix_functionalization(model: str, quant_key: QuantKey,
torch.set_default_device("cuda")
config = CompilationConfig.PassConfig(enable_fusion=do_fusion,
enable_reshape=True)
reshape_pass = RedundantReshapesPass(config)
enable_noop=True)
noop_pass = NoOpEliminationPass(config)
fusion_pass = FusionPass.instance(config)
passes = [reshape_pass, fusion_pass] if do_fusion else [reshape_pass]
passes = [noop_pass, fusion_pass] if do_fusion else [noop_pass]
func_pass = FixFunctionalizationPass(config)
backend_func = TestBackend(*passes, func_pass)
backend_no_func = TestBackend(*passes)
......
......@@ -5,23 +5,25 @@ import torch
from compressed_tensors.quantization import FP8_DTYPE
import vllm.envs as envs
import vllm.plugins
from vllm.compilation.fusion import (FUSED_OPS, QUANT_OPS, FusedRMSQuantKey,
FusionPass, QuantKey)
from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe
from vllm.compilation.reshapes import RedundantReshapesPass
from vllm.config import CompilationConfig
from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.config import CompilationConfig, CompilationLevel, VllmConfig
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
apply_fp8_linear)
CUTLASS_FP8_SUPPORTED, Fp8LinearOp, maybe_create_device_identity)
from .backend import TestBackend
class TestModel(torch.nn.Module):
def __init__(self, hidden_size: int, eps: float, static: bool, *args,
**kwargs):
def __init__(self, hidden_size: int, eps: float, static: bool,
cutlass_fp8_enabled: bool, *args, **kwargs):
super().__init__(*args, **kwargs)
self.cutlass_fp8_enabled = cutlass_fp8_enabled
self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)]
self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
if static:
......@@ -32,24 +34,20 @@ class TestModel(torch.nn.Module):
torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
for _ in range(2)
]
self.fp8_linear = Fp8LinearOp(
cutlass_fp8_supported=cutlass_fp8_enabled,
use_per_token_if_dynamic=True)
def forward(self, x):
resid = torch.sqrt(x)
y = self.norm[0](x)
x2 = apply_fp8_linear(y,
self.w[0],
self.wscale[0],
self.scale[0],
use_per_token_if_dynamic=True)
x2 = self.fp8_linear.apply(y, self.w[0], self.wscale[0], self.scale[0])
# make sure resid is used for replacement to work
y2, resid = self.norm[1](x2, resid)
x3 = apply_fp8_linear(y2,
self.w[1],
self.wscale[1],
self.scale[1],
use_per_token_if_dynamic=True)
x3 = self.fp8_linear.apply(y2, self.w[1], self.wscale[1],
self.scale[1])
y3, resid = self.norm[2](x3, resid) # use resid here
return y3
......@@ -59,60 +57,67 @@ class TestModel(torch.nn.Module):
@pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049])
@pytest.mark.parametrize("eps", [1e-5, 1e-6])
@pytest.mark.parametrize("static", [True, False])
@pytest.mark.parametrize("cutlass_fp8_enabled",
[True, False] if CUTLASS_FP8_SUPPORTED else [False])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda",
reason="Only test on CUDA")
def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static):
def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
cutlass_fp8_enabled):
torch.set_default_device("cuda")
torch.set_default_dtype(dtype)
torch.manual_seed(1)
maybe_create_device_identity() # needed for certain non-cutlass fp8 paths
# Reshape pass is needed for the fusion pass to work
config = CompilationConfig.PassConfig(enable_fusion=True,
enable_reshape=True)
reshape_pass = RedundantReshapesPass(config)
fusion_pass = FusionPass.instance(config)
backend = TestBackend(reshape_pass, fusion_pass)
model = TestModel(hidden_size, eps, static)
# First dimension dynamic
x = torch.rand(num_tokens, hidden_size)
torch._dynamo.mark_dynamic(x, 0)
result = model(x)
model2 = torch.compile(model, backend=backend)
result2 = model2(x)
# Higher tol for dynamic, even higher for bfloat16
if static:
ATOL, RTOL = (1e-3, 1e-3)
elif dtype == torch.float16:
ATOL, RTOL = (2e-3, 2e-3)
else:
ATOL, RTOL = (1e-2, 1e-2)
torch.testing.assert_close(result, result2, atol=ATOL, rtol=RTOL)
# Check substitution worked
pre_nodes = backend.graph_pre_pass.nodes
post_nodes = backend.graph_post_pass.nodes
# static is per-tensor, dynamic is per-token
key = QuantKey(dtype=FP8_DTYPE,
static=static,
per_tensor=static,
symmetric=True)
rms_quant = FUSED_OPS[FusedRMSQuantKey(key, False)]
add_rms_quant = FUSED_OPS[FusedRMSQuantKey(key, True)]
fp8_quant = QUANT_OPS[key]
# In pre-nodes, fp8 quant should be present and fused kernels should not
assert find_auto_fn_maybe(pre_nodes, rms_quant) is None
assert find_auto_fn_maybe(pre_nodes, add_rms_quant) is None
find_auto_fn(pre_nodes, fp8_quant)
# In post-nodes, fused kernels should be present and fp8 quant should not
find_auto_fn(post_nodes, rms_quant)
find_auto_fn(post_nodes, add_rms_quant)
assert find_auto_fn_maybe(post_nodes, fp8_quant) is None
vllm_config = VllmConfig(compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, custom_ops=["+rms_norm"]))
with vllm.config.set_current_vllm_config(vllm_config):
# Reshape pass is needed for the fusion pass to work
config = CompilationConfig.PassConfig(enable_fusion=True,
enable_noop=True)
noop_pass = NoOpEliminationPass(config)
fusion_pass = FusionPass.instance(config)
backend = TestBackend(noop_pass, fusion_pass)
model = TestModel(hidden_size, eps, static, cutlass_fp8_enabled)
# First dimension dynamic
x = torch.rand(num_tokens, hidden_size)
torch._dynamo.mark_dynamic(x, 0)
result = model(x)
model2 = torch.compile(model, backend=backend)
result2 = model2(x)
# Higher tol for dynamic, even higher for bfloat16
if static:
ATOL, RTOL = (1e-3, 1e-3)
elif dtype == torch.float16:
ATOL, RTOL = (2e-3, 2e-3)
else:
ATOL, RTOL = (1e-2, 1e-2)
torch.testing.assert_close(result, result2, atol=ATOL, rtol=RTOL)
# Check substitution worked
pre_nodes = backend.graph_pre_pass.nodes
post_nodes = backend.graph_post_pass.nodes
# static is per-tensor, dynamic is per-token
key = QuantKey(dtype=FP8_DTYPE,
static=static,
per_tensor=static,
symmetric=True)
rms_quant = FUSED_OPS[FusedRMSQuantKey(key, False)]
add_rms_quant = FUSED_OPS[FusedRMSQuantKey(key, True)]
fp8_quant = QUANT_OPS[key]
# In pre-nodes, fp8 quant should be there and fused kernels should not
assert find_auto_fn_maybe(pre_nodes, rms_quant) is None
assert find_auto_fn_maybe(pre_nodes, add_rms_quant) is None
find_auto_fn(pre_nodes, fp8_quant)
# In post-nodes, fused kernels should be there and fp8 quant should not
find_auto_fn(post_nodes, rms_quant)
find_auto_fn(post_nodes, add_rms_quant)
assert find_auto_fn_maybe(post_nodes, fp8_quant) is None
# SPDX-License-Identifier: Apache-2.0
import pickle
import copy
import pytest
import torch
from torch._inductor.codecache import BypassFxGraphCache
from vllm.compilation.config import CompilationConfig
from vllm.compilation.inductor_pass import (CallableInductorPass,
as_inductor_pass)
from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
from vllm.compilation.pass_manager import PostGradPassManager
from vllm.config import CompilationConfig
# dummy custom pass that doesn't inherit
def simple_callable(graph: torch.fx.Graph):
pass
@as_inductor_pass(files=(__file__, ))
def callable_decorated(graph: torch.fx.Graph):
pass
# Should fail to add directly to the pass manager
def test_bad_callable():
config = CompilationConfig().pass_config
pass_manager = PostGradPassManager()
pass_manager.configure(config)
with pytest.raises(AssertionError):
pass_manager.add(simple_callable) # noqa, type wrong on purpose
# Pass that inherits from InductorPass
class ProperPass(InductorPass):
def __call__(self, graph: torch.fx.graph.Graph) -> None:
pass
@pytest.mark.parametrize(
"works, callable",
[(False, simple_callable), (True, callable_decorated),
(True, CallableInductorPass(simple_callable, "simple_callable"))])
def test_pass_manager(works: bool, callable):
"callable",
[
ProperPass(),
# Can also wrap callables in CallableInductorPass for compliance
CallableInductorPass(simple_callable),
CallableInductorPass(simple_callable,
InductorPass.hash_source(__file__))
],
)
def test_pass_manager_uuid(callable):
config = CompilationConfig().pass_config
pass_manager = PostGradPassManager([callable])
pass_manager.configure(config) # Adds default passes
if works:
pickle.dumps(pass_manager)
else:
with pytest.raises(BypassFxGraphCache):
pickle.dumps(pass_manager)
pass_manager = PostGradPassManager()
pass_manager.configure(config)
# Check that UUID is different if the same pass is added 2x
pass_manager.add(callable)
uuid1 = pass_manager.uuid()
pass_manager.add(callable)
uuid2 = pass_manager.uuid()
assert uuid1 != uuid2
# UUID should be the same as the original one,
# as we constructed in the same way.
pass_manager2 = PostGradPassManager()
pass_manager2.configure(config)
pass_manager2.add(callable)
assert uuid1 == pass_manager2.uuid()
# UUID should be different due to config change
config2 = copy.deepcopy(config)
config2.enable_fusion = not config2.enable_fusion
pass_manager3 = PostGradPassManager()
pass_manager3.configure(config2)
pass_manager3.add(callable)
assert uuid1 != pass_manager3.uuid()
......@@ -5,8 +5,8 @@ import os
import tempfile
from collections import UserList
from enum import Enum
from typing import (Any, Callable, Dict, List, Optional, Tuple, Type,
TypedDict, TypeVar, Union)
from typing import Any, Callable, Optional, TypedDict, TypeVar, Union
import pytest
import pytest_html
......@@ -17,8 +17,8 @@ import torch.nn as nn
import torch.nn.functional as F
from huggingface_hub import snapshot_download
from PIL import Image
from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
BatchFeature)
from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
BatchEncoding, BatchFeature)
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from tests.models.utils import (TokensTextLogprobs,
......@@ -26,7 +26,7 @@ from tests.models.utils import (TokensTextLogprobs,
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.config import LoadFormat, TaskOption, TokenizerPoolConfig
from vllm.config import TaskOption, TokenizerPoolConfig, _get_and_verify_dtype
from vllm.connections import global_http_connection
from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment,
......@@ -37,8 +37,8 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
identity, is_list_of)
from vllm.utils import cuda_device_count_stateless, is_list_of
from .utils import models_path_prefix
logger = init_logger(__name__)
......@@ -50,78 +50,14 @@ _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
_M = TypeVar("_M")
MODELS_ON_S3 = [
"distilbert/distilgpt2",
"meta-llama/Llama-2-7b-hf",
"meta-llama/Meta-Llama-3-8B",
"meta-llama/Llama-3.2-1B",
"meta-llama/Llama-3.2-1B-Instruct",
"openai-community/gpt2",
"ArthurZ/Ilama-3.2-1B",
"llava-hf/llava-1.5-7b-hf",
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"ai21labs/Jamba-tiny-random",
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
"nm-testing/Phi-3-mini-128k-instruct-FP8",
"nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
"nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
"AMead10/Llama-3.2-1B-Instruct-AWQ",
"shuyuej/Llama-3.2-1B-Instruct-GPTQ",
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head",
"ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024",
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
"amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
"nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
"nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
"neuralmagic/Llama-3.2-1B-quantized.w8a8",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
"nm-testing/tinyllama-oneshot-w4a16-channel-v2",
"nm-testing/tinyllama-oneshot-w4a16-group128-v2",
"nm-testing/tinyllama-oneshot-w8a16-per-channel",
"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
"nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test",
"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
"nm-testing/llama2.c-stories42M-pruned2.4-compressed",
]
MODEL_WEIGHTS_S3_BUCKET = models_path_prefix
_PromptMultiModalInput = Union[List[_M], List[List[_M]]]
_PromptMultiModalInput = Union[list[_M], list[list[_M]]]
PromptImageInput = _PromptMultiModalInput[Image.Image]
PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]]
PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]]
PromptVideoInput = _PromptMultiModalInput[np.ndarray]
def _read_prompts(filename: str) -> List[str]:
def _read_prompts(filename: str) -> list[str]:
with open(filename) as f:
prompts = f.readlines()
return prompts
......@@ -144,7 +80,7 @@ class _ImageAssets(_ImageAssetsBase):
ImageAsset("cherry_blossom"),
])
def prompts(self, prompts: _ImageAssetPrompts) -> List[str]:
def prompts(self, prompts: _ImageAssetPrompts) -> list[str]:
"""
Convenience method to define the prompt for each test image.
......@@ -169,7 +105,7 @@ class _VideoAssets(_VideoAssetsBase):
VideoAsset("sample_demo_1.mp4"),
])
def prompts(self, prompts: _VideoAssetPrompts) -> List[str]:
def prompts(self, prompts: _VideoAssetPrompts) -> list[str]:
return [prompts["sample_demo_1"]]
......@@ -179,6 +115,26 @@ VIDEO_ASSETS = _VideoAssets()
"""Singleton instance of :class:`_VideoAssets`."""
@pytest.fixture(scope="function", autouse=True)
def cleanup_VLLM_USE_V1(monkeypatch):
"""
The V1 oracle sets "VLLM_USE_V1" during loading. This means
that each invocation of a test change the env variable.
If we touch "VLLM_USE_V1" with monkeypatch, then any changes
made during the test run by vLLM will be cleaned up.
This fixture is used by every test.
"""
# If VLLM_USE_V1 is not set, set then delete. This will
# cause monkeypatch to clean up VLLM_USE_V1 upon exit
# if VLLM modifies the value of envs.VLLM_USE_V1.
if "VLLM_USE_V1" not in os.environ:
monkeypatch.setenv("VLLM_USE_V1", "")
monkeypatch.delenv("VLLM_USE_V1")
@pytest.fixture(params=[True, False])
def run_with_both_engines(request, monkeypatch):
# Automatically runs tests twice, once with V1 and once without
......@@ -242,7 +198,7 @@ def dynamo_reset():
@pytest.fixture
def example_prompts() -> List[str]:
def example_prompts() -> list[str]:
prompts = []
for filename in _TEST_PROMPTS:
prompts += _read_prompts(filename)
......@@ -264,7 +220,7 @@ class DecoderPromptType(Enum):
@pytest.fixture
def example_encoder_decoder_prompts(
) -> Dict[DecoderPromptType, List[ExplicitEncoderDecoderPrompt]]:
) -> dict[DecoderPromptType, list[ExplicitEncoderDecoderPrompt]]:
'''
Returns an encoder prompt list and a decoder prompt list, wherein each pair
of same-index entries in both lists corresponds to an (encoder prompt,
......@@ -296,7 +252,7 @@ def example_encoder_decoder_prompts(
@pytest.fixture
def example_long_prompts() -> List[str]:
def example_long_prompts() -> list[str]:
prompts = []
for filename in _LONG_PROMPTS:
prompts += _read_prompts(filename)
......@@ -319,13 +275,17 @@ _R = TypeVar("_R")
class HfRunner:
def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
def get_default_device(self):
from vllm.platforms import current_platform
return ("cpu" if current_platform.is_cpu() else "cuda")
def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
if x is None or isinstance(x, (bool, )):
return x
if device is None:
device = "cpu" if current_platform.is_cpu() else "cuda"
device = self.device
if isinstance(x, dict):
return {k: self.wrap_device(v, device) for k, v in x.items()}
......@@ -338,45 +298,59 @@ class HfRunner:
def __init__(
self,
model_name: str,
dtype: str = "half",
dtype: str = "auto",
*,
model_kwargs: Optional[Dict[str, Any]] = None,
model_kwargs: Optional[dict[str, Any]] = None,
is_sentence_transformer: bool = False,
is_cross_encoder: bool = False,
skip_tokenizer_init: bool = False,
auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM,
postprocess_inputs: Callable[..., BatchEncoding] = identity,
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
) -> None:
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
self.model_name = model_name
self.config = AutoConfig.from_pretrained(
model_name,
trust_remote_code=True,
)
self.device = self.get_default_device()
self.dtype = torch_dtype = _get_and_verify_dtype(self.config, dtype)
model_kwargs = model_kwargs if model_kwargs is not None else {}
model_kwargs.setdefault("torch_dtype", torch_dtype)
if is_sentence_transformer:
# Lazy init required for AMD CI
from sentence_transformers import SentenceTransformer
self.model = self.wrap_device(
SentenceTransformer(
model_name,
device="cpu",
trust_remote_code=True,
).to(dtype=torch_dtype))
self.model = SentenceTransformer(
model_name,
device=self.device,
model_kwargs=model_kwargs,
trust_remote_code=True,
)
elif is_cross_encoder:
# Lazy init required for AMD CI
from sentence_transformers import CrossEncoder
self.model = CrossEncoder(model_name,
device="cpu",
trust_remote_code=True)
self.model.model = self.wrap_device(self.model.model)\
.to(dtype=torch_dtype)
self.model = CrossEncoder(
model_name,
device=self.device,
automodel_args=model_kwargs,
trust_remote_code=True,
)
else:
model_kwargs = model_kwargs if model_kwargs is not None else {}
self.model = self.wrap_device(
auto_cls.from_pretrained(
model_name,
torch_dtype=torch_dtype,
trust_remote_code=True,
**model_kwargs,
))
model = auto_cls.from_pretrained(
model_name,
trust_remote_code=True,
**model_kwargs,
)
if (getattr(model, "quantization_method", None) != "bitsandbytes"
and len({p.device
for p in model.parameters()}) < 2):
model = model.to(self.device)
self.model = model
if not skip_tokenizer_init:
self.tokenizer = AutoTokenizer.from_pretrained(
......@@ -396,16 +370,13 @@ class HfRunner:
if skip_tokenizer_init:
self.tokenizer = self.processor.tokenizer
self.dtype = dtype
self.postprocess_inputs = postprocess_inputs
def get_inputs(
self,
prompts: List[str],
prompts: list[str],
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
) -> List[BatchEncoding]:
) -> list[Union[BatchFeature, BatchEncoding]]:
if images is not None:
assert len(prompts) == len(images)
......@@ -415,9 +386,9 @@ class HfRunner:
if audios is not None:
assert len(prompts) == len(audios)
all_inputs: List[BatchEncoding] = []
all_inputs: list[Union[BatchFeature, BatchEncoding]] = []
for i, prompt in enumerate(prompts):
processor_kwargs: Dict[str, Any] = {
processor_kwargs: dict[str, Any] = {
"text": prompt,
"return_tensors": "pt",
}
......@@ -431,13 +402,14 @@ class HfRunner:
processor_kwargs["sampling_rate"] = sr
inputs = self.processor(**processor_kwargs)
inputs = self.postprocess_inputs(inputs, dtype=self.dtype)
if isinstance(inputs, BatchFeature):
inputs = inputs.to(dtype=self.dtype)
all_inputs.append(inputs)
return all_inputs
def classify(self, prompts: List[str]) -> List[str]:
def classify(self, prompts: list[str]) -> list[str]:
# output is final logits
all_inputs = self.get_inputs(prompts)
outputs = []
......@@ -450,21 +422,21 @@ class HfRunner:
def generate(
self,
prompts: List[str],
prompts: list[str],
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
**kwargs: Any,
) -> List[Tuple[List[List[int]], List[str]]]:
) -> list[tuple[list[list[int]], list[str]]]:
all_inputs = self.get_inputs(prompts,
images=images,
videos=videos,
audios=audios)
outputs: List[Tuple[List[List[int]], List[str]]] = []
outputs: list[tuple[list[list[int]], list[str]]] = []
for inputs in all_inputs:
output_ids = self.model.generate(
**self.wrap_device(inputs, device=self.model.device.type),
**self.wrap_device(inputs),
use_cache=True,
**kwargs,
)
......@@ -479,13 +451,13 @@ class HfRunner:
def generate_greedy(
self,
prompts: List[str],
prompts: list[str],
max_tokens: int,
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
**kwargs: Any,
) -> List[Tuple[List[int], str]]:
) -> list[tuple[list[int], str]]:
outputs = self.generate(prompts,
do_sample=False,
max_new_tokens=max_tokens,
......@@ -499,10 +471,10 @@ class HfRunner:
def generate_beam_search(
self,
prompts: List[str],
prompts: list[str],
beam_width: int,
max_tokens: int,
) -> List[Tuple[List[List[int]], List[str]]]:
) -> list[tuple[list[list[int]], list[str]]]:
outputs = self.generate(prompts,
do_sample=False,
max_new_tokens=max_tokens,
......@@ -520,22 +492,22 @@ class HfRunner:
def generate_greedy_logprobs(
self,
prompts: List[str],
prompts: list[str],
max_tokens: int,
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
**kwargs: Any,
) -> List[List[torch.Tensor]]:
) -> list[list[torch.Tensor]]:
all_inputs = self.get_inputs(prompts,
images=images,
videos=videos,
audios=audios)
all_logprobs: List[List[torch.Tensor]] = []
all_logprobs: list[list[torch.Tensor]] = []
for inputs in all_inputs:
output = self.model.generate(
**self.wrap_device(inputs, device=self.model.device.type),
**self.wrap_device(inputs),
use_cache=True,
do_sample=False,
max_new_tokens=max_tokens,
......@@ -550,11 +522,11 @@ class HfRunner:
def _hidden_states_to_seq_logprobs(
self,
hidden_states: Tuple[Tuple[torch.Tensor, ...], ...],
) -> List[torch.Tensor]:
hidden_states: tuple[tuple[torch.Tensor, ...], ...],
) -> list[torch.Tensor]:
output_embeddings = self.model.get_output_embeddings()
seq_logprobs: List[torch.Tensor] = []
seq_logprobs: list[torch.Tensor] = []
for _, hidden_state in enumerate(hidden_states):
last_hidden_states = hidden_state[-1][0]
logits = torch.matmul(
......@@ -570,14 +542,14 @@ class HfRunner:
def _hidden_states_to_logprobs(
self,
hidden_states: Tuple[Tuple[torch.Tensor, ...], ...],
hidden_states: tuple[tuple[torch.Tensor, ...], ...],
num_logprobs: int,
) -> Tuple[List[Dict[int, float]], int]:
) -> tuple[list[dict[int, float]], int]:
seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states)
output_len = len(hidden_states)
# convert to dict
seq_logprobs_lst: List[Dict[int, float]] = []
seq_logprobs_lst: list[dict[int, float]] = []
for tok_idx, tok_logprobs in enumerate(seq_logprobs):
# drop prompt logprobs
if tok_idx == 0:
......@@ -597,26 +569,26 @@ class HfRunner:
def generate_greedy_logprobs_limit(
self,
prompts: List[str],
prompts: list[str],
max_tokens: int,
num_logprobs: int,
images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None,
videos: Optional[PromptVideoInput] = None,
**kwargs: Any,
) -> List[TokensTextLogprobs]:
) -> list[TokensTextLogprobs]:
all_inputs = self.get_inputs(prompts,
images=images,
videos=videos,
audios=audios)
all_logprobs: List[List[Dict[int, float]]] = []
all_output_ids: List[List[int]] = []
all_output_strs: List[str] = []
all_logprobs: list[list[dict[int, float]]] = []
all_output_ids: list[list[int]] = []
all_output_strs: list[str] = []
for inputs in all_inputs:
output = self.model.generate(
**self.wrap_device(inputs, device=self.model.device.type),
**self.wrap_device(inputs),
use_cache=True,
do_sample=False,
max_new_tokens=max_tokens,
......@@ -644,51 +616,47 @@ class HfRunner:
def generate_encoder_decoder_greedy_logprobs_limit(
self,
encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
max_tokens: int,
num_logprobs: int,
images: Optional[PromptImageInput] = None,
**kwargs: Any,
) -> List[TokensTextLogprobs]:
) -> list[TokensTextLogprobs]:
'''
Greedy logprobs generation for vLLM encoder/decoder models
'''
all_logprobs: List[List[Dict[int, float]]] = []
all_output_ids: List[List[int]] = []
all_output_strs: List[str] = []
all_logprobs: list[list[dict[int, float]]] = []
all_output_ids: list[list[int]] = []
all_output_strs: list[str] = []
for i, (encoder_prompt, decoder_prompt) in enumerate(
to_enc_dec_tuple_list(encoder_decoder_prompts)):
processor_kwargs: Dict[str, Any] = {
processor_kwargs: dict[str, Any] = {
"text": encoder_prompt,
"return_tensors": "pt",
}
if images is not None and images[i] is not None:
processor_kwargs["images"] = images[i]
encoder_input_ids = self.wrap_device(
self.processor(**processor_kwargs).input_ids,
device=self.model.device.type,
)
encoder_inputs = self.processor(**processor_kwargs)
encoder_inputs = self.wrap_device(encoder_inputs)
if decoder_prompt is None:
decoder_input_ids = None
else:
decoder_input_ids = self.wrap_device(
self.tokenizer(decoder_prompt,
return_tensors="pt").input_ids,
device=self.model.device.type,
)
decoder_inputs = self.tokenizer(decoder_prompt,
return_tensors="pt")
decoder_input_ids = self.wrap_device(decoder_inputs.input_ids)
output = self.model.generate(
encoder_input_ids,
decoder_input_ids=decoder_input_ids,
use_cache=True,
do_sample=False,
max_new_tokens=max_tokens,
output_hidden_states=True,
return_dict_in_generate=True,
**encoder_inputs,
**kwargs,
)
......@@ -708,10 +676,10 @@ class HfRunner:
return [(output_ids, output_str, output_logprobs)
for output_ids, output_str, output_logprobs in outputs]
def encode(self, prompts: List[str]) -> List[List[torch.Tensor]]:
def encode(self, prompts: list[str]) -> list[list[torch.Tensor]]:
return self.model.encode(prompts)
def predict(self, prompts: List[List[str]]) -> torch.Tensor:
def predict(self, prompts: list[list[str]]) -> torch.Tensor:
return self.model.predict(prompts, convert_to_tensor=True)
def __enter__(self):
......@@ -728,6 +696,18 @@ def hf_runner():
class VllmRunner:
"""
The default value of some arguments have been modified from
:class:`~vllm.LLM` as follows:
- `trust_remote_code`: Set to `True` instead of `False` for convenience.
- `seed`: Set to `0` instead of `None` for test reproducibility.
- `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
- `block_size`: Set to `16` instead of `None` to reduce memory usage.
- `enable_chunked_prefill`: Set to `False` instead of `None` for
test reproducibility.
- `enforce_eager`: Set to `False` instead of `None` to test CUDA graph.
"""
def __init__(
self,
......@@ -735,31 +715,26 @@ class VllmRunner:
task: TaskOption = "auto",
tokenizer_name: Optional[str] = None,
tokenizer_mode: str = "auto",
# Use smaller max model length, otherwise bigger model cannot run due
# to kv cache size limit.
trust_remote_code: bool = True,
seed: Optional[int] = 0,
max_model_len: int = 1024,
dtype: str = "half",
dtype: str = "auto",
disable_log_stats: bool = True,
tensor_parallel_size: int = 1,
block_size: int = 16,
enable_chunked_prefill: bool = False,
enable_chunked_prefill: Optional[bool] = False,
swap_space: int = 4,
enforce_eager: Optional[bool] = False,
load_format: Optional[LoadFormat] = None,
**kwargs,
) -> None:
if model_name in MODELS_ON_S3 and not load_format:
model_name = (f"{MODEL_WEIGHTS_S3_BUCKET}/{model_name}")
load_format = LoadFormat.RUNAI_STREAMER
if not load_format:
load_format = LoadFormat.AUTO
self.model = LLM(
model=model_name,
task=task,
tokenizer=tokenizer_name,
tokenizer_mode=tokenizer_mode,
trust_remote_code=True,
trust_remote_code=trust_remote_code,
dtype=dtype,
seed=seed,
swap_space=swap_space,
enforce_eager=enforce_eager,
disable_log_stats=disable_log_stats,
......@@ -767,17 +742,16 @@ class VllmRunner:
max_model_len=max_model_len,
block_size=block_size,
enable_chunked_prefill=enable_chunked_prefill,
load_format=load_format,
**kwargs,
)
def get_inputs(
self,
prompts: List[str],
prompts: list[str],
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
) -> List[TextPrompt]:
) -> list[TextPrompt]:
if images is not None:
assert len(prompts) == len(images)
......@@ -807,13 +781,13 @@ class VllmRunner:
def generate(
self,
prompts: List[str],
prompts: list[str],
sampling_params: SamplingParams,
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
**kwargs: Any,
) -> List[Tuple[List[List[int]], List[str]]]:
) -> list[tuple[list[list[int]], list[str]]]:
inputs = self.get_inputs(prompts,
images=images,
videos=videos,
......@@ -823,12 +797,12 @@ class VllmRunner:
sampling_params=sampling_params,
**kwargs)
outputs: List[Tuple[List[List[int]], List[str]]] = []
outputs: list[tuple[list[list[int]], list[str]]] = []
for req_output in req_outputs:
prompt_str = req_output.prompt
prompt_ids = req_output.prompt_token_ids
req_sample_output_ids: List[List[int]] = []
req_sample_output_strs: List[str] = []
req_sample_output_ids: list[list[int]] = []
req_sample_output_strs: list[str] = []
for sample in req_output.outputs:
output_str = sample.text
output_ids = list(sample.token_ids)
......@@ -839,9 +813,9 @@ class VllmRunner:
@staticmethod
def _final_steps_generate_w_logprobs(
req_outputs: List[RequestOutput],
) -> List[TokensTextLogprobsPromptLogprobs]:
outputs: List[TokensTextLogprobsPromptLogprobs] = []
req_outputs: list[RequestOutput],
) -> list[TokensTextLogprobsPromptLogprobs]:
outputs: list[TokensTextLogprobsPromptLogprobs] = []
for req_output in req_outputs:
assert len(req_output.outputs) > 0
for sample in req_output.outputs:
......@@ -854,14 +828,14 @@ class VllmRunner:
def generate_w_logprobs(
self,
prompts: List[str],
prompts: list[str],
sampling_params: SamplingParams,
images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None,
videos: Optional[PromptVideoInput] = None,
**kwargs: Any,
) -> Union[List[TokensTextLogprobs],
List[TokensTextLogprobsPromptLogprobs]]:
) -> Union[list[TokensTextLogprobs],
list[TokensTextLogprobsPromptLogprobs]]:
inputs = self.get_inputs(prompts,
images=images,
videos=videos,
......@@ -880,10 +854,10 @@ class VllmRunner:
def generate_encoder_decoder_w_logprobs(
self,
encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
sampling_params: SamplingParams,
) -> Union[List[TokensTextLogprobs],
List[TokensTextLogprobsPromptLogprobs]]:
) -> Union[list[TokensTextLogprobs],
list[TokensTextLogprobsPromptLogprobs]]:
'''
Logprobs generation for vLLM encoder/decoder models
'''
......@@ -900,13 +874,13 @@ class VllmRunner:
def generate_greedy(
self,
prompts: List[str],
prompts: list[str],
max_tokens: int,
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
**kwargs: Any,
) -> List[Tuple[List[int], str]]:
) -> list[tuple[list[int], str]]:
greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
outputs = self.generate(prompts,
greedy_params,
......@@ -919,18 +893,18 @@ class VllmRunner:
def generate_greedy_logprobs(
self,
prompts: List[str],
prompts: list[str],
max_tokens: int,
num_logprobs: int,
num_prompt_logprobs: Optional[int] = None,
images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None,
videos: Optional[PromptVideoInput] = None,
stop_token_ids: Optional[List[int]] = None,
stop: Optional[List[str]] = None,
stop_token_ids: Optional[list[int]] = None,
stop: Optional[list[str]] = None,
**kwargs: Any,
) -> Union[List[TokensTextLogprobs],
List[TokensTextLogprobsPromptLogprobs]]:
) -> Union[list[TokensTextLogprobs],
list[TokensTextLogprobsPromptLogprobs]]:
greedy_logprobs_params = SamplingParams(
temperature=0.0,
max_tokens=max_tokens,
......@@ -948,12 +922,12 @@ class VllmRunner:
def generate_encoder_decoder_greedy_logprobs(
self,
encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
max_tokens: int,
num_logprobs: int,
num_prompt_logprobs: Optional[int] = None,
) -> Union[List[TokensTextLogprobs],
List[TokensTextLogprobsPromptLogprobs]]:
) -> Union[list[TokensTextLogprobs],
list[TokensTextLogprobsPromptLogprobs]]:
greedy_logprobs_params = SamplingParams(
temperature=0.0,
max_tokens=max_tokens,
......@@ -969,10 +943,10 @@ class VllmRunner:
def generate_beam_search(
self,
prompts: Union[List[str], List[List[int]]],
prompts: Union[list[str], list[list[int]]],
beam_width: int,
max_tokens: int,
) -> List[Tuple[List[List[int]], List[str]]]:
) -> list[tuple[list[list[int]], list[str]]]:
if is_list_of(prompts, str, check="all"):
prompts = [TextPrompt(prompt=prompt) for prompt in prompts]
else:
......@@ -989,17 +963,17 @@ class VllmRunner:
returned_outputs.append((token_ids, texts))
return returned_outputs
def classify(self, prompts: List[str]) -> List[List[float]]:
def classify(self, prompts: list[str]) -> list[list[float]]:
req_outputs = self.model.classify(prompts)
return [req_output.outputs.probs for req_output in req_outputs]
def encode(
self,
prompts: List[str],
prompts: list[str],
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
) -> List[List[float]]:
) -> list[list[float]]:
inputs = self.get_inputs(prompts,
images=images,
videos=videos,
......@@ -1010,9 +984,9 @@ class VllmRunner:
def score(
self,
text_1: Union[str, List[str]],
text_2: Union[str, List[str]],
) -> List[float]:
text_1: Union[str, list[str]],
text_2: Union[str, list[str]],
) -> list[float]:
req_outputs = self.model.score(text_1, text_2)
return [req_output.outputs.score for req_output in req_outputs]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment