Commit 705f6a35 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.5.2' into v0.5.2-dtk24.04.1

parents af837396 4cf256ae
......@@ -4,6 +4,8 @@
# Dependencies for NVIDIA GPUs
ray >= 2.9
nvidia-ml-py # for pynvml package
torch == 2.3.0
xformers == 0.0.26.post1 # Requires PyTorch 2.3.0
vllm-flash-attn == 2.5.9 # Requires PyTorch 2.3.0
torch == 2.3.1
# These must be updated alongside torch
torchvision == 0.18.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
xformers == 0.0.27 # Requires PyTorch 2.3.1
vllm-flash-attn == 2.5.9.post1 # Requires PyTorch 2.3.1
# formatting
yapf==0.32.0
toml==0.10.2
tomli==2.0.1
ruff==0.1.5
codespell==2.2.6
isort==5.13.2
clang-format==18.1.5
-r requirements-lint.txt
-r requirements-test.txt
# type checking
mypy==1.9.0
types-PyYAML
types-requests
types-setuptools
# testing
pytest
tensorizer>=2.9.0
pytest-forked
pytest-asyncio
pytest-rerunfailures
pytest-shard
# testing utils
awscli
einops # required for MPT
httpx
peft
requests
ray
sentence-transformers # required for embedding
# Benchmarking
aiohttp
# quantization
bitsandbytes==0.42.0
# Avoid adding requirements directly to this file.
# Instead, modify the two files referenced above.
# formatting
yapf==0.32.0
toml==0.10.2
tomli==2.0.1
ruff==0.1.5
codespell==2.3.0
isort==5.13.2
clang-format==18.1.5
# type checking
mypy==1.9.0
types-PyYAML
types-requests
types-setuptools
# Mamba dependencies
mamba-ssm>=1.2.2
causal-conv1d>=1.2.0
# Common dependencies
-r requirements-common.txt
# OpenVINO dependencies
torch >= 2.1.2
openvino ~= 2024.3.0.dev
optimum-intel[openvino] >= 1.18.1
triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.
......@@ -2,6 +2,5 @@
-r requirements-common.txt
# Dependencies for AMD GPUs
ray == 2.9.1
# ray >= 2.10.0
ray >= 2.10.0
pytest-asyncio
# testing
pytest
tensorizer>=2.9.0
pytest-forked
pytest-asyncio
pytest-rerunfailures
pytest-shard
# testing utils
awscli
einops # required for MPT
httpx
peft
requests
ray
sentence-transformers # required for embedding
sparseml==1.8.0 # required for compressed-tensors
compressed-tensors==0.4.0 # required for compressed-tensors
# Benchmarking
aiohttp
# quantization
bitsandbytes==0.42.0
\ No newline at end of file
# Common dependencies
-r requirements-common.txt
# Dependencies for TPU
# Currently, the TPU backend uses a nightly version of PyTorch XLA.
# You can install the dependencies in Dockerfile.tpu.
triton # To avoid import errors
# Common dependencies
-r requirements-common.txt
setuptools < 70.0.0 # IPEX's torch has a dependency on this; to be removed.
torch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/torch-2.1.0.post1%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl
intel_extension_for_pytorch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl
oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.1.200%2Bxpu-cp310-cp310-linux_x86_64.whl
triton @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
......@@ -5,6 +5,7 @@ import os
import re
import subprocess
import sys
import warnings
from shutil import which
from typing import Dict, List
......@@ -30,6 +31,34 @@ def load_module_from_path(module_name, path):
ROOT_DIR = os.path.dirname(__file__)
logger = logging.getLogger(__name__)
def embed_commit_hash():
    """Write the current commit id into ``vllm/commit_id.py``.

    The id comes from the ``BUILDKITE_COMMIT`` environment variable when it
    is set, otherwise from ``git rev-parse HEAD``. The file is written under
    ``ROOT_DIR`` and contains a single ``__commit__ = "<id>"`` line.

    Failures never propagate: they are reported as ``RuntimeWarning``s so the
    surrounding build can continue without the embedded hash.
    """
    try:
        # CI builds export the commit id; everything else has to ask git.
        commit_id = os.environ.get("BUILDKITE_COMMIT")
        if commit_id is None:
            git_output = subprocess.check_output(
                ["git", "rev-parse", "HEAD"], encoding="utf-8")
            commit_id = git_output.strip()

        target_path = os.path.join(ROOT_DIR, "vllm", "commit_id.py")
        with open(target_path, "w", encoding="utf-8") as out:
            out.write(f'__commit__ = "{commit_id}"\n')
    except subprocess.CalledProcessError as err:
        # git ran but returned a non-zero exit status.
        warnings.warn(f"Failed to get commit hash:\n{err}",
                      RuntimeWarning,
                      stacklevel=2)
    except Exception as err:
        # Anything else (git missing, unwritable file, ...).
        warnings.warn(f"Failed to embed commit hash:\n{err}",
                      RuntimeWarning,
                      stacklevel=2)
embed_commit_hash()
# cannot import envs directly because it depends on vllm,
# which is not installed yet
envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
......@@ -144,6 +173,7 @@ class cmake_build_ext(build_ext):
cmake_args += [
'-DCMAKE_CXX_COMPILER_LAUNCHER=sccache',
'-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache',
'-DCMAKE_C_COMPILER_LAUNCHER=sccache',
]
elif is_ccache_available():
cmake_args += [
......@@ -175,7 +205,6 @@ class cmake_build_ext(build_ext):
else:
# Default build tool to whatever cmake picks.
build_tool = []
subprocess.check_call(
['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args],
cwd=self.build_temp)
......@@ -210,9 +239,9 @@ class cmake_build_ext(build_ext):
def _is_cuda() -> bool:
    """Return True when the build targets CUDA.

    All of the following must hold: ``VLLM_TARGET_DEVICE`` is ``"cuda"``,
    the installed torch reports a CUDA version, and the build is neither a
    Neuron nor a TPU build.
    """
    # A single return: the earlier variant returned before the TPU exclusion
    # could ever be evaluated.
    has_cuda = torch.version.cuda is not None
    return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
            and not (_is_neuron() or _is_tpu()))
def _is_hip() -> bool:
......@@ -229,10 +258,26 @@ def _is_neuron() -> bool:
return torch_neuronx_installed or VLLM_TARGET_DEVICE == "neuron"
def _is_tpu() -> bool:
    """Return True when ``VLLM_TARGET_DEVICE`` is ``"tpu"``."""
    targets_tpu = VLLM_TARGET_DEVICE == "tpu"
    return targets_tpu
def _is_cpu() -> bool:
    """Return True when ``VLLM_TARGET_DEVICE`` is ``"cpu"``."""
    targets_cpu = VLLM_TARGET_DEVICE == "cpu"
    return targets_cpu
def _is_openvino() -> bool:
    """Return True when ``VLLM_TARGET_DEVICE`` is ``"openvino"``."""
    targets_openvino = VLLM_TARGET_DEVICE == "openvino"
    return targets_openvino
def _is_xpu() -> bool:
    """Return True when ``VLLM_TARGET_DEVICE`` is ``"xpu"``."""
    targets_xpu = VLLM_TARGET_DEVICE == "xpu"
    return targets_xpu
def _build_custom_ops() -> bool:
    """Return True when the target is CUDA, HIP or CPU.

    The checks run in that order and stop at the first one that succeeds.
    """
    platform_checks = (_is_cuda, _is_hip, _is_cpu)
    return any(check() for check in platform_checks)
def _install_punica() -> bool:
    """Return the value of ``envs.VLLM_INSTALL_PUNICA_KERNELS``."""
    install_requested = envs.VLLM_INSTALL_PUNICA_KERNELS
    return install_requested
......@@ -350,8 +395,8 @@ def get_version_add(sha: Optional[str] = None) -> str:
version += ".torch" + torch.__version__[:5]
with open(add_version_path, encoding="utf-8",mode="w") as file:
file.write("__version__='0.5.0'\n")
file.write("__dcu_version__='0.5.0+{}'\n".format(version))
file.write("__version__='0.5.2'\n")
file.write("__dcu_version__='0.5.2+{}'\n".format(version))
file.close()
......@@ -364,7 +409,7 @@ def get_version():
def get_vllm_version() -> str:
version = find_version(get_path("vllm", "__init__.py"))
# version = find_version(get_path("vllm", "version.py"))
if _is_cuda():
cuda_version = str(get_nvcc_cuda_version())
......@@ -384,8 +429,14 @@ def get_vllm_version() -> str:
if neuron_version != MAIN_CUDA_VERSION:
neuron_version_str = neuron_version.replace(".", "")[:3]
version += f"+neuron{neuron_version_str}"
elif _is_openvino():
version += "+openvino"
elif _is_tpu():
version += "+tpu"
elif _is_cpu():
version += "+cpu"
elif _is_xpu():
version += "+xpu"
else:
raise RuntimeError("Unknown runtime environment")
......@@ -431,11 +482,18 @@ def get_requirements() -> List[str]:
requirements = _read_requirements("requirements-rocm.txt")
elif _is_neuron():
requirements = _read_requirements("requirements-neuron.txt")
elif _is_openvino():
requirements = _read_requirements("requirements-openvino.txt")
elif _is_tpu():
requirements = _read_requirements("requirements-tpu.txt")
elif _is_cpu():
requirements = _read_requirements("requirements-cpu.txt")
elif _is_xpu():
requirements = _read_requirements("requirements-xpu.txt")
else:
raise ValueError(
"Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.")
"Unsupported platform, please use CUDA, ROCm, Neuron, "
"OpenVINO, or CPU.")
return requirements
......@@ -444,7 +502,7 @@ ext_modules = []
if _is_cuda() or _is_hip():
ext_modules.append(CMakeExtension(name="vllm._moe_C"))
if not _is_neuron():
if _build_custom_ops():
ext_modules.append(CMakeExtension(name="vllm._C"))
if _install_punica():
......@@ -487,6 +545,11 @@ setup(
extras_require={
"tensorizer": ["tensorizer>=2.9.0"],
},
cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {},
cmdclass={"build_ext": cmake_build_ext} if _build_custom_ops() else {},
package_data=package_data,
entry_points={
"console_scripts": [
"vllm=vllm.scripts:main",
],
},
)
"""vllm.entrypoints.api_server with some extra logging for testing."""
import argparse
from typing import Any, Dict
import uvicorn
......@@ -8,6 +7,7 @@ from fastapi.responses import JSONResponse, Response
import vllm.entrypoints.api_server
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.utils import FlexibleArgumentParser
app = vllm.entrypoints.api_server.app
......@@ -33,7 +33,7 @@ def stats() -> Response:
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser = FlexibleArgumentParser()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
parser = AsyncEngineArgs.add_cli_args(parser)
......
......@@ -2,8 +2,13 @@ import asyncio
from dataclasses import dataclass
import pytest
import torch
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm import SamplingParams
from vllm.config import ParallelConfig
from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
from ..utils import wait_for_gpu_memory_to_clear
@dataclass
......@@ -19,8 +24,11 @@ class MockEngine:
self.add_request_calls = 0
self.abort_request_calls = 0
self.request_id = None
# Ugly, remove dependency when possible
self.parallel_config = ParallelConfig(1, 1, False)
async def step_async(self):
async def step_async(self, virtual_engine):
# PP size is 1, ignore virtual engine
self.step_calls += 1
return [RequestOutput(
request_id=self.request_id)] if self.request_id else []
......@@ -28,6 +36,9 @@ class MockEngine:
async def process_model_inputs_async(self, *args, **kwargs):
pass
async def stop_remote_worker_execution_loop_async(self):
pass
def generate(self, request_id):
self.request_id = request_id
......@@ -37,6 +48,7 @@ class MockEngine:
def add_request(self, **kwargs):
del kwargs # Unused
self.add_request_calls += 1
print(f'Request calls: {self.add_request_calls}')
async def add_request_async(self, **kwargs):
self.add_request_calls += 1
......@@ -49,6 +61,9 @@ class MockEngine:
def has_unfinished_requests(self):
return self.request_id is not None
def has_unfinished_requests_for_virtual_engine(self, virtual_engine):
return self.request_id is not None
class MockAsyncLLMEngine(AsyncLLMEngine):
......@@ -72,6 +87,7 @@ async def test_new_requests_event():
engine.engine.generate("2")
await asyncio.sleep(0)
await asyncio.sleep(0)
await asyncio.sleep(0)
assert engine.engine.add_request_calls == 2
assert engine.engine.step_calls >= 2
await asyncio.sleep(0.001)
......@@ -94,3 +110,35 @@ async def test_new_requests_event():
assert engine.get_model_config() is not None
assert engine.get_tokenizer() is not None
assert engine.get_decoding_config() is not None
def test_asyncio_run():
    """Drive two concurrent generations through ``asyncio.run``.

    Calls ``wait_for_gpu_memory_to_clear`` for every visible CUDA device
    (threshold 2 GiB, timeout 60 s), builds an ``AsyncLLMEngine`` for
    ``facebook/opt-125m`` and checks that both prompts produce a final
    output.
    """
    wait_for_gpu_memory_to_clear(
        devices=list(range(torch.cuda.device_count())),
        threshold_bytes=2 * 2**30,
        timeout_s=60,
    )

    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="facebook/opt-125m"))

    async def run(prompt: str):
        """Consume the output stream for ``prompt`` and return the last item."""
        sampling_params = SamplingParams(
            temperature=0,
            max_tokens=32,
        )
        # Start from None so an empty stream surfaces as a failed assertion
        # below rather than an UnboundLocalError here.
        final_output = None
        async for output in engine.generate(prompt,
                                            sampling_params,
                                            request_id=prompt):
            final_output = output
        return final_output

    async def generate():
        """Run both prompts concurrently on the same engine."""
        return await asyncio.gather(
            run("test0"),
            run("test1"),
        )

    results = asyncio.run(generate())
    assert len(results) == 2
    assert all(result is not None for result in results)
import openai # use the official client for correctness check
import pytest
# using Ray for overall ease of process management, parallel requests,
# and debugging.
import ray
from ..utils import ServerRunner
from ..utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "facebook/opt-125m"
......@@ -12,34 +9,27 @@ MODEL_NAME = "facebook/opt-125m"
@pytest.fixture(scope="module")
def server():
    """Yield a ``RemoteOpenAIServer`` serving ``MODEL_NAME`` for the module.

    The server is used as a context manager, so it is torn down when the
    module's tests finish. Only one code path (and one ``yield``) remains;
    the previous Ray ``ServerRunner`` start-up was superseded.
    """
    with RemoteOpenAIServer([
            "--model",
            MODEL_NAME,
            # use half precision for speed and memory savings in CI environment
            "--dtype",
            "float16",
            "--max-model-len",
            "2048",
            "--enforce-eager",
            "--engine-use-ray"
    ]) as remote_server:
        yield remote_server
@pytest.fixture(scope="module")
def client(server):
    """Return the async client provided by the ``server`` fixture.

    The client is obtained from the server object itself rather than built
    against a hard-coded base URL.
    """
    return server.get_async_client()
@pytest.mark.asyncio
async def test_check_models(server, client: openai.AsyncOpenAI):
async def test_check_models(client: openai.AsyncOpenAI):
models = await client.models.list()
models = models.data
served_model = models[0]
......@@ -48,7 +38,7 @@ async def test_check_models(server, client: openai.AsyncOpenAI):
@pytest.mark.asyncio
async def test_single_completion(server, client: openai.AsyncOpenAI):
async def test_single_completion(client: openai.AsyncOpenAI):
completion = await client.completions.create(model=MODEL_NAME,
prompt="Hello, my name is",
max_tokens=5,
......@@ -72,7 +62,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI):
@pytest.mark.asyncio
async def test_single_chat_session(server, client: openai.AsyncOpenAI):
async def test_single_chat_session(client: openai.AsyncOpenAI):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
......
......@@ -8,12 +8,14 @@ import weakref
import pytest
from vllm import LLM
from vllm.utils import is_hip
from ..models.utils import check_outputs_equal
MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
]
VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND"
def test_vllm_gc_ed():
......@@ -27,6 +29,7 @@ def test_vllm_gc_ed():
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [False, True])
......@@ -35,13 +38,16 @@ def test_models(
vllm_runner,
example_prompts,
model: str,
backend: str,
dtype: str,
max_tokens: int,
enforce_eager: bool,
) -> None:
backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
if backend_by_env_var == "FLASHINFER" and enforce_eager is False:
pytest.skip("Skipping non-eager test for FlashInferBackend.")
if backend == "FLASHINFER" and is_hip():
pytest.skip("Flashinfer does not support ROCm/HIP.")
os.environ["VLLM_ATTENTION_BACKEND"] = backend
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
......@@ -52,10 +58,9 @@ def test_models(
gpu_memory_utilization=0.7) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
vllm_output_ids, vllm_output_str = vllm_outputs[i]
assert hf_output_str == vllm_output_str, (
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
assert hf_output_ids == vllm_output_ids, (
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
......@@ -8,6 +8,8 @@ Run `pytest tests/models/test_chunked_prefill.py`.
"""
import pytest
from ..models.utils import check_outputs_equal
MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
......@@ -54,10 +56,9 @@ def test_models(
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
vllm_output_ids, vllm_output_str = vllm_outputs[i]
assert hf_output_str == vllm_output_str, (
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
assert hf_output_ids == vllm_output_ids, (
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
......@@ -12,6 +12,8 @@ from vllm import SamplingParams
from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
ENABLE_ARTIFICIAL_PREEMPT)
from ..models.utils import check_outputs_equal
MODELS = [
"facebook/opt-125m",
]
......@@ -54,8 +56,8 @@ def test_chunked_prefill_recompute(
max_num_seqs=max_num_seqs,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
ARTIFICIAL_PREEMPTION_MAX_CNT)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
......@@ -89,18 +91,18 @@ def test_preemption(
disable_log_stats=False,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
ARTIFICIAL_PREEMPTION_MAX_CNT)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
total_preemption = (
vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
vllm_output_ids, vllm_output_str = vllm_outputs[i]
assert hf_output_str == vllm_output_str, (
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
assert hf_output_ids == vllm_output_ids, (
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
assert ("is preempted by PreemptionMode.RECOMPUTE mode because there "
"is not enough KV cache space." in caplog_vllm.text)
# Ensure the count bucket of request-level histogram metrics matches
......@@ -145,10 +147,10 @@ def test_swap(
) as vllm_model:
vllm_outputs = vllm_model.generate_beam_search(example_prompts,
beam_width, max_tokens)
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
ARTIFICIAL_PREEMPTION_MAX_CNT)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
total_preemption = (
vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
for i in range(len(example_prompts)):
hf_output_ids, _ = hf_outputs[i]
......@@ -212,8 +214,8 @@ def test_swap_infeasible(
example_prompts,
sampling_params=sampling_params,
)
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
ARTIFICIAL_PREEMPTION_MAX_CNT)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
# Verify the request is ignored and not hang.
assert req_outputs[0].outputs[0].finish_reason == "length"
......@@ -250,8 +252,8 @@ def test_preemption_infeasible(
sampling_params=sampling_params,
)
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
ARTIFICIAL_PREEMPTION_MAX_CNT)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
# Verify the request is ignored and not hang.
for req_output in req_outputs:
......
import contextlib
import gc
import os
import subprocess
import sys
from typing import Any, Dict, List, Optional, Tuple, TypeVar
from collections import UserList
from dataclasses import dataclass
from functools import cached_property
from pathlib import Path
from typing import (Any, Dict, List, Literal, Optional, Tuple, TypedDict,
TypeVar)
import pytest
import torch
......@@ -11,17 +15,17 @@ import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq,
AutoProcessor, AutoTokenizer, BatchEncoding)
AutoTokenizer, BatchEncoding)
from vllm import LLM, SamplingParams
from vllm.config import TokenizerPoolConfig, VisionLanguageConfig
from vllm.distributed import destroy_model_parallel
from vllm.config import TokenizerPoolConfig
from vllm.distributed import (destroy_distributed_environment,
destroy_model_parallel)
from vllm.inputs import TextPrompt
from vllm.logger import init_logger
from vllm.multimodal import MultiModalData
from vllm.multimodal.image import ImageFeatureData, ImagePixelData
from vllm.multimodal.utils import fetch_image
from vllm.sequence import SampleLogprobs
from vllm.utils import is_cpu
from vllm.utils import cuda_device_count_stateless, is_cpu
logger = init_logger(__name__)
......@@ -29,21 +33,8 @@ _TEST_DIR = os.path.dirname(__file__)
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
# Multi modal related
# You can use `.buildkite/download-images.sh` to download the assets
PIXEL_VALUES_FILES = [
os.path.join(_TEST_DIR, "images", filename) for filename in
["stop_sign_pixel_values.pt", "cherry_blossom_pixel_values.pt"]
]
IMAGE_FEATURES_FILES = [
os.path.join(_TEST_DIR, "images", filename) for filename in
["stop_sign_image_features.pt", "cherry_blossom_image_features.pt"]
]
IMAGE_FILES = [
os.path.join(_TEST_DIR, "images", filename)
for filename in ["stop_sign.jpg", "cherry_blossom.jpg"]
]
assert len(PIXEL_VALUES_FILES) == len(IMAGE_FEATURES_FILES) == len(IMAGE_FILES)
_IMAGE_DIR = Path(_TEST_DIR) / "images"
"""You can use `.buildkite/download-images.sh` to download the assets."""
def _read_prompts(filename: str) -> List[str]:
......@@ -52,8 +43,65 @@ def _read_prompts(filename: str) -> List[str]:
return prompts
@dataclass(frozen=True)
class ImageAsset:
    """A named test image that is loaded on first access."""

    # Identifier of the asset; also the stem of the local ``.jpg`` file.
    name: Literal["stop_sign", "cherry_blossom", "boardwalk"]

    @cached_property
    def pil_image(self) -> Image.Image:
        """Return the image, cached after the first access.

        ``boardwalk`` is fetched from its Wikimedia URL; every other asset is
        opened from ``_IMAGE_DIR / "<name>.jpg"``.
        """
        if self.name != "boardwalk":
            local_path = _IMAGE_DIR / f"{self.name}.jpg"
            return Image.open(local_path)

        return fetch_image(
            "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
        )
class _ImageAssetPrompts(TypedDict):
stop_sign: str
cherry_blossom: str
boardwalk: str
# Pick the base class for _ImageAssets: a parametrized UserList where the
# interpreter supports subscripting it, a plain UserList otherwise.
if sys.version_info >= (3, 9):

    class _ImageAssetsBase(UserList[ImageAsset]):
        pass
else:
    # UserList cannot be subscripted

    class _ImageAssetsBase(UserList):
        pass
class _ImageAssets(_ImageAssetsBase):
    """List of the three test image assets, in a fixed order."""

    def __init__(self) -> None:
        asset_names = ("stop_sign", "cherry_blossom", "boardwalk")
        super().__init__([ImageAsset(asset_name) for asset_name in asset_names])

    def prompts(self, prompts: _ImageAssetPrompts) -> List[str]:
        """
        Convenience method to define the prompt for each test image.

        The returned list is ordered stop_sign, cherry_blossom, boardwalk,
        which is the order in which the assets are created in ``__init__``.
        """
        ordered_keys = ("stop_sign", "cherry_blossom", "boardwalk")
        return [prompts[key] for key in ordered_keys]
IMAGE_ASSETS = _ImageAssets()
"""Singleton instance of :class:`_ImageAssets`."""
def cleanup():
destroy_model_parallel()
destroy_distributed_environment()
with contextlib.suppress(AssertionError):
torch.distributed.destroy_process_group()
gc.collect()
......@@ -81,31 +129,6 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
cleanup()
@pytest.fixture(scope="session")
def hf_images() -> List[Image.Image]:
return [Image.open(filename) for filename in IMAGE_FILES]
@pytest.fixture()
def vllm_images(request) -> List[MultiModalData]:
vision_language_config = request.getfixturevalue("model_and_config")[1]
if vision_language_config.image_input_type == (
VisionLanguageConfig.ImageInputType.IMAGE_FEATURES):
return [
ImageFeatureData(torch.load(filename))
for filename in IMAGE_FEATURES_FILES
]
else:
return [
ImagePixelData(Image.open(filename)) for filename in IMAGE_FILES
]
@pytest.fixture()
def vllm_image_tensors(request) -> List[torch.Tensor]:
return [torch.load(filename) for filename in PIXEL_VALUES_FILES]
@pytest.fixture
def example_prompts() -> List[str]:
prompts = []
......@@ -122,6 +145,11 @@ def example_long_prompts() -> List[str]:
return prompts
@pytest.fixture(scope="session")
def image_assets() -> _ImageAssets:
    """Session-scoped fixture that hands out the ``IMAGE_ASSETS`` singleton."""
    shared_assets = IMAGE_ASSETS
    return shared_assets
_STR_DTYPE_TO_TORCH_DTYPE = {
"half": torch.half,
"bfloat16": torch.bfloat16,
......@@ -144,8 +172,10 @@ class HfRunner:
model_name: str,
dtype: str = "half",
*,
model_kwargs: Optional[Dict[str, Any]] = None,
is_embedding_model: bool = False,
is_vision_model: bool = False,
is_sparseml_model: bool = False,
) -> None:
assert dtype in _STR_DTYPE_TO_TORCH_DTYPE
torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
......@@ -163,14 +193,19 @@ class HfRunner:
else:
if is_vision_model:
auto_cls = AutoModelForVision2Seq
elif is_sparseml_model:
from sparseml.transformers import SparseAutoModelForCausalLM
auto_cls = SparseAutoModelForCausalLM
else:
auto_cls = AutoModelForCausalLM
model_kwargs = model_kwargs if model_kwargs is not None else {}
self.model = self.wrap_device(
auto_cls.from_pretrained(
model_name,
torch_dtype=torch_dtype,
trust_remote_code=True,
**model_kwargs,
))
self.tokenizer = AutoTokenizer.from_pretrained(
......@@ -180,6 +215,9 @@ class HfRunner:
)
try:
# don't put this import at the top level
# it will call torch.cuda.device_count()
from transformers import AutoProcessor # noqa: F401
self.processor = AutoProcessor.from_pretrained(
model_name,
torch_dtype=torch_dtype,
......@@ -195,7 +233,7 @@ class HfRunner:
self,
prompts: List[str],
images: Optional[List[Image.Image]] = None,
**kwargs,
**kwargs: Any,
) -> List[Tuple[List[List[int]], List[str]]]:
if images:
assert len(prompts) == len(images)
......@@ -230,11 +268,13 @@ class HfRunner:
prompts: List[str],
max_tokens: int,
images: Optional[List[Image.Image]] = None,
**kwargs: Any,
) -> List[Tuple[List[int], str]]:
outputs = self.generate(prompts,
do_sample=False,
max_new_tokens=max_tokens,
images=images)
images=images,
**kwargs)
return [(output_ids[0], output_str[0])
for output_ids, output_str in outputs]
......@@ -264,19 +304,30 @@ class HfRunner:
self,
prompts: List[str],
max_tokens: int,
images: Optional[List[Image.Image]] = None,
**kwargs: Any,
) -> List[List[torch.Tensor]]:
all_logprobs = []
for prompt in prompts:
input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
all_logprobs: List[List[torch.Tensor]] = []
for i, prompt in enumerate(prompts):
processor_kwargs: Dict[str, Any] = {
"text": prompt,
"return_tensors": "pt",
}
if images is not None and images[i] is not None:
processor_kwargs["images"] = images[i]
inputs = self.processor(**processor_kwargs)
output = self.model.generate(
self.wrap_device(input_ids),
**self.wrap_device(inputs),
use_cache=True,
do_sample=False,
max_new_tokens=max_tokens,
output_hidden_states=True,
return_dict_in_generate=True,
**kwargs,
)
seq_logprobs = []
seq_logprobs: List[torch.Tensor] = []
for hidden_states in output.hidden_states:
last_hidden_states = hidden_states[-1][0]
logits = torch.matmul(
......@@ -296,20 +347,32 @@ class HfRunner:
prompts: List[str],
max_tokens: int,
num_logprobs: int,
images: Optional[List[Image.Image]] = None,
**kwargs: Any,
) -> List[Tuple[List[int], str, List[Dict[int, float]]]]:
all_logprobs: List[List[Dict[int, float]]] = []
all_output_ids: List[List[int]] = []
all_output_strs: List[str] = []
for prompt in prompts:
input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
for i, prompt in enumerate(prompts):
processor_kwargs: Dict[str, Any] = {
"text": prompt,
"return_tensors": "pt",
}
if images is not None and images[i] is not None:
processor_kwargs["images"] = images[i]
inputs = self.processor(**processor_kwargs)
input_ids = inputs.input_ids
output = self.model.generate(
self.wrap_device(input_ids),
**self.wrap_device(inputs),
use_cache=True,
do_sample=False,
max_new_tokens=max_tokens,
output_hidden_states=True,
return_dict_in_generate=True,
**kwargs,
)
seq_logprobs: List[torch.Tensor] = []
......@@ -362,7 +425,7 @@ class HfRunner:
cleanup()
@pytest.fixture(scope="session")
def hf_runner():
    """Return the ``HfRunner`` class itself (not an instance).

    The fixture decorator is applied once, with session scope; stacking the
    old function-scoped decorator on top of it is rejected by pytest.
    """
    return HfRunner
......@@ -382,6 +445,7 @@ class VllmRunner:
block_size: int = 16,
enable_chunked_prefill: bool = False,
swap_space: int = 4,
enforce_eager: bool = False,
**kwargs,
) -> None:
self.model = LLM(
......@@ -390,6 +454,7 @@ class VllmRunner:
trust_remote_code=True,
dtype=dtype,
swap_space=swap_space,
enforce_eager=enforce_eager,
disable_log_stats=disable_log_stats,
tensor_parallel_size=tensor_parallel_size,
max_model_len=max_model_len,
......@@ -402,7 +467,7 @@ class VllmRunner:
self,
prompts: List[str],
sampling_params: SamplingParams,
images: Optional[List[MultiModalData]] = None,
images: Optional[List[Image.Image]] = None,
) -> List[Tuple[List[List[int]], List[str]]]:
if images is not None:
assert len(prompts) == len(images)
......@@ -410,7 +475,7 @@ class VllmRunner:
inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
if images is not None:
for i, image in enumerate(images):
inputs[i]["multi_modal_data"] = image
inputs[i]["multi_modal_data"] = {"image": image}
req_outputs = self.model.generate(inputs,
sampling_params=sampling_params)
......@@ -423,7 +488,7 @@ class VllmRunner:
req_sample_output_strs: List[str] = []
for sample in req_output.outputs:
output_str = sample.text
output_ids = sample.token_ids
output_ids = list(sample.token_ids)
req_sample_output_ids.append(prompt_ids + output_ids)
req_sample_output_strs.append(prompt_str + output_str)
outputs.append((req_sample_output_ids, req_sample_output_strs))
......@@ -433,10 +498,19 @@ class VllmRunner:
self,
prompts: List[str],
sampling_params: SamplingParams,
images: Optional[List[Image.Image]] = None,
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
assert sampling_params.logprobs is not None
req_outputs = self.model.generate(prompts,
if images is not None:
assert len(prompts) == len(images)
inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
if images is not None:
for i, image in enumerate(images):
inputs[i]["multi_modal_data"] = {"image": image}
req_outputs = self.model.generate(inputs,
sampling_params=sampling_params)
outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = []
for req_output in req_outputs:
......@@ -451,7 +525,7 @@ class VllmRunner:
self,
prompts: List[str],
max_tokens: int,
images: Optional[List[MultiModalData]] = None,
images: Optional[List[Image.Image]] = None,
) -> List[Tuple[List[int], str]]:
greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
outputs = self.generate(prompts, greedy_params, images=images)
......@@ -463,11 +537,14 @@ class VllmRunner:
prompts: List[str],
max_tokens: int,
num_logprobs: int,
images: Optional[List[Image.Image]] = None,
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
greedy_logprobs_params = SamplingParams(temperature=0.0,
max_tokens=max_tokens,
logprobs=num_logprobs)
outputs = self.generate_w_logprobs(prompts, greedy_logprobs_params)
outputs = self.generate_w_logprobs(prompts,
greedy_logprobs_params,
images=images)
return [(output_ids, output_str, output_logprobs)
for output_ids, output_str, output_logprobs in outputs]
......@@ -537,15 +614,4 @@ def num_gpus_available():
"""Get number of GPUs without initializing the CUDA context
in current process."""
try:
out = subprocess.run([
sys.executable, "-c",
"import torch; print(torch.cuda.device_count())"
],
capture_output=True,
check=True,
text=True)
except subprocess.CalledProcessError as e:
logger.warning("Failed to get number of GPUs.", exc_info=e)
return 0
return int(out.stdout.strip())
return cuda_device_count_stateless()
......@@ -477,3 +477,70 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
assert expected_token_ids == actual_token_ids
assert baseline_token_ids == test_token_ids
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Use a small model for a fast test.
        "model": "facebook/opt-125m",

        # Skip CUDA graph creation for a fast test.
        "enforce_eager": True,

        # Keep the block budget small so that eviction starts quickly.
        "max_model_len": 48,
        "block_size": 16,
        "num_gpu_blocks_override": 3,

        # Test APC in v2 block
        "use_v2_block_manager": True,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
    "enable_prefix_caching": False
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
    "enable_prefix_caching": True,
}])
@pytest.mark.parametrize("seed", [1])
def test_auto_prefix_caching_after_evition_start(baseline_llm_generator,
                                                 test_llm_generator):
    """Verify that block manager v2 with automatic prefix caching (APC)
    keeps working normally once eviction has started.

    With APC enabled, all blocks are initially held by the native block
    allocator; afterwards they are managed by the evictor. A cache hit on a
    block held by the evictor lets it be reused; otherwise its KV cache has
    to be recomputed. The test compares greedy output with APC disabled and
    enabled.
    """
    prompts = [
        "You are a helpful assistant. Please answer truthfully and write "
        "out your thinking step by step to be sure you get the right answer. "
        "If you make a mistake, attempt to correct it. who are you?",
        "You are a helpful assistant. Please answer truthfully and write out "
        "your thinking step by step to be sure you get the right answer. You "
        "are helpful and harmless and you follow ethical guidelines. "
        "who are you?"
    ]

    # Greedy sampling with a fixed output length for both runs.
    sampling_params = SamplingParams(
        max_tokens=10,
        ignore_eos=True,
        temperature=0.0,
    )

    print('Getting token ids with APC disabled')
    baseline_token_ids = get_token_ids_from_llm_generator(
        baseline_llm_generator, prompts, sampling_params)

    print('Getting token ids with APC enabled')
    test_token_ids = get_token_ids_from_llm_generator(
        test_llm_generator, prompts, sampling_params)

    # Compare prompt by prompt first, then the two result lists as a whole.
    for expected, actual in zip(baseline_token_ids, test_token_ids):
        assert expected == actual

    assert baseline_token_ids == test_token_ids
from typing import List
import pytest
from vllm.core.block.block_table import BlockTable
......@@ -28,7 +30,7 @@ def test_allocate_naive(block_size: int, sequence_len: int):
token_ids = list(range(sequence_len))
num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
block_tables = []
block_tables: List[BlockTable] = []
for i in range(5):
assert allocator.get_num_free_blocks(
device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc
......@@ -73,7 +75,7 @@ def test_allocate_prefix_caching(block_size: int, sequence_len: int):
num_immutable_blocks_per_alloc = len(
chunked_tokens) - num_mutable_blocks_per_alloc
block_tables = []
block_tables: List[BlockTable] = []
for alloc_i in range(1, 6):
block_tables.append(
......@@ -268,7 +270,7 @@ def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
)
block_table.allocate(token_ids=token_ids, device=Device.GPU)
appended_so_far = []
appended_so_far: List[int] = []
for append in chunk_list(token_ids_to_append, append_size):
block_table.append_token_ids(append)
appended_so_far.extend(append)
......@@ -371,8 +373,9 @@ def test_cow(block_size: int, sequence_len: int, append_len: int,
block_size) - (sequence_len // block_size)
original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
original_block_ids = original_block_table.physical_block_ids
original_block_ids = original_block_table.physical_block_ids[:]
print("original_block_ids = {}".format(original_block_ids))
forked_block_table = original_block_table.fork()
# Expect no additional allocation (copy on _write_).
......@@ -455,7 +458,7 @@ def test_cow_lookahead_simple(block_size: int, sequence_len: int,
# Allocate lookahead slots.
original_block_table.ensure_num_empty_slots(lookahead_slots)
original_block_ids = original_block_table.physical_block_ids
original_block_ids = original_block_table.physical_block_ids[:]
forked_block_table = original_block_table.fork()
......
......@@ -8,8 +8,8 @@ from vllm.utils import Device, chunk_list
@pytest.mark.parametrize("num_gpu_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_allocate_mutable(num_cpu_blocks: int, num_gpu_blocks: int,
block_size: int, allocator_type: str):
def test_allocate_mutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
block_size: int, allocator_type: str):
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
......@@ -21,14 +21,14 @@ def test_allocate_mutable(num_cpu_blocks: int, num_gpu_blocks: int,
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
cpu_blocks = [
allocator.allocate_mutable(prev_block=None, device=Device.CPU)
allocator.allocate_mutable_block(prev_block=None, device=Device.CPU)
for _ in range(num_cpu_blocks)
]
assert allocator.get_num_free_blocks(Device.CPU) == 0
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
gpu_blocks = [
allocator.allocate_mutable(prev_block=None, device=Device.GPU)
allocator.allocate_mutable_block(prev_block=None, device=Device.GPU)
for _ in range(num_gpu_blocks)
]
assert allocator.get_num_free_blocks(Device.CPU) == 0
......@@ -47,8 +47,8 @@ def test_allocate_mutable(num_cpu_blocks: int, num_gpu_blocks: int,
@pytest.mark.parametrize("num_gpu_blocks", [1024])
@pytest.mark.parametrize("block_size", [2])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_allocate_immutable(num_cpu_blocks: int, num_gpu_blocks: int,
block_size: int, allocator_type: str):
def test_allocate_immutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
block_size: int, allocator_type: str):
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
......@@ -67,18 +67,18 @@ def test_allocate_immutable(num_cpu_blocks: int, num_gpu_blocks: int,
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
cpu_blocks = [
allocator.allocate_immutable(prev_block=None,
token_ids=token_ids,
device=Device.CPU)
allocator.allocate_immutable_block(prev_block=None,
token_ids=token_ids,
device=Device.CPU)
for token_ids in cpu_token_ids
]
assert allocator.get_num_free_blocks(Device.CPU) == 0
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
gpu_blocks = [
allocator.allocate_immutable(prev_block=None,
token_ids=token_ids,
device=Device.GPU)
allocator.allocate_immutable_block(prev_block=None,
token_ids=token_ids,
device=Device.GPU)
for token_ids in gpu_token_ids
]
assert allocator.get_num_free_blocks(Device.CPU) == 0
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment