Commit 705f6a35 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.5.2' into v0.5.2-dtk24.04.1

parents af837396 4cf256ae
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
# Dependencies for NVIDIA GPUs # Dependencies for NVIDIA GPUs
ray >= 2.9 ray >= 2.9
nvidia-ml-py # for pynvml package nvidia-ml-py # for pynvml package
torch == 2.3.0 torch == 2.3.1
xformers == 0.0.26.post1 # Requires PyTorch 2.3.0 # These must be updated alongside torch
vllm-flash-attn == 2.5.9 # Requires PyTorch 2.3.0 torchvision == 0.18.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
xformers == 0.0.27 # Requires PyTorch 2.3.1
vllm-flash-attn == 2.5.9.post1 # Requires PyTorch 2.3.1
# formatting -r requirements-lint.txt
yapf==0.32.0 -r requirements-test.txt
toml==0.10.2
tomli==2.0.1
ruff==0.1.5
codespell==2.2.6
isort==5.13.2
clang-format==18.1.5
# type checking # Avoid adding requirements directly to this file.
mypy==1.9.0 # Instead, modify the two files referenced above.
types-PyYAML
types-requests
types-setuptools
# testing
pytest
tensorizer>=2.9.0
pytest-forked
pytest-asyncio
pytest-rerunfailures
pytest-shard
# testing utils
awscli
einops # required for MPT
httpx
peft
requests
ray
sentence-transformers # required for embedding
# Benchmarking
aiohttp
# quantization
bitsandbytes==0.42.0
# formatting
yapf==0.32.0
toml==0.10.2
tomli==2.0.1
ruff==0.1.5
codespell==2.3.0
isort==5.13.2
clang-format==18.1.5
# type checking
mypy==1.9.0
types-PyYAML
types-requests
types-setuptools
# Mamba dependencies
mamba-ssm>=1.2.2
causal-conv1d>=1.2.0
# Common dependencies
-r requirements-common.txt
# OpenVINO dependencies
torch >= 2.1.2
openvino ~= 2024.3.0.dev
optimum-intel[openvino] >= 1.18.1
triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.
...@@ -2,6 +2,5 @@ ...@@ -2,6 +2,5 @@
-r requirements-common.txt -r requirements-common.txt
# Dependencies for AMD GPUs # Dependencies for AMD GPUs
ray == 2.9.1 ray >= 2.10.0
# ray >= 2.10.0
pytest-asyncio pytest-asyncio
# testing
pytest
tensorizer>=2.9.0
pytest-forked
pytest-asyncio
pytest-rerunfailures
pytest-shard
# testing utils
awscli
einops # required for MPT
httpx
peft
requests
ray
sentence-transformers # required for embedding
sparseml==1.8.0 # required for compressed-tensors
compressed-tensors==0.4.0 # required for compressed-tensors
# Benchmarking
aiohttp
# quantization
bitsandbytes==0.42.0
\ No newline at end of file
# Common dependencies
-r requirements-common.txt
# Dependencies for TPU
# Currently, the TPU backend uses a nightly version of PyTorch XLA.
# You can install the dependencies in Dockerfile.tpu.
triton # To avoid import errors
# Common dependencies
-r requirements-common.txt
setuptools < 70.0.0 # IPEX's torch build depends on an older setuptools; to be removed.
torch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/torch-2.1.0.post1%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl
intel_extension_for_pytorch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl
oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.1.200%2Bxpu-cp310-cp310-linux_x86_64.whl
triton @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
...@@ -5,6 +5,7 @@ import os ...@@ -5,6 +5,7 @@ import os
import re import re
import subprocess import subprocess
import sys import sys
import warnings
from shutil import which from shutil import which
from typing import Dict, List from typing import Dict, List
...@@ -30,6 +31,34 @@ def load_module_from_path(module_name, path): ...@@ -30,6 +31,34 @@ def load_module_from_path(module_name, path):
ROOT_DIR = os.path.dirname(__file__) ROOT_DIR = os.path.dirname(__file__)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def embed_commit_hash():
    """Record the current commit hash in ``vllm/commit_id.py``.

    The hash is read from the ``BUILDKITE_COMMIT`` environment variable
    when it is set (CI builds); otherwise it is obtained by running
    ``git rev-parse HEAD``. The generated module contains a single
    ``__commit__`` assignment.

    This is best-effort: any failure is reported as a ``RuntimeWarning``
    and never aborts the build.
    """
    try:
        # CI build: the commit is provided through the environment.
        commit_id = os.environ.get("BUILDKITE_COMMIT")
        if commit_id is None:
            # Local build: ask git for the checked-out commit.
            commit_id = subprocess.check_output(
                ["git", "rev-parse", "HEAD"],
                encoding="utf-8",
            ).strip()

        target_path = os.path.join(ROOT_DIR, "vllm", "commit_id.py")
        with open(target_path, "w", encoding="utf-8") as out:
            out.write(f'__commit__ = "{commit_id}"\n')
    except subprocess.CalledProcessError as err:
        # git ran but exited with a non-zero status.
        warnings.warn(f"Failed to get commit hash:\n{err}",
                      RuntimeWarning,
                      stacklevel=2)
    except Exception as err:
        # Anything else (git missing, file not writable, ...).
        warnings.warn(f"Failed to embed commit hash:\n{err}",
                      RuntimeWarning,
                      stacklevel=2)
embed_commit_hash()
# cannot import envs directly because it depends on vllm, # cannot import envs directly because it depends on vllm,
# which is not installed yet # which is not installed yet
envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py')) envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
...@@ -144,6 +173,7 @@ class cmake_build_ext(build_ext): ...@@ -144,6 +173,7 @@ class cmake_build_ext(build_ext):
cmake_args += [ cmake_args += [
'-DCMAKE_CXX_COMPILER_LAUNCHER=sccache', '-DCMAKE_CXX_COMPILER_LAUNCHER=sccache',
'-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache', '-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache',
'-DCMAKE_C_COMPILER_LAUNCHER=sccache',
] ]
elif is_ccache_available(): elif is_ccache_available():
cmake_args += [ cmake_args += [
...@@ -175,7 +205,6 @@ class cmake_build_ext(build_ext): ...@@ -175,7 +205,6 @@ class cmake_build_ext(build_ext):
else: else:
# Default build tool to whatever cmake picks. # Default build tool to whatever cmake picks.
build_tool = [] build_tool = []
subprocess.check_call( subprocess.check_call(
['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args], ['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args],
cwd=self.build_temp) cwd=self.build_temp)
...@@ -210,9 +239,9 @@ class cmake_build_ext(build_ext): ...@@ -210,9 +239,9 @@ class cmake_build_ext(build_ext):
def _is_cuda() -> bool: def _is_cuda() -> bool:
return VLLM_TARGET_DEVICE == "cuda" \ has_cuda = torch.version.cuda is not None
and torch.version.cuda is not None \ return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
and not _is_neuron() and not (_is_neuron() or _is_tpu()))
def _is_hip() -> bool: def _is_hip() -> bool:
...@@ -229,10 +258,26 @@ def _is_neuron() -> bool: ...@@ -229,10 +258,26 @@ def _is_neuron() -> bool:
return torch_neuronx_installed or VLLM_TARGET_DEVICE == "neuron" return torch_neuronx_installed or VLLM_TARGET_DEVICE == "neuron"
def _is_tpu() -> bool:
    """Return True when the configured build target device is ``"tpu"``."""
    target = VLLM_TARGET_DEVICE
    return target == "tpu"
def _is_cpu() -> bool: def _is_cpu() -> bool:
return VLLM_TARGET_DEVICE == "cpu" return VLLM_TARGET_DEVICE == "cpu"
def _is_openvino() -> bool:
    """Return True when the configured build target device is ``"openvino"``."""
    target = VLLM_TARGET_DEVICE
    return target == "openvino"
def _is_xpu() -> bool:
    """Return True when the configured build target device is ``"xpu"``."""
    target = VLLM_TARGET_DEVICE
    return target == "xpu"
def _build_custom_ops() -> bool:
    """Return True when the target is CUDA, HIP, or CPU.

    The platform probes are tried in that order and evaluation stops at
    the first one that matches.
    """
    return any(probe() for probe in (_is_cuda, _is_hip, _is_cpu))
def _install_punica() -> bool: def _install_punica() -> bool:
return envs.VLLM_INSTALL_PUNICA_KERNELS return envs.VLLM_INSTALL_PUNICA_KERNELS
...@@ -350,8 +395,8 @@ def get_version_add(sha: Optional[str] = None) -> str: ...@@ -350,8 +395,8 @@ def get_version_add(sha: Optional[str] = None) -> str:
version += ".torch" + torch.__version__[:5] version += ".torch" + torch.__version__[:5]
with open(add_version_path, encoding="utf-8",mode="w") as file: with open(add_version_path, encoding="utf-8",mode="w") as file:
file.write("__version__='0.5.0'\n") file.write("__version__='0.5.2'\n")
file.write("__dcu_version__='0.5.0+{}'\n".format(version)) file.write("__dcu_version__='0.5.2+{}'\n".format(version))
file.close() file.close()
...@@ -364,7 +409,7 @@ def get_version(): ...@@ -364,7 +409,7 @@ def get_version():
def get_vllm_version() -> str: def get_vllm_version() -> str:
version = find_version(get_path("vllm", "__init__.py")) # version = find_version(get_path("vllm", "version.py"))
if _is_cuda(): if _is_cuda():
cuda_version = str(get_nvcc_cuda_version()) cuda_version = str(get_nvcc_cuda_version())
...@@ -384,8 +429,14 @@ def get_vllm_version() -> str: ...@@ -384,8 +429,14 @@ def get_vllm_version() -> str:
if neuron_version != MAIN_CUDA_VERSION: if neuron_version != MAIN_CUDA_VERSION:
neuron_version_str = neuron_version.replace(".", "")[:3] neuron_version_str = neuron_version.replace(".", "")[:3]
version += f"+neuron{neuron_version_str}" version += f"+neuron{neuron_version_str}"
elif _is_openvino():
version += "+openvino"
elif _is_tpu():
version += "+tpu"
elif _is_cpu(): elif _is_cpu():
version += "+cpu" version += "+cpu"
elif _is_xpu():
version += "+xpu"
else: else:
raise RuntimeError("Unknown runtime environment") raise RuntimeError("Unknown runtime environment")
...@@ -431,11 +482,18 @@ def get_requirements() -> List[str]: ...@@ -431,11 +482,18 @@ def get_requirements() -> List[str]:
requirements = _read_requirements("requirements-rocm.txt") requirements = _read_requirements("requirements-rocm.txt")
elif _is_neuron(): elif _is_neuron():
requirements = _read_requirements("requirements-neuron.txt") requirements = _read_requirements("requirements-neuron.txt")
elif _is_openvino():
requirements = _read_requirements("requirements-openvino.txt")
elif _is_tpu():
requirements = _read_requirements("requirements-tpu.txt")
elif _is_cpu(): elif _is_cpu():
requirements = _read_requirements("requirements-cpu.txt") requirements = _read_requirements("requirements-cpu.txt")
elif _is_xpu():
requirements = _read_requirements("requirements-xpu.txt")
else: else:
raise ValueError( raise ValueError(
"Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.") "Unsupported platform, please use CUDA, ROCm, Neuron, "
"OpenVINO, or CPU.")
return requirements return requirements
...@@ -444,7 +502,7 @@ ext_modules = [] ...@@ -444,7 +502,7 @@ ext_modules = []
if _is_cuda() or _is_hip(): if _is_cuda() or _is_hip():
ext_modules.append(CMakeExtension(name="vllm._moe_C")) ext_modules.append(CMakeExtension(name="vllm._moe_C"))
if not _is_neuron(): if _build_custom_ops():
ext_modules.append(CMakeExtension(name="vllm._C")) ext_modules.append(CMakeExtension(name="vllm._C"))
if _install_punica(): if _install_punica():
...@@ -487,6 +545,11 @@ setup( ...@@ -487,6 +545,11 @@ setup(
extras_require={ extras_require={
"tensorizer": ["tensorizer>=2.9.0"], "tensorizer": ["tensorizer>=2.9.0"],
}, },
cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {}, cmdclass={"build_ext": cmake_build_ext} if _build_custom_ops() else {},
package_data=package_data, package_data=package_data,
entry_points={
"console_scripts": [
"vllm=vllm.scripts:main",
],
},
) )
"""vllm.entrypoints.api_server with some extra logging for testing.""" """vllm.entrypoints.api_server with some extra logging for testing."""
import argparse
from typing import Any, Dict from typing import Any, Dict
import uvicorn import uvicorn
...@@ -8,6 +7,7 @@ from fastapi.responses import JSONResponse, Response ...@@ -8,6 +7,7 @@ from fastapi.responses import JSONResponse, Response
import vllm.entrypoints.api_server import vllm.entrypoints.api_server
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.utils import FlexibleArgumentParser
app = vllm.entrypoints.api_server.app app = vllm.entrypoints.api_server.app
...@@ -33,7 +33,7 @@ def stats() -> Response: ...@@ -33,7 +33,7 @@ def stats() -> Response:
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = FlexibleArgumentParser()
parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000) parser.add_argument("--port", type=int, default=8000)
parser = AsyncEngineArgs.add_cli_args(parser) parser = AsyncEngineArgs.add_cli_args(parser)
......
...@@ -2,8 +2,13 @@ import asyncio ...@@ -2,8 +2,13 @@ import asyncio
from dataclasses import dataclass from dataclasses import dataclass
import pytest import pytest
import torch
from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm import SamplingParams
from vllm.config import ParallelConfig
from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
from ..utils import wait_for_gpu_memory_to_clear
@dataclass @dataclass
...@@ -19,8 +24,11 @@ class MockEngine: ...@@ -19,8 +24,11 @@ class MockEngine:
self.add_request_calls = 0 self.add_request_calls = 0
self.abort_request_calls = 0 self.abort_request_calls = 0
self.request_id = None self.request_id = None
# Ugly, remove dependency when possible
self.parallel_config = ParallelConfig(1, 1, False)
async def step_async(self): async def step_async(self, virtual_engine):
# PP size is 1, ignore virtual engine
self.step_calls += 1 self.step_calls += 1
return [RequestOutput( return [RequestOutput(
request_id=self.request_id)] if self.request_id else [] request_id=self.request_id)] if self.request_id else []
...@@ -28,6 +36,9 @@ class MockEngine: ...@@ -28,6 +36,9 @@ class MockEngine:
async def process_model_inputs_async(self, *args, **kwargs): async def process_model_inputs_async(self, *args, **kwargs):
pass pass
async def stop_remote_worker_execution_loop_async(self):
    """No-op stand-in: the mock engine has no remote workers to stop."""
    return None
def generate(self, request_id): def generate(self, request_id):
self.request_id = request_id self.request_id = request_id
...@@ -37,6 +48,7 @@ class MockEngine: ...@@ -37,6 +48,7 @@ class MockEngine:
def add_request(self, **kwargs): def add_request(self, **kwargs):
del kwargs # Unused del kwargs # Unused
self.add_request_calls += 1 self.add_request_calls += 1
print(f'Request calls: {self.add_request_calls}')
async def add_request_async(self, **kwargs): async def add_request_async(self, **kwargs):
self.add_request_calls += 1 self.add_request_calls += 1
...@@ -49,6 +61,9 @@ class MockEngine: ...@@ -49,6 +61,9 @@ class MockEngine:
def has_unfinished_requests(self): def has_unfinished_requests(self):
return self.request_id is not None return self.request_id is not None
def has_unfinished_requests_for_virtual_engine(self, virtual_engine):
    """Report whether a request id is currently set on the mock.

    ``virtual_engine`` is accepted for interface compatibility and is
    not used.
    """
    pending_id = self.request_id
    return pending_id is not None
class MockAsyncLLMEngine(AsyncLLMEngine): class MockAsyncLLMEngine(AsyncLLMEngine):
...@@ -72,6 +87,7 @@ async def test_new_requests_event(): ...@@ -72,6 +87,7 @@ async def test_new_requests_event():
engine.engine.generate("2") engine.engine.generate("2")
await asyncio.sleep(0) await asyncio.sleep(0)
await asyncio.sleep(0) await asyncio.sleep(0)
await asyncio.sleep(0)
assert engine.engine.add_request_calls == 2 assert engine.engine.add_request_calls == 2
assert engine.engine.step_calls >= 2 assert engine.engine.step_calls >= 2
await asyncio.sleep(0.001) await asyncio.sleep(0.001)
...@@ -94,3 +110,35 @@ async def test_new_requests_event(): ...@@ -94,3 +110,35 @@ async def test_new_requests_event():
assert engine.get_model_config() is not None assert engine.get_model_config() is not None
assert engine.get_tokenizer() is not None assert engine.get_tokenizer() is not None
assert engine.get_decoding_config() is not None assert engine.get_decoding_config() is not None
def test_asyncio_run():
    """Drive two concurrent generations through ``asyncio.run``.

    Waits for GPU memory to drop below 2 GiB on every visible device,
    builds an ``AsyncLLMEngine`` for ``facebook/opt-125m``, and checks
    that gathering two ``generate`` streams yields two results.
    """
    visible_devices = list(range(torch.cuda.device_count()))
    wait_for_gpu_memory_to_clear(
        devices=visible_devices,
        threshold_bytes=2 * 2**30,
        timeout_s=60,
    )

    engine_args = AsyncEngineArgs(model="facebook/opt-125m")
    engine = AsyncLLMEngine.from_engine_args(engine_args)

    async def run(prompt: str):
        # Greedy sampling; the prompt doubles as the request id.
        sampling_params = SamplingParams(temperature=0, max_tokens=32)
        stream = engine.generate(prompt, sampling_params, request_id=prompt)
        async for output in stream:
            final_output = output
        # Only the last streamed output is returned.
        return final_output

    async def generate():
        pending = [run("test0"), run("test1")]
        return await asyncio.gather(*pending)

    results = asyncio.run(generate())
    assert len(results) == 2
import openai # use the official client for correctness check import openai # use the official client for correctness check
import pytest import pytest
# using Ray for overall ease of process management, parallel requests,
# and debugging.
import ray
from ..utils import ServerRunner from ..utils import RemoteOpenAIServer
# any model with a chat template should work here # any model with a chat template should work here
MODEL_NAME = "facebook/opt-125m" MODEL_NAME = "facebook/opt-125m"
...@@ -12,34 +9,27 @@ MODEL_NAME = "facebook/opt-125m" ...@@ -12,34 +9,27 @@ MODEL_NAME = "facebook/opt-125m"
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
ray.init() with RemoteOpenAIServer([
server_runner = ServerRunner.remote([ "--model",
"--model", MODEL_NAME,
MODEL_NAME, # use half precision for speed and memory savings in CI environment
# use half precision for speed and memory savings in CI environment "--dtype",
"--dtype", "float16",
"float16", "--max-model-len",
"--max-model-len", "2048",
"2048", "--enforce-eager",
"--enforce-eager", "--engine-use-ray"
"--engine-use-ray" ]) as remote_server:
]) yield remote_server
ray.get(server_runner.ready.remote())
yield server_runner
ray.shutdown()
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def client(): def client(server):
client = openai.AsyncOpenAI( return server.get_async_client()
base_url="http://localhost:8000/v1",
api_key="token-abc123",
)
yield client
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_check_models(server, client: openai.AsyncOpenAI): async def test_check_models(client: openai.AsyncOpenAI):
models = await client.models.list() models = await client.models.list()
models = models.data models = models.data
served_model = models[0] served_model = models[0]
...@@ -48,7 +38,7 @@ async def test_check_models(server, client: openai.AsyncOpenAI): ...@@ -48,7 +38,7 @@ async def test_check_models(server, client: openai.AsyncOpenAI):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_single_completion(server, client: openai.AsyncOpenAI): async def test_single_completion(client: openai.AsyncOpenAI):
completion = await client.completions.create(model=MODEL_NAME, completion = await client.completions.create(model=MODEL_NAME,
prompt="Hello, my name is", prompt="Hello, my name is",
max_tokens=5, max_tokens=5,
...@@ -72,7 +62,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI): ...@@ -72,7 +62,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_single_chat_session(server, client: openai.AsyncOpenAI): async def test_single_chat_session(client: openai.AsyncOpenAI):
messages = [{ messages = [{
"role": "system", "role": "system",
"content": "you are a helpful assistant" "content": "you are a helpful assistant"
......
...@@ -8,12 +8,14 @@ import weakref ...@@ -8,12 +8,14 @@ import weakref
import pytest import pytest
from vllm import LLM from vllm import LLM
from vllm.utils import is_hip
from ..models.utils import check_outputs_equal
MODELS = [ MODELS = [
"facebook/opt-125m", "facebook/opt-125m",
"meta-llama/Llama-2-7b-hf", "meta-llama/Llama-2-7b-hf",
] ]
VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND"
def test_vllm_gc_ed(): def test_vllm_gc_ed():
...@@ -27,6 +29,7 @@ def test_vllm_gc_ed(): ...@@ -27,6 +29,7 @@ def test_vllm_gc_ed():
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [False, True]) @pytest.mark.parametrize("enforce_eager", [False, True])
...@@ -35,13 +38,16 @@ def test_models( ...@@ -35,13 +38,16 @@ def test_models(
vllm_runner, vllm_runner,
example_prompts, example_prompts,
model: str, model: str,
backend: str,
dtype: str, dtype: str,
max_tokens: int, max_tokens: int,
enforce_eager: bool, enforce_eager: bool,
) -> None: ) -> None:
backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
if backend_by_env_var == "FLASHINFER" and enforce_eager is False: if backend == "FLASHINFER" and is_hip():
pytest.skip("Skipping non-eager test for FlashInferBackend.") pytest.skip("Flashinfer does not support ROCm/HIP.")
os.environ["VLLM_ATTENTION_BACKEND"] = backend
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
...@@ -52,10 +58,9 @@ def test_models( ...@@ -52,10 +58,9 @@ def test_models(
gpu_memory_utilization=0.7) as vllm_model: gpu_memory_utilization=0.7) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
for i in range(len(example_prompts)): check_outputs_equal(
hf_output_ids, hf_output_str = hf_outputs[i] outputs_0_lst=hf_outputs,
vllm_output_ids, vllm_output_str = vllm_outputs[i] outputs_1_lst=vllm_outputs,
assert hf_output_str == vllm_output_str, ( name_0="hf",
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") name_1="vllm",
assert hf_output_ids == vllm_output_ids, ( )
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
...@@ -8,6 +8,8 @@ Run `pytest tests/models/test_chunked_prefill.py`. ...@@ -8,6 +8,8 @@ Run `pytest tests/models/test_chunked_prefill.py`.
""" """
import pytest import pytest
from ..models.utils import check_outputs_equal
MODELS = [ MODELS = [
"facebook/opt-125m", "facebook/opt-125m",
"meta-llama/Llama-2-7b-hf", "meta-llama/Llama-2-7b-hf",
...@@ -54,10 +56,9 @@ def test_models( ...@@ -54,10 +56,9 @@ def test_models(
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
for i in range(len(example_prompts)): check_outputs_equal(
hf_output_ids, hf_output_str = hf_outputs[i] outputs_0_lst=hf_outputs,
vllm_output_ids, vllm_output_str = vllm_outputs[i] outputs_1_lst=vllm_outputs,
assert hf_output_str == vllm_output_str, ( name_0="hf",
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") name_1="vllm",
assert hf_output_ids == vllm_output_ids, ( )
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
...@@ -12,6 +12,8 @@ from vllm import SamplingParams ...@@ -12,6 +12,8 @@ from vllm import SamplingParams
from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT, from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
ENABLE_ARTIFICIAL_PREEMPT) ENABLE_ARTIFICIAL_PREEMPT)
from ..models.utils import check_outputs_equal
MODELS = [ MODELS = [
"facebook/opt-125m", "facebook/opt-125m",
] ]
...@@ -54,8 +56,8 @@ def test_chunked_prefill_recompute( ...@@ -54,8 +56,8 @@ def test_chunked_prefill_recompute(
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
ARTIFICIAL_PREEMPTION_MAX_CNT) < ARTIFICIAL_PREEMPTION_MAX_CNT)
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i] hf_output_ids, hf_output_str = hf_outputs[i]
...@@ -89,18 +91,18 @@ def test_preemption( ...@@ -89,18 +91,18 @@ def test_preemption(
disable_log_stats=False, disable_log_stats=False,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
ARTIFICIAL_PREEMPTION_MAX_CNT) < ARTIFICIAL_PREEMPTION_MAX_CNT)
total_preemption = ( total_preemption = (
vllm_model.model.llm_engine.scheduler.num_cumulative_preemption) vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
vllm_output_ids, vllm_output_str = vllm_outputs[i]
assert hf_output_str == vllm_output_str, (
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
assert hf_output_ids == vllm_output_ids, (
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
assert ("is preempted by PreemptionMode.RECOMPUTE mode because there " assert ("is preempted by PreemptionMode.RECOMPUTE mode because there "
"is not enough KV cache space." in caplog_vllm.text) "is not enough KV cache space." in caplog_vllm.text)
# Ensure the count bucket of request-level histogram metrics matches # Ensure the count bucket of request-level histogram metrics matches
...@@ -145,10 +147,10 @@ def test_swap( ...@@ -145,10 +147,10 @@ def test_swap(
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_beam_search(example_prompts, vllm_outputs = vllm_model.generate_beam_search(example_prompts,
beam_width, max_tokens) beam_width, max_tokens)
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
ARTIFICIAL_PREEMPTION_MAX_CNT) < ARTIFICIAL_PREEMPTION_MAX_CNT)
total_preemption = ( total_preemption = (
vllm_model.model.llm_engine.scheduler.num_cumulative_preemption) vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
hf_output_ids, _ = hf_outputs[i] hf_output_ids, _ = hf_outputs[i]
...@@ -212,8 +214,8 @@ def test_swap_infeasible( ...@@ -212,8 +214,8 @@ def test_swap_infeasible(
example_prompts, example_prompts,
sampling_params=sampling_params, sampling_params=sampling_params,
) )
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
ARTIFICIAL_PREEMPTION_MAX_CNT) < ARTIFICIAL_PREEMPTION_MAX_CNT)
# Verify the request is ignored and not hang. # Verify the request is ignored and not hang.
assert req_outputs[0].outputs[0].finish_reason == "length" assert req_outputs[0].outputs[0].finish_reason == "length"
...@@ -250,8 +252,8 @@ def test_preemption_infeasible( ...@@ -250,8 +252,8 @@ def test_preemption_infeasible(
sampling_params=sampling_params, sampling_params=sampling_params,
) )
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
ARTIFICIAL_PREEMPTION_MAX_CNT) < ARTIFICIAL_PREEMPTION_MAX_CNT)
# Verify the request is ignored and not hang. # Verify the request is ignored and not hang.
for req_output in req_outputs: for req_output in req_outputs:
......
import contextlib import contextlib
import gc import gc
import os import os
import subprocess
import sys import sys
from typing import Any, Dict, List, Optional, Tuple, TypeVar from collections import UserList
from dataclasses import dataclass
from functools import cached_property
from pathlib import Path
from typing import (Any, Dict, List, Literal, Optional, Tuple, TypedDict,
TypeVar)
import pytest import pytest
import torch import torch
...@@ -11,17 +15,17 @@ import torch.nn as nn ...@@ -11,17 +15,17 @@ import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
from PIL import Image from PIL import Image
from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq, from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq,
AutoProcessor, AutoTokenizer, BatchEncoding) AutoTokenizer, BatchEncoding)
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.config import TokenizerPoolConfig, VisionLanguageConfig from vllm.config import TokenizerPoolConfig
from vllm.distributed import destroy_model_parallel from vllm.distributed import (destroy_distributed_environment,
destroy_model_parallel)
from vllm.inputs import TextPrompt from vllm.inputs import TextPrompt
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.multimodal import MultiModalData from vllm.multimodal.utils import fetch_image
from vllm.multimodal.image import ImageFeatureData, ImagePixelData
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from vllm.utils import is_cpu from vllm.utils import cuda_device_count_stateless, is_cpu
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -29,21 +33,8 @@ _TEST_DIR = os.path.dirname(__file__) ...@@ -29,21 +33,8 @@ _TEST_DIR = os.path.dirname(__file__)
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")] _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
# Multi modal related _IMAGE_DIR = Path(_TEST_DIR) / "images"
# You can use `.buildkite/download-images.sh` to download the assets """You can use `.buildkite/download-images.sh` to download the assets."""
PIXEL_VALUES_FILES = [
os.path.join(_TEST_DIR, "images", filename) for filename in
["stop_sign_pixel_values.pt", "cherry_blossom_pixel_values.pt"]
]
IMAGE_FEATURES_FILES = [
os.path.join(_TEST_DIR, "images", filename) for filename in
["stop_sign_image_features.pt", "cherry_blossom_image_features.pt"]
]
IMAGE_FILES = [
os.path.join(_TEST_DIR, "images", filename)
for filename in ["stop_sign.jpg", "cherry_blossom.jpg"]
]
assert len(PIXEL_VALUES_FILES) == len(IMAGE_FEATURES_FILES) == len(IMAGE_FILES)
def _read_prompts(filename: str) -> List[str]: def _read_prompts(filename: str) -> List[str]:
...@@ -52,8 +43,65 @@ def _read_prompts(filename: str) -> List[str]: ...@@ -52,8 +43,65 @@ def _read_prompts(filename: str) -> List[str]:
return prompts return prompts
@dataclass(frozen=True)
class ImageAsset:
    """A named test image that is loaded lazily as a PIL image."""

    # Identifier of the asset; also the stem of the local ``.jpg`` file.
    name: Literal["stop_sign", "cherry_blossom", "boardwalk"]

    @cached_property
    def pil_image(self) -> Image.Image:
        """Load the image on first access and cache it on the instance.

        ``boardwalk`` is fetched from a fixed URL; every other asset is
        opened from ``_IMAGE_DIR``.
        """
        if self.name != "boardwalk":
            return Image.open(_IMAGE_DIR / f"{self.name}.jpg")

        return fetch_image(
            "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
        )
class _ImageAssetPrompts(TypedDict):
stop_sign: str
cherry_blossom: str
boardwalk: str
# Pick the base class for the asset list. The parametrized form is used
# only on Python 3.9+, because UserList cannot be subscripted before that.
if sys.version_info >= (3, 9):

    class _ImageAssetsBase(UserList[ImageAsset]):
        pass
else:

    class _ImageAssetsBase(UserList):
        pass
class _ImageAssets(_ImageAssetsBase):
    """List of the test image assets in a fixed order.

    The order is ``stop_sign``, ``cherry_blossom``, ``boardwalk``.
    """

    def __init__(self) -> None:
        stop_sign = ImageAsset("stop_sign")
        cherry_blossom = ImageAsset("cherry_blossom")
        boardwalk = ImageAsset("boardwalk")
        super().__init__([stop_sign, cherry_blossom, boardwalk])

    def prompts(self, prompts: _ImageAssetPrompts) -> List[str]:
        """
        Convenience method to define the prompt for each test image.

        The prompts are returned in the same order in which the assets
        are listed in ``__init__``.
        """
        ordered = [
            prompts["stop_sign"],
            prompts["cherry_blossom"],
            prompts["boardwalk"],
        ]
        return ordered
IMAGE_ASSETS = _ImageAssets()
"""Singleton instance of :class:`_ImageAssets`."""
def cleanup(): def cleanup():
destroy_model_parallel() destroy_model_parallel()
destroy_distributed_environment()
with contextlib.suppress(AssertionError): with contextlib.suppress(AssertionError):
torch.distributed.destroy_process_group() torch.distributed.destroy_process_group()
gc.collect() gc.collect()
...@@ -81,31 +129,6 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool): ...@@ -81,31 +129,6 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
cleanup() cleanup()
@pytest.fixture(scope="session")
def hf_images() -> List[Image.Image]:
return [Image.open(filename) for filename in IMAGE_FILES]
@pytest.fixture()
def vllm_images(request) -> List[MultiModalData]:
    """Build the image inputs matching the requested model configuration.

    The second element of the ``model_and_config`` fixture value is read as
    the vision-language config. When its ``image_input_type`` is
    ``IMAGE_FEATURES``, the files in ``IMAGE_FEATURES_FILES`` are loaded with
    ``torch.load`` and wrapped in ``ImageFeatureData``; otherwise the files
    in ``IMAGE_FILES`` are opened and wrapped in ``ImagePixelData``.
    """
    model_and_config = request.getfixturevalue("model_and_config")
    vision_language_config = model_and_config[1]

    uses_image_features = (
        vision_language_config.image_input_type ==
        VisionLanguageConfig.ImageInputType.IMAGE_FEATURES)
    if not uses_image_features:
        return [
            ImagePixelData(Image.open(image_path))
            for image_path in IMAGE_FILES
        ]

    return [
        ImageFeatureData(torch.load(feature_path))
        for feature_path in IMAGE_FEATURES_FILES
    ]
@pytest.fixture()
def vllm_image_tensors(request) -> List[torch.Tensor]:
    """Load each file in ``PIXEL_VALUES_FILES`` with ``torch.load``, in order."""
    loaded_tensors: List[torch.Tensor] = []
    for tensor_path in PIXEL_VALUES_FILES:
        loaded_tensors.append(torch.load(tensor_path))
    return loaded_tensors
@pytest.fixture @pytest.fixture
def example_prompts() -> List[str]: def example_prompts() -> List[str]:
prompts = [] prompts = []
...@@ -122,6 +145,11 @@ def example_long_prompts() -> List[str]: ...@@ -122,6 +145,11 @@ def example_long_prompts() -> List[str]:
return prompts return prompts
@pytest.fixture(scope="session")
def image_assets() -> _ImageAssets:
    """Session-scoped fixture that returns the module-level ``IMAGE_ASSETS``."""
    return IMAGE_ASSETS
_STR_DTYPE_TO_TORCH_DTYPE = { _STR_DTYPE_TO_TORCH_DTYPE = {
"half": torch.half, "half": torch.half,
"bfloat16": torch.bfloat16, "bfloat16": torch.bfloat16,
...@@ -144,8 +172,10 @@ class HfRunner: ...@@ -144,8 +172,10 @@ class HfRunner:
model_name: str, model_name: str,
dtype: str = "half", dtype: str = "half",
*, *,
model_kwargs: Optional[Dict[str, Any]] = None,
is_embedding_model: bool = False, is_embedding_model: bool = False,
is_vision_model: bool = False, is_vision_model: bool = False,
is_sparseml_model: bool = False,
) -> None: ) -> None:
assert dtype in _STR_DTYPE_TO_TORCH_DTYPE assert dtype in _STR_DTYPE_TO_TORCH_DTYPE
torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype] torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
...@@ -163,14 +193,19 @@ class HfRunner: ...@@ -163,14 +193,19 @@ class HfRunner:
else: else:
if is_vision_model: if is_vision_model:
auto_cls = AutoModelForVision2Seq auto_cls = AutoModelForVision2Seq
elif is_sparseml_model:
from sparseml.transformers import SparseAutoModelForCausalLM
auto_cls = SparseAutoModelForCausalLM
else: else:
auto_cls = AutoModelForCausalLM auto_cls = AutoModelForCausalLM
model_kwargs = model_kwargs if model_kwargs is not None else {}
self.model = self.wrap_device( self.model = self.wrap_device(
auto_cls.from_pretrained( auto_cls.from_pretrained(
model_name, model_name,
torch_dtype=torch_dtype, torch_dtype=torch_dtype,
trust_remote_code=True, trust_remote_code=True,
**model_kwargs,
)) ))
self.tokenizer = AutoTokenizer.from_pretrained( self.tokenizer = AutoTokenizer.from_pretrained(
...@@ -180,6 +215,9 @@ class HfRunner: ...@@ -180,6 +215,9 @@ class HfRunner:
) )
try: try:
# don't put this import at the top level
# it will call torch.cuda.device_count()
from transformers import AutoProcessor # noqa: F401
self.processor = AutoProcessor.from_pretrained( self.processor = AutoProcessor.from_pretrained(
model_name, model_name,
torch_dtype=torch_dtype, torch_dtype=torch_dtype,
...@@ -195,7 +233,7 @@ class HfRunner: ...@@ -195,7 +233,7 @@ class HfRunner:
self, self,
prompts: List[str], prompts: List[str],
images: Optional[List[Image.Image]] = None, images: Optional[List[Image.Image]] = None,
**kwargs, **kwargs: Any,
) -> List[Tuple[List[List[int]], List[str]]]: ) -> List[Tuple[List[List[int]], List[str]]]:
if images: if images:
assert len(prompts) == len(images) assert len(prompts) == len(images)
...@@ -230,11 +268,13 @@ class HfRunner: ...@@ -230,11 +268,13 @@ class HfRunner:
prompts: List[str], prompts: List[str],
max_tokens: int, max_tokens: int,
images: Optional[List[Image.Image]] = None, images: Optional[List[Image.Image]] = None,
**kwargs: Any,
) -> List[Tuple[List[int], str]]: ) -> List[Tuple[List[int], str]]:
outputs = self.generate(prompts, outputs = self.generate(prompts,
do_sample=False, do_sample=False,
max_new_tokens=max_tokens, max_new_tokens=max_tokens,
images=images) images=images,
**kwargs)
return [(output_ids[0], output_str[0]) return [(output_ids[0], output_str[0])
for output_ids, output_str in outputs] for output_ids, output_str in outputs]
...@@ -264,19 +304,30 @@ class HfRunner: ...@@ -264,19 +304,30 @@ class HfRunner:
self, self,
prompts: List[str], prompts: List[str],
max_tokens: int, max_tokens: int,
images: Optional[List[Image.Image]] = None,
**kwargs: Any,
) -> List[List[torch.Tensor]]: ) -> List[List[torch.Tensor]]:
all_logprobs = [] all_logprobs: List[List[torch.Tensor]] = []
for prompt in prompts: for i, prompt in enumerate(prompts):
input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids processor_kwargs: Dict[str, Any] = {
"text": prompt,
"return_tensors": "pt",
}
if images is not None and images[i] is not None:
processor_kwargs["images"] = images[i]
inputs = self.processor(**processor_kwargs)
output = self.model.generate( output = self.model.generate(
self.wrap_device(input_ids), **self.wrap_device(inputs),
use_cache=True, use_cache=True,
do_sample=False, do_sample=False,
max_new_tokens=max_tokens, max_new_tokens=max_tokens,
output_hidden_states=True, output_hidden_states=True,
return_dict_in_generate=True, return_dict_in_generate=True,
**kwargs,
) )
seq_logprobs = [] seq_logprobs: List[torch.Tensor] = []
for hidden_states in output.hidden_states: for hidden_states in output.hidden_states:
last_hidden_states = hidden_states[-1][0] last_hidden_states = hidden_states[-1][0]
logits = torch.matmul( logits = torch.matmul(
...@@ -296,20 +347,32 @@ class HfRunner: ...@@ -296,20 +347,32 @@ class HfRunner:
prompts: List[str], prompts: List[str],
max_tokens: int, max_tokens: int,
num_logprobs: int, num_logprobs: int,
images: Optional[List[Image.Image]] = None,
**kwargs: Any,
) -> List[Tuple[List[int], str, List[Dict[int, float]]]]: ) -> List[Tuple[List[int], str, List[Dict[int, float]]]]:
all_logprobs: List[List[Dict[int, float]]] = [] all_logprobs: List[List[Dict[int, float]]] = []
all_output_ids: List[List[int]] = [] all_output_ids: List[List[int]] = []
all_output_strs: List[str] = [] all_output_strs: List[str] = []
for prompt in prompts: for i, prompt in enumerate(prompts):
input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids processor_kwargs: Dict[str, Any] = {
"text": prompt,
"return_tensors": "pt",
}
if images is not None and images[i] is not None:
processor_kwargs["images"] = images[i]
inputs = self.processor(**processor_kwargs)
input_ids = inputs.input_ids
output = self.model.generate( output = self.model.generate(
self.wrap_device(input_ids), **self.wrap_device(inputs),
use_cache=True, use_cache=True,
do_sample=False, do_sample=False,
max_new_tokens=max_tokens, max_new_tokens=max_tokens,
output_hidden_states=True, output_hidden_states=True,
return_dict_in_generate=True, return_dict_in_generate=True,
**kwargs,
) )
seq_logprobs: List[torch.Tensor] = [] seq_logprobs: List[torch.Tensor] = []
...@@ -362,7 +425,7 @@ class HfRunner: ...@@ -362,7 +425,7 @@ class HfRunner:
cleanup() cleanup()
@pytest.fixture @pytest.fixture(scope="session")
def hf_runner(): def hf_runner():
return HfRunner return HfRunner
...@@ -382,6 +445,7 @@ class VllmRunner: ...@@ -382,6 +445,7 @@ class VllmRunner:
block_size: int = 16, block_size: int = 16,
enable_chunked_prefill: bool = False, enable_chunked_prefill: bool = False,
swap_space: int = 4, swap_space: int = 4,
enforce_eager: bool = False,
**kwargs, **kwargs,
) -> None: ) -> None:
self.model = LLM( self.model = LLM(
...@@ -390,6 +454,7 @@ class VllmRunner: ...@@ -390,6 +454,7 @@ class VllmRunner:
trust_remote_code=True, trust_remote_code=True,
dtype=dtype, dtype=dtype,
swap_space=swap_space, swap_space=swap_space,
enforce_eager=enforce_eager,
disable_log_stats=disable_log_stats, disable_log_stats=disable_log_stats,
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
max_model_len=max_model_len, max_model_len=max_model_len,
...@@ -402,7 +467,7 @@ class VllmRunner: ...@@ -402,7 +467,7 @@ class VllmRunner:
self, self,
prompts: List[str], prompts: List[str],
sampling_params: SamplingParams, sampling_params: SamplingParams,
images: Optional[List[MultiModalData]] = None, images: Optional[List[Image.Image]] = None,
) -> List[Tuple[List[List[int]], List[str]]]: ) -> List[Tuple[List[List[int]], List[str]]]:
if images is not None: if images is not None:
assert len(prompts) == len(images) assert len(prompts) == len(images)
...@@ -410,7 +475,7 @@ class VllmRunner: ...@@ -410,7 +475,7 @@ class VllmRunner:
inputs = [TextPrompt(prompt=prompt) for prompt in prompts] inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
if images is not None: if images is not None:
for i, image in enumerate(images): for i, image in enumerate(images):
inputs[i]["multi_modal_data"] = image inputs[i]["multi_modal_data"] = {"image": image}
req_outputs = self.model.generate(inputs, req_outputs = self.model.generate(inputs,
sampling_params=sampling_params) sampling_params=sampling_params)
...@@ -423,7 +488,7 @@ class VllmRunner: ...@@ -423,7 +488,7 @@ class VllmRunner:
req_sample_output_strs: List[str] = [] req_sample_output_strs: List[str] = []
for sample in req_output.outputs: for sample in req_output.outputs:
output_str = sample.text output_str = sample.text
output_ids = sample.token_ids output_ids = list(sample.token_ids)
req_sample_output_ids.append(prompt_ids + output_ids) req_sample_output_ids.append(prompt_ids + output_ids)
req_sample_output_strs.append(prompt_str + output_str) req_sample_output_strs.append(prompt_str + output_str)
outputs.append((req_sample_output_ids, req_sample_output_strs)) outputs.append((req_sample_output_ids, req_sample_output_strs))
...@@ -433,10 +498,19 @@ class VllmRunner: ...@@ -433,10 +498,19 @@ class VllmRunner:
self, self,
prompts: List[str], prompts: List[str],
sampling_params: SamplingParams, sampling_params: SamplingParams,
images: Optional[List[Image.Image]] = None,
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
assert sampling_params.logprobs is not None assert sampling_params.logprobs is not None
req_outputs = self.model.generate(prompts, if images is not None:
assert len(prompts) == len(images)
inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
if images is not None:
for i, image in enumerate(images):
inputs[i]["multi_modal_data"] = {"image": image}
req_outputs = self.model.generate(inputs,
sampling_params=sampling_params) sampling_params=sampling_params)
outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = [] outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = []
for req_output in req_outputs: for req_output in req_outputs:
...@@ -451,7 +525,7 @@ class VllmRunner: ...@@ -451,7 +525,7 @@ class VllmRunner:
self, self,
prompts: List[str], prompts: List[str],
max_tokens: int, max_tokens: int,
images: Optional[List[MultiModalData]] = None, images: Optional[List[Image.Image]] = None,
) -> List[Tuple[List[int], str]]: ) -> List[Tuple[List[int], str]]:
greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
outputs = self.generate(prompts, greedy_params, images=images) outputs = self.generate(prompts, greedy_params, images=images)
...@@ -463,11 +537,14 @@ class VllmRunner: ...@@ -463,11 +537,14 @@ class VllmRunner:
prompts: List[str], prompts: List[str],
max_tokens: int, max_tokens: int,
num_logprobs: int, num_logprobs: int,
images: Optional[List[Image.Image]] = None,
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
greedy_logprobs_params = SamplingParams(temperature=0.0, greedy_logprobs_params = SamplingParams(temperature=0.0,
max_tokens=max_tokens, max_tokens=max_tokens,
logprobs=num_logprobs) logprobs=num_logprobs)
outputs = self.generate_w_logprobs(prompts, greedy_logprobs_params) outputs = self.generate_w_logprobs(prompts,
greedy_logprobs_params,
images=images)
return [(output_ids, output_str, output_logprobs) return [(output_ids, output_str, output_logprobs)
for output_ids, output_str, output_logprobs in outputs] for output_ids, output_str, output_logprobs in outputs]
...@@ -537,15 +614,4 @@ def num_gpus_available(): ...@@ -537,15 +614,4 @@ def num_gpus_available():
"""Get number of GPUs without initializing the CUDA context """Get number of GPUs without initializing the CUDA context
in current process.""" in current process."""
try: return cuda_device_count_stateless()
out = subprocess.run([
sys.executable, "-c",
"import torch; print(torch.cuda.device_count())"
],
capture_output=True,
check=True,
text=True)
except subprocess.CalledProcessError as e:
logger.warning("Failed to get number of GPUs.", exc_info=e)
return 0
return int(out.stdout.strip())
...@@ -477,3 +477,70 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator, ...@@ -477,3 +477,70 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
assert expected_token_ids == actual_token_ids assert expected_token_ids == actual_token_ids
assert baseline_token_ids == test_token_ids assert baseline_token_ids == test_token_ids
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Use a small model for a fast test.
        "model": "facebook/opt-125m",

        # Skip CUDA graph creation for a fast test.
        "enforce_eager": True,

        # Keep the number of blocks small so that eviction is hit quickly.
        "max_model_len": 48,
        "block_size": 16,
        "num_gpu_blocks_override": 3,

        # Test APC with the v2 block manager.
        "use_v2_block_manager": True,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
    "enable_prefix_caching": False
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
    "enable_prefix_caching": True,
}])
@pytest.mark.parametrize("seed", [1])
def test_auto_prefix_caching_after_evition_start(baseline_llm_generator,
                                                 test_llm_generator):
    """Verify that block manager v2 with automatic prefix caching (APC)
    keeps working correctly after eviction has started.

    Scenario under test: with APC enabled, all blocks are initially held by
    the native block allocator and are later managed by the evictor. If a
    cache hit lands on a block held by the evictor, that block can be reused;
    otherwise its KV cache has to be recomputed.

    The same prompts are generated with APC disabled (baseline) and with APC
    enabled, and the resulting token ids must be identical.
    """
    first_prompt = (
        "You are a helpful assistant. Please answer truthfully and write "
        "out your thinking step by step to be sure you get the right answer. "
        "If you make a mistake, attempt to correct it. who are you?")
    second_prompt = (
        "You are a helpful assistant. Please answer truthfully and write out "
        "your thinking step by step to be sure you get the right answer. You "
        "are helpful and harmless and you follow ethical guidelines. "
        "who are you?")
    prompts = [first_prompt, second_prompt]

    # Temperature 0.0 with a fixed output length of 10 tokens; EOS is
    # ignored so every request produces the full length.
    sampling_params = SamplingParams(
        temperature=0.0,
        ignore_eos=True,
        max_tokens=10,
    )

    print('Getting token ids with APC disabled')
    baseline_token_ids = get_token_ids_from_llm_generator(
        baseline_llm_generator, prompts, sampling_params)

    print('Getting token ids with APC enabled')
    test_token_ids = get_token_ids_from_llm_generator(
        test_llm_generator, prompts, sampling_params)

    # Compare request by request first so a failure points at one sequence.
    for expected_ids, actual_ids in zip(baseline_token_ids, test_token_ids):
        assert expected_ids == actual_ids

    # zip() stops at the shorter input, so also compare the full results to
    # catch a difference in the number of outputs.
    assert baseline_token_ids == test_token_ids
from typing import List
import pytest import pytest
from vllm.core.block.block_table import BlockTable from vllm.core.block.block_table import BlockTable
...@@ -28,7 +30,7 @@ def test_allocate_naive(block_size: int, sequence_len: int): ...@@ -28,7 +30,7 @@ def test_allocate_naive(block_size: int, sequence_len: int):
token_ids = list(range(sequence_len)) token_ids = list(range(sequence_len))
num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size))) num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
block_tables = [] block_tables: List[BlockTable] = []
for i in range(5): for i in range(5):
assert allocator.get_num_free_blocks( assert allocator.get_num_free_blocks(
device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc
...@@ -73,7 +75,7 @@ def test_allocate_prefix_caching(block_size: int, sequence_len: int): ...@@ -73,7 +75,7 @@ def test_allocate_prefix_caching(block_size: int, sequence_len: int):
num_immutable_blocks_per_alloc = len( num_immutable_blocks_per_alloc = len(
chunked_tokens) - num_mutable_blocks_per_alloc chunked_tokens) - num_mutable_blocks_per_alloc
block_tables = [] block_tables: List[BlockTable] = []
for alloc_i in range(1, 6): for alloc_i in range(1, 6):
block_tables.append( block_tables.append(
...@@ -268,7 +270,7 @@ def test_append_token_ids_correct_content(block_size: int, sequence_len: int, ...@@ -268,7 +270,7 @@ def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
) )
block_table.allocate(token_ids=token_ids, device=Device.GPU) block_table.allocate(token_ids=token_ids, device=Device.GPU)
appended_so_far = [] appended_so_far: List[int] = []
for append in chunk_list(token_ids_to_append, append_size): for append in chunk_list(token_ids_to_append, append_size):
block_table.append_token_ids(append) block_table.append_token_ids(append)
appended_so_far.extend(append) appended_so_far.extend(append)
...@@ -371,8 +373,9 @@ def test_cow(block_size: int, sequence_len: int, append_len: int, ...@@ -371,8 +373,9 @@ def test_cow(block_size: int, sequence_len: int, append_len: int,
block_size) - (sequence_len // block_size) block_size) - (sequence_len // block_size)
original_block_table.allocate(token_ids=token_ids, device=Device.GPU) original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
original_block_ids = original_block_table.physical_block_ids original_block_ids = original_block_table.physical_block_ids[:]
print("original_block_ids = {}".format(original_block_ids))
forked_block_table = original_block_table.fork() forked_block_table = original_block_table.fork()
# Expect no additional allocation (copy on _write_). # Expect no additional allocation (copy on _write_).
...@@ -455,7 +458,7 @@ def test_cow_lookahead_simple(block_size: int, sequence_len: int, ...@@ -455,7 +458,7 @@ def test_cow_lookahead_simple(block_size: int, sequence_len: int,
# Allocate lookahead slots. # Allocate lookahead slots.
original_block_table.ensure_num_empty_slots(lookahead_slots) original_block_table.ensure_num_empty_slots(lookahead_slots)
original_block_ids = original_block_table.physical_block_ids original_block_ids = original_block_table.physical_block_ids[:]
forked_block_table = original_block_table.fork() forked_block_table = original_block_table.fork()
......
...@@ -8,8 +8,8 @@ from vllm.utils import Device, chunk_list ...@@ -8,8 +8,8 @@ from vllm.utils import Device, chunk_list
@pytest.mark.parametrize("num_gpu_blocks", [1024]) @pytest.mark.parametrize("num_gpu_blocks", [1024])
@pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_allocate_mutable(num_cpu_blocks: int, num_gpu_blocks: int, def test_allocate_mutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
block_size: int, allocator_type: str): block_size: int, allocator_type: str):
allocator = CpuGpuBlockAllocator.create( allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type, allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks, num_gpu_blocks=num_gpu_blocks,
...@@ -21,14 +21,14 @@ def test_allocate_mutable(num_cpu_blocks: int, num_gpu_blocks: int, ...@@ -21,14 +21,14 @@ def test_allocate_mutable(num_cpu_blocks: int, num_gpu_blocks: int,
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
cpu_blocks = [ cpu_blocks = [
allocator.allocate_mutable(prev_block=None, device=Device.CPU) allocator.allocate_mutable_block(prev_block=None, device=Device.CPU)
for _ in range(num_cpu_blocks) for _ in range(num_cpu_blocks)
] ]
assert allocator.get_num_free_blocks(Device.CPU) == 0 assert allocator.get_num_free_blocks(Device.CPU) == 0
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
gpu_blocks = [ gpu_blocks = [
allocator.allocate_mutable(prev_block=None, device=Device.GPU) allocator.allocate_mutable_block(prev_block=None, device=Device.GPU)
for _ in range(num_gpu_blocks) for _ in range(num_gpu_blocks)
] ]
assert allocator.get_num_free_blocks(Device.CPU) == 0 assert allocator.get_num_free_blocks(Device.CPU) == 0
...@@ -47,8 +47,8 @@ def test_allocate_mutable(num_cpu_blocks: int, num_gpu_blocks: int, ...@@ -47,8 +47,8 @@ def test_allocate_mutable(num_cpu_blocks: int, num_gpu_blocks: int,
@pytest.mark.parametrize("num_gpu_blocks", [1024]) @pytest.mark.parametrize("num_gpu_blocks", [1024])
@pytest.mark.parametrize("block_size", [2]) @pytest.mark.parametrize("block_size", [2])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_allocate_immutable(num_cpu_blocks: int, num_gpu_blocks: int, def test_allocate_immutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
block_size: int, allocator_type: str): block_size: int, allocator_type: str):
allocator = CpuGpuBlockAllocator.create( allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type, allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks, num_gpu_blocks=num_gpu_blocks,
...@@ -67,18 +67,18 @@ def test_allocate_immutable(num_cpu_blocks: int, num_gpu_blocks: int, ...@@ -67,18 +67,18 @@ def test_allocate_immutable(num_cpu_blocks: int, num_gpu_blocks: int,
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
cpu_blocks = [ cpu_blocks = [
allocator.allocate_immutable(prev_block=None, allocator.allocate_immutable_block(prev_block=None,
token_ids=token_ids, token_ids=token_ids,
device=Device.CPU) device=Device.CPU)
for token_ids in cpu_token_ids for token_ids in cpu_token_ids
] ]
assert allocator.get_num_free_blocks(Device.CPU) == 0 assert allocator.get_num_free_blocks(Device.CPU) == 0
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
gpu_blocks = [ gpu_blocks = [
allocator.allocate_immutable(prev_block=None, allocator.allocate_immutable_block(prev_block=None,
token_ids=token_ids, token_ids=token_ids,
device=Device.GPU) device=Device.GPU)
for token_ids in gpu_token_ids for token_ids in gpu_token_ids
] ]
assert allocator.get_num_free_blocks(Device.CPU) == 0 assert allocator.get_num_free_blocks(Device.CPU) == 0
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment