Commit e7c1b7f3 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge branch 'v0.5.4-dtk24.04.1'

parents 7462218e 04c62b93
......@@ -2,10 +2,11 @@ cmake >= 3.21
ninja # For faster builds.
psutil
sentencepiece # Required for LLaMA tokenizer.
numpy
numpy < 2.0.0
requests
tqdm
py-cpuinfo
transformers >= 4.40.0 # Required for StarCoder2 & Llava, Llama 3.
transformers >= 4.43.2 # Required for Chameleon and Llama 3.1 hotfox.
tokenizers >= 0.19.1 # Required for Llama 3.
fastapi
aiohttp
......@@ -16,7 +17,8 @@ pillow # Required for image processing
prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.10.1
outlines >= 0.0.43 # Requires torch >= 2.1.0
lm-format-enforcer == 0.10.3
outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
typing_extensions
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
pyzmq
......@@ -2,5 +2,5 @@
-r requirements-common.txt
# Dependencies for x86_64 CPUs
torch == 2.3.1+cpu
triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.
\ No newline at end of file
torch == 2.4.0+cpu; platform_machine != "ppc64le"
torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch
......@@ -4,6 +4,8 @@
# Dependencies for NVIDIA GPUs
ray >= 2.9
nvidia-ml-py # for pynvml package
torch == 2.3.0
xformers == 0.0.26.post1 # Requires PyTorch 2.3.0
vllm-flash-attn == 2.5.9 # Requires PyTorch 2.3.0
torch == 2.4.0
# These must be updated alongside torch
torchvision == 0.19 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
xformers == 0.0.27.post2 # Requires PyTorch 2.4.0
vllm-flash-attn == 2.6.1 # Requires PyTorch 2.4.0
# Mamba dependencies
mamba-ssm>=1.2.2
causal-conv1d>=1.2.0
# Common dependencies
# -r requirements-common.txt
# TODO: remove temporary copy of all common dependencies once Optimum Intel will support Transformers >= 4.43.2
cmake >= 3.21
ninja # For faster builds.
psutil
sentencepiece # Required for LLaMA tokenizer.
numpy < 2.0.0
requests
tqdm
py-cpuinfo
transformers < 4.43
tokenizers >= 0.19.1 # Required for Llama 3.
fastapi
aiohttp
openai
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
pillow # Required for image processing
prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.10.3
outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
typing_extensions
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
pyzmq
# OpenVINO dependencies
torch >= 2.1.2
openvino ~= 2024.3.0.dev
openvino-tokenizers[transformers] ~= 2024.3.0.0.dev
optimum-intel[openvino] >= 1.18.1
......@@ -2,5 +2,9 @@
-r requirements-common.txt
# Dependencies for AMD GPUs
awscli
boto3
botocore
ray >= 2.10.0
peft
pytest-asyncio
# Needed for Ray accelerated DAG tests
-r requirements-adag.txt
# testing
pytest
tensorizer>=2.9.0
......@@ -14,9 +17,11 @@ peft
requests
ray
sentence-transformers # required for embedding
compressed-tensors==0.4.0 # required for compressed-tensors
timm # required for internvl test
# Benchmarking
aiohttp
# quantization
bitsandbytes==0.42.0
bitsandbytes==0.42.0
\ No newline at end of file
......@@ -4,4 +4,4 @@
# Dependencies for TPU
# Currently, the TPU backend uses a nightly version of PyTorch XLA.
# You can install the dependencies in Dockerfile.tpu.
triton # To avoid import errors
ray
# Common dependencies
-r requirements-common.txt
setuptools < 70.0.0 # IPEX's torch have some dependency. to be removed.
torch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/torch-2.1.0.post1%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl
intel_extension_for_pytorch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl
oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.1.200%2Bxpu-cp310-cp310-linux_x86_64.whl
triton @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
--- amd_hip_bf16.h 2024-02-06 18:28:58.268699142 +0000
+++ amd_hip_bf16.h.new 2024-02-06 18:28:31.988647133 +0000
@@ -90,10 +90,10 @@
#include "math_fwd.h" // ocml device functions
#if defined(__HIPCC_RTC__)
-#define __HOST_DEVICE__ __device__
+#define __HOST_DEVICE__ __device__ static
#else
#include <climits>
-#define __HOST_DEVICE__ __host__ __device__
+#define __HOST_DEVICE__ __host__ __device__ static inline
#endif
// Since we are using unsigned short to represent data in bfloat16, it can be of different sizes on
......@@ -5,6 +5,7 @@ import os
import re
import subprocess
import sys
import warnings
from shutil import which
from typing import Dict, List
......@@ -33,6 +34,34 @@ def load_module_from_path(module_name, path):
ROOT_DIR = os.path.dirname(__file__)
logger = logging.getLogger(__name__)
def embed_commit_hash():
try:
if "BUILDKITE_COMMIT" in os.environ:
# ci build
commit_id = os.environ["BUILDKITE_COMMIT"]
else:
commit_id = subprocess.check_output(["git", "rev-parse", "HEAD"],
encoding="utf-8").strip()
commit_contents = f'__commit__ = "{commit_id}"\n'
version_file = os.path.join(ROOT_DIR, "vllm", "commit_id.py")
with open(version_file, "w", encoding="utf-8") as f:
f.write(commit_contents)
except subprocess.CalledProcessError as e:
warnings.warn(f"Failed to get commit hash:\n{e}",
RuntimeWarning,
stacklevel=2)
except Exception as e:
warnings.warn(f"Failed to embed commit hash:\n{e}",
RuntimeWarning,
stacklevel=2)
embed_commit_hash()
# cannot import envs directly because it depends on vllm,
# which is not installed yet
envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
......@@ -159,9 +188,6 @@ class cmake_build_ext(build_ext):
# match.
cmake_args += ['-DVLLM_PYTHON_EXECUTABLE={}'.format(sys.executable)]
if _install_punica():
cmake_args += ['-DVLLM_INSTALL_PUNICA_KERNELS=ON']
#
# Setup parallelism and build tool
#
......@@ -240,12 +266,20 @@ def _is_cpu() -> bool:
return VLLM_TARGET_DEVICE == "cpu"
def _is_openvino() -> bool:
return VLLM_TARGET_DEVICE == "openvino"
def _is_xpu() -> bool:
return VLLM_TARGET_DEVICE == "xpu"
def _build_custom_ops() -> bool:
return _is_cuda() or _is_hip() or _is_cpu()
def _install_punica() -> bool:
return envs.VLLM_INSTALL_PUNICA_KERNELS
def _build_core_ext() -> bool:
return not _is_neuron() and not _is_tpu()
def get_hipcc_rocm_version():
......@@ -346,10 +380,26 @@ def get_version_add(sha: Optional[str] = None) -> str:
lines = file.readlines()
rocm_version=lines[0].replace(".", "")
version += ".dtk" + rocm_version
new_version_content = f"""
import warnings
try:
import vllm.commit_id
__commit__ = vllm.commit_id.__commit__
except Exception as e:
warnings.warn(f"Failed to read commit hash:\\n + str(e)",
RuntimeWarning,
stacklevel=2)
__commit__ = "COMMIT_HASH_PLACEHOLDER"
__version__ = "0.5.4"
__dcu_version__ = f'0.5.4+{version}'
"""
with open(add_version_path, encoding="utf-8",mode="w") as file:
file.write("__version__='0.5.0.post1'\n")
file.write("__dcu_version__='0.5.0.post1+{}'\n".format(version))
file.write(new_version_content)
file.close()
......@@ -382,10 +432,14 @@ def get_vllm_version() -> str:
if neuron_version != MAIN_CUDA_VERSION:
neuron_version_str = neuron_version.replace(".", "")[:3]
version += f"+neuron{neuron_version_str}"
elif _is_openvino():
version += "+openvino"
elif _is_tpu():
version += "+tpu"
elif _is_cpu():
version += "+cpu"
elif _is_xpu():
version += "+xpu"
else:
raise RuntimeError("Unknown runtime environment")
......@@ -431,27 +485,32 @@ def get_requirements() -> List[str]:
requirements = _read_requirements("requirements-rocm.txt")
elif _is_neuron():
requirements = _read_requirements("requirements-neuron.txt")
elif _is_openvino():
requirements = _read_requirements("requirements-openvino.txt")
elif _is_tpu():
requirements = _read_requirements("requirements-tpu.txt")
elif _is_cpu():
requirements = _read_requirements("requirements-cpu.txt")
elif _is_xpu():
requirements = _read_requirements("requirements-xpu.txt")
else:
raise ValueError(
"Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.")
"Unsupported platform, please use CUDA, ROCm, Neuron, "
"OpenVINO, or CPU.")
return requirements
ext_modules = []
if _build_core_ext():
ext_modules.append(CMakeExtension(name="vllm._core_C"))
if _is_cuda() or _is_hip():
ext_modules.append(CMakeExtension(name="vllm._moe_C"))
if _build_custom_ops():
ext_modules.append(CMakeExtension(name="vllm._C"))
if _install_punica():
ext_modules.append(CMakeExtension(name="vllm._punica_C"))
package_data = {
"vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"]
}
......@@ -478,6 +537,7 @@ setup(
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"License :: OSI Approved :: Apache Software License",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
......@@ -489,6 +549,11 @@ setup(
extras_require={
"tensorizer": ["tensorizer>=2.9.0"],
},
cmdclass={"build_ext": cmake_build_ext} if _build_custom_ops() else {},
cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {},
package_data=package_data,
)
entry_points={
"console_scripts": [
"vllm=vllm.scripts:main",
],
},
)
\ No newline at end of file
"""vllm.entrypoints.api_server with some extra logging for testing."""
import argparse
from typing import Any, Dict
import uvicorn
......@@ -8,6 +7,7 @@ from fastapi.responses import JSONResponse, Response
import vllm.entrypoints.api_server
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.utils import FlexibleArgumentParser
app = vllm.entrypoints.api_server.app
......@@ -33,7 +33,7 @@ def stats() -> Response:
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser = FlexibleArgumentParser()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
parser = AsyncEngineArgs.add_cli_args(parser)
......
......@@ -2,8 +2,13 @@ import asyncio
from dataclasses import dataclass
import pytest
import torch
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm import SamplingParams
from vllm.config import ParallelConfig
from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
from ..utils import wait_for_gpu_memory_to_clear
@dataclass
......@@ -19,8 +24,11 @@ class MockEngine:
self.add_request_calls = 0
self.abort_request_calls = 0
self.request_id = None
# Ugly, remove dependency when possible
self.parallel_config = ParallelConfig(1, 1, False)
async def step_async(self):
async def step_async(self, virtual_engine):
# PP size is 1, ignore virtual engine
self.step_calls += 1
return [RequestOutput(
request_id=self.request_id)] if self.request_id else []
......@@ -28,6 +36,9 @@ class MockEngine:
async def process_model_inputs_async(self, *args, **kwargs):
pass
async def stop_remote_worker_execution_loop_async(self):
pass
def generate(self, request_id):
self.request_id = request_id
......@@ -37,6 +48,7 @@ class MockEngine:
def add_request(self, **kwargs):
del kwargs # Unused
self.add_request_calls += 1
print(f'Request calls: {self.add_request_calls}')
async def add_request_async(self, **kwargs):
self.add_request_calls += 1
......@@ -49,6 +61,9 @@ class MockEngine:
def has_unfinished_requests(self):
return self.request_id is not None
def has_unfinished_requests_for_virtual_engine(self, virtual_engine):
return self.request_id is not None
class MockAsyncLLMEngine(AsyncLLMEngine):
......@@ -72,6 +87,7 @@ async def test_new_requests_event():
engine.engine.generate("2")
await asyncio.sleep(0)
await asyncio.sleep(0)
await asyncio.sleep(0)
assert engine.engine.add_request_calls == 2
assert engine.engine.step_calls >= 2
await asyncio.sleep(0.001)
......@@ -94,3 +110,35 @@ async def test_new_requests_event():
assert engine.get_model_config() is not None
assert engine.get_tokenizer() is not None
assert engine.get_decoding_config() is not None
def test_asyncio_run():
wait_for_gpu_memory_to_clear(
devices=list(range(torch.cuda.device_count())),
threshold_bytes=2 * 2**30,
timeout_s=60,
)
engine = AsyncLLMEngine.from_engine_args(
AsyncEngineArgs(model="facebook/opt-125m"))
async def run(prompt: str):
sampling_params = SamplingParams(
temperature=0,
max_tokens=32,
)
async for output in engine.generate(prompt,
sampling_params,
request_id=prompt):
final_output = output
return final_output
async def generate():
return await asyncio.gather(
run("test0"),
run("test1"),
)
results = asyncio.run(generate())
assert len(results) == 2
import os
import pathlib
from dataclasses import dataclass
import pytest
from vllm.entrypoints.chat_utils import load_chat_template
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.transformers_utils.tokenizer import get_tokenizer
chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath(
......@@ -50,24 +49,9 @@ TEST_MESSAGES = [
]
@dataclass
class MockTokenizer:
chat_template = None
@dataclass
class MockServingChat:
tokenizer: MockTokenizer
def test_load_chat_template():
# Testing chatml template
tokenizer = MockTokenizer()
mock_serving_chat = MockServingChat(tokenizer)
OpenAIServingChat._load_chat_template(mock_serving_chat,
chat_template=chatml_jinja_path)
template_content = tokenizer.chat_template
template_content = load_chat_template(chat_template=chatml_jinja_path)
# Test assertions
assert template_content is not None
......@@ -79,24 +63,16 @@ def test_load_chat_template():
def test_no_load_chat_template_filelike():
# Testing chatml template
template = "../../examples/does_not_exist"
tokenizer = MockTokenizer()
mock_serving_chat = MockServingChat(tokenizer)
with pytest.raises(ValueError, match="looks like a file path"):
OpenAIServingChat._load_chat_template(mock_serving_chat,
chat_template=template)
load_chat_template(chat_template=template)
def test_no_load_chat_template_literallike():
# Testing chatml template
template = "{{ messages }}"
tokenizer = MockTokenizer()
mock_serving_chat = MockServingChat(tokenizer)
OpenAIServingChat._load_chat_template(mock_serving_chat,
chat_template=template)
template_content = tokenizer.chat_template
template_content = load_chat_template(chat_template=template)
assert template_content == template
......@@ -108,9 +84,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
expected_output):
# Initialize the tokenizer
tokenizer = get_tokenizer(tokenizer_name=model)
mock_serving_chat = MockServingChat(tokenizer)
OpenAIServingChat._load_chat_template(mock_serving_chat,
chat_template=template)
template_content = load_chat_template(chat_template=template)
# Create a mock request object using keyword arguments
mock_request = ChatCompletionRequest(
......@@ -122,7 +96,8 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
result = tokenizer.apply_chat_template(
conversation=mock_request.messages,
tokenize=False,
add_generation_prompt=mock_request.add_generation_prompt)
add_generation_prompt=mock_request.add_generation_prompt,
chat_template=mock_request.chat_template or template_content)
# Test assertion
assert result == expected_output, (
......
import openai # use the official client for correctness check
import pytest
# using Ray for overall ease of process management, parallel requests,
# and debugging.
import ray
from ..utils import VLLM_PATH, RemoteOpenAIServer
from ..utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "facebook/opt-125m"
@pytest.fixture(scope="module")
def ray_ctx():
ray.init(runtime_env={"working_dir": VLLM_PATH})
yield
ray.shutdown()
@pytest.fixture(scope="module")
def server(ray_ctx):
return RemoteOpenAIServer([
"--model",
MODEL_NAME,
def server():
args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"float16",
......@@ -29,7 +17,10 @@ def server(ray_ctx):
"2048",
"--enforce-eager",
"--engine-use-ray"
])
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.fixture(scope="module")
......
......@@ -8,12 +8,14 @@ import weakref
import pytest
from vllm import LLM
from vllm.utils import is_hip
from ..models.utils import check_outputs_equal
MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
]
VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND"
def test_vllm_gc_ed():
......@@ -27,6 +29,7 @@ def test_vllm_gc_ed():
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [False, True])
......@@ -35,13 +38,16 @@ def test_models(
vllm_runner,
example_prompts,
model: str,
backend: str,
dtype: str,
max_tokens: int,
enforce_eager: bool,
) -> None:
backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
if backend_by_env_var == "FLASHINFER" and enforce_eager is False:
pytest.skip("Skipping non-eager test for FlashInferBackend.")
if backend == "FLASHINFER" and is_hip():
pytest.skip("Flashinfer does not support ROCm/HIP.")
os.environ["VLLM_ATTENTION_BACKEND"] = backend
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
......@@ -52,10 +58,9 @@ def test_models(
gpu_memory_utilization=0.7) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
vllm_output_ids, vllm_output_str = vllm_outputs[i]
assert hf_output_str == vllm_output_str, (
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
assert hf_output_ids == vllm_output_ids, (
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
......@@ -8,6 +8,8 @@ Run `pytest tests/models/test_chunked_prefill.py`.
"""
import pytest
from ..models.utils import check_outputs_equal
MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
......@@ -54,10 +56,9 @@ def test_models(
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
vllm_output_ids, vllm_output_str = vllm_outputs[i]
assert hf_output_str == vllm_output_str, (
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
assert hf_output_ids == vllm_output_ids, (
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
import pytest
from tests.quantization.utils import is_quant_method_supported
from ..utils import compare_two_settings
def test_cpu_offload():
compare_two_settings("meta-llama/Llama-2-7b-hf", [],
["--cpu-offload-gb", "4"])
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.")
def test_cpu_offload_fp8():
# Test quantization of an unquantized checkpoint
compare_two_settings("meta-llama/Meta-Llama-3-8B-Instruct",
["--quantization", "fp8"],
["--quantization", "fp8", "--cpu-offload-gb", "2"])
# Test loading a quantized checkpoint
compare_two_settings("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", [],
["--cpu-offload-gb", "2"])
@pytest.mark.skipif(not is_quant_method_supported("awq"),
reason="awq is not supported on this GPU type.")
def test_cpu_offload_awq():
compare_two_settings("casperhansen/llama-3-8b-instruct-awq", [],
["--cpu-offload-gb", "2"])
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_compressed_tensors():
# Test wNa16
compare_two_settings("nm-testing/tinyllama-oneshot-w4a16-channel-v2", [],
["--cpu-offload-gb", "1"])
# Test w4a16_marlin24
compare_two_settings("nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
[], ["--cpu-offload-gb", "1"])
# Test w8a8
compare_two_settings(
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", [],
["--cpu-offload-gb", "1"])
......@@ -12,6 +12,8 @@ from vllm import SamplingParams
from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
ENABLE_ARTIFICIAL_PREEMPT)
from ..models.utils import check_outputs_equal
MODELS = [
"facebook/opt-125m",
]
......@@ -54,8 +56,8 @@ def test_chunked_prefill_recompute(
max_num_seqs=max_num_seqs,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
ARTIFICIAL_PREEMPTION_MAX_CNT)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
......@@ -90,18 +92,18 @@ def test_preemption(
disable_log_stats=False,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
ARTIFICIAL_PREEMPTION_MAX_CNT)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
total_preemption = (
vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
vllm_output_ids, vllm_output_str = vllm_outputs[i]
assert hf_output_str == vllm_output_str, (
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
assert hf_output_ids == vllm_output_ids, (
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
assert ("is preempted by PreemptionMode.RECOMPUTE mode because there "
"is not enough KV cache space." in caplog_vllm.text)
# Ensure the count bucket of request-level histogram metrics matches
......@@ -147,10 +149,10 @@ def test_swap(
) as vllm_model:
vllm_outputs = vllm_model.generate_beam_search(example_prompts,
beam_width, max_tokens)
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
ARTIFICIAL_PREEMPTION_MAX_CNT)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
total_preemption = (
vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
for i in range(len(example_prompts)):
hf_output_ids, _ = hf_outputs[i]
......@@ -215,8 +217,8 @@ def test_swap_infeasible(
example_prompts,
sampling_params=sampling_params,
)
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
ARTIFICIAL_PREEMPTION_MAX_CNT)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
# Verify the request is ignored and not hang.
assert req_outputs[0].outputs[0].finish_reason == "length"
......@@ -254,8 +256,8 @@ def test_preemption_infeasible(
sampling_params=sampling_params,
)
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
ARTIFICIAL_PREEMPTION_MAX_CNT)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
# Verify the request is ignored and not hang.
for req_output in req_outputs:
......
import contextlib
import gc
import os
from typing import Any, Dict, List, Optional, Tuple, TypeVar
import sys
from collections import UserList
from typing import Any, Dict, List, Optional, Tuple, TypedDict, TypeVar, Union
import pytest
import torch
......@@ -9,18 +11,19 @@ import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq,
AutoProcessor, AutoTokenizer, BatchEncoding)
AutoTokenizer, BatchEncoding, BatchFeature)
from vllm import LLM, SamplingParams
from vllm.config import TokenizerPoolConfig, VisionLanguageConfig
from vllm.assets.image import ImageAsset
from vllm.config import TokenizerPoolConfig
from vllm.connections import global_http_connection
from vllm.distributed import (destroy_distributed_environment,
destroy_model_parallel)
from vllm.inputs import TextPrompt
from vllm.logger import init_logger
from vllm.multimodal import MultiModalData
from vllm.multimodal.image import ImageFeatureData, ImagePixelData
from vllm.sequence import SampleLogprobs
from vllm.utils import cuda_device_count_stateless, is_cpu
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
is_cpu)
logger = init_logger(__name__)
......@@ -28,22 +31,6 @@ _TEST_DIR = os.path.dirname(__file__)
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
# Multi modal related
# You can use `.buildkite/download-images.sh` to download the assets
PIXEL_VALUES_FILES = [
os.path.join(_TEST_DIR, "images", filename) for filename in
["stop_sign_pixel_values.pt", "cherry_blossom_pixel_values.pt"]
]
IMAGE_FEATURES_FILES = [
os.path.join(_TEST_DIR, "images", filename) for filename in
["stop_sign_image_features.pt", "cherry_blossom_image_features.pt"]
]
IMAGE_FILES = [
os.path.join(_TEST_DIR, "images", filename)
for filename in ["stop_sign.jpg", "cherry_blossom.jpg"]
]
assert len(PIXEL_VALUES_FILES) == len(IMAGE_FEATURES_FILES) == len(IMAGE_FILES)
def _read_prompts(filename: str) -> List[str]:
with open(filename, "r") as f:
......@@ -51,6 +38,50 @@ def _read_prompts(filename: str) -> List[str]:
return prompts
class _ImageAssetPrompts(TypedDict):
stop_sign: str
cherry_blossom: str
if sys.version_info < (3, 9):
# UserList cannot be subscripted
class _ImageAssetsBase(UserList):
pass
else:
class _ImageAssetsBase(UserList[ImageAsset]):
pass
class _ImageAssets(_ImageAssetsBase):
def __init__(self) -> None:
super().__init__([
ImageAsset("stop_sign"),
ImageAsset("cherry_blossom"),
])
def prompts(self, prompts: _ImageAssetPrompts) -> List[str]:
"""
Convenience method to define the prompt for each test image.
The order of the returned prompts matches the order of the
assets when iterating through this object.
"""
return [prompts["stop_sign"], prompts["cherry_blossom"]]
IMAGE_ASSETS = _ImageAssets()
"""Singleton instance of :class:`_ImageAssets`."""
@pytest.fixture(autouse=True)
def init_test_http_connection():
# pytest_asyncio may use a different event loop per test
# so we need to make sure the async client is created anew
global_http_connection.reuse_client = False
def cleanup():
destroy_model_parallel()
destroy_distributed_environment()
......@@ -81,31 +112,6 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
cleanup()
@pytest.fixture(scope="session")
def hf_images() -> List[Image.Image]:
return [Image.open(filename) for filename in IMAGE_FILES]
@pytest.fixture()
def vllm_images(request) -> List[MultiModalData]:
vision_language_config = request.getfixturevalue("model_and_config")[1]
if vision_language_config.image_input_type == (
VisionLanguageConfig.ImageInputType.IMAGE_FEATURES):
return [
ImageFeatureData(torch.load(filename))
for filename in IMAGE_FEATURES_FILES
]
else:
return [
ImagePixelData(Image.open(filename)) for filename in IMAGE_FILES
]
@pytest.fixture()
def vllm_image_tensors(request) -> List[torch.Tensor]:
return [torch.load(filename) for filename in PIXEL_VALUES_FILES]
@pytest.fixture
def example_prompts() -> List[str]:
prompts = []
......@@ -122,13 +128,12 @@ def example_long_prompts() -> List[str]:
return prompts
_STR_DTYPE_TO_TORCH_DTYPE = {
"half": torch.half,
"bfloat16": torch.bfloat16,
"float": torch.float,
}
@pytest.fixture(scope="session")
def image_assets() -> _ImageAssets:
return IMAGE_ASSETS
_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding)
_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature)
class HfRunner:
......@@ -144,11 +149,11 @@ class HfRunner:
model_name: str,
dtype: str = "half",
*,
model_kwargs: Optional[Dict[str, Any]] = None,
is_embedding_model: bool = False,
is_vision_model: bool = False,
) -> None:
assert dtype in _STR_DTYPE_TO_TORCH_DTYPE
torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
self.model_name = model_name
......@@ -166,11 +171,13 @@ class HfRunner:
else:
auto_cls = AutoModelForCausalLM
model_kwargs = model_kwargs if model_kwargs is not None else {}
self.model = self.wrap_device(
auto_cls.from_pretrained(
model_name,
torch_dtype=torch_dtype,
trust_remote_code=True,
**model_kwargs,
))
self.tokenizer = AutoTokenizer.from_pretrained(
......@@ -180,6 +187,9 @@ class HfRunner:
)
try:
# don't put this import at the top level
# it will call torch.cuda.device_count()
from transformers import AutoProcessor # noqa: F401
self.processor = AutoProcessor.from_pretrained(
model_name,
torch_dtype=torch_dtype,
......@@ -195,7 +205,7 @@ class HfRunner:
self,
prompts: List[str],
images: Optional[List[Image.Image]] = None,
**kwargs,
**kwargs: Any,
) -> List[Tuple[List[List[int]], List[str]]]:
if images:
assert len(prompts) == len(images)
......@@ -230,11 +240,13 @@ class HfRunner:
prompts: List[str],
max_tokens: int,
images: Optional[List[Image.Image]] = None,
**kwargs: Any,
) -> List[Tuple[List[int], str]]:
outputs = self.generate(prompts,
do_sample=False,
max_new_tokens=max_tokens,
images=images)
images=images,
**kwargs)
return [(output_ids[0], output_str[0])
for output_ids, output_str in outputs]
......@@ -264,19 +276,30 @@ class HfRunner:
self,
prompts: List[str],
max_tokens: int,
images: Optional[List[Image.Image]] = None,
**kwargs: Any,
) -> List[List[torch.Tensor]]:
all_logprobs = []
for prompt in prompts:
input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
all_logprobs: List[List[torch.Tensor]] = []
for i, prompt in enumerate(prompts):
processor_kwargs: Dict[str, Any] = {
"text": prompt,
"return_tensors": "pt",
}
if images is not None and images[i] is not None:
processor_kwargs["images"] = images[i]
inputs = self.processor(**processor_kwargs)
output = self.model.generate(
self.wrap_device(input_ids),
**self.wrap_device(inputs),
use_cache=True,
do_sample=False,
max_new_tokens=max_tokens,
output_hidden_states=True,
return_dict_in_generate=True,
**kwargs,
)
seq_logprobs = []
seq_logprobs: List[torch.Tensor] = []
for hidden_states in output.hidden_states:
last_hidden_states = hidden_states[-1][0]
logits = torch.matmul(
......@@ -296,20 +319,31 @@ class HfRunner:
prompts: List[str],
max_tokens: int,
num_logprobs: int,
images: Optional[List[Image.Image]] = None,
**kwargs: Any,
) -> List[Tuple[List[int], str, List[Dict[int, float]]]]:
all_logprobs: List[List[Dict[int, float]]] = []
all_output_ids: List[List[int]] = []
all_output_strs: List[str] = []
for prompt in prompts:
input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
for i, prompt in enumerate(prompts):
processor_kwargs: Dict[str, Any] = {
"text": prompt,
"return_tensors": "pt",
}
if images is not None and images[i] is not None:
processor_kwargs["images"] = images[i]
inputs = self.processor(**processor_kwargs)
output = self.model.generate(
self.wrap_device(input_ids),
**self.wrap_device(inputs),
use_cache=True,
do_sample=False,
max_new_tokens=max_tokens,
output_hidden_states=True,
return_dict_in_generate=True,
**kwargs,
)
seq_logprobs: List[torch.Tensor] = []
......@@ -342,7 +376,7 @@ class HfRunner:
all_logprobs.append(seq_logprobs_lst)
seq_ids = output.sequences[0]
output_len = seq_ids.shape[0] - input_ids.shape[1]
output_len = len(seq_logprobs_lst)
output_ids = seq_ids[-output_len:]
all_output_ids.append(output_ids.tolist())
all_output_strs.append(self.tokenizer.decode(output_ids))
......@@ -362,7 +396,7 @@ class HfRunner:
cleanup()
@pytest.fixture
@pytest.fixture(scope="session")
def hf_runner():
return HfRunner
......@@ -382,6 +416,7 @@ class VllmRunner:
block_size: int = 16,
enable_chunked_prefill: bool = False,
swap_space: int = 4,
enforce_eager: bool = False,
**kwargs,
) -> None:
self.model = LLM(
......@@ -390,6 +425,7 @@ class VllmRunner:
trust_remote_code=True,
dtype=dtype,
swap_space=swap_space,
enforce_eager=enforce_eager,
disable_log_stats=disable_log_stats,
tensor_parallel_size=tensor_parallel_size,
max_model_len=max_model_len,
......@@ -402,7 +438,7 @@ class VllmRunner:
self,
prompts: List[str],
sampling_params: SamplingParams,
images: Optional[List[MultiModalData]] = None,
images: Optional[List[Image.Image]] = None,
) -> List[Tuple[List[List[int]], List[str]]]:
if images is not None:
assert len(prompts) == len(images)
......@@ -410,7 +446,7 @@ class VllmRunner:
inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
if images is not None:
for i, image in enumerate(images):
inputs[i]["multi_modal_data"] = image
inputs[i]["multi_modal_data"] = {"image": image}
req_outputs = self.model.generate(inputs,
sampling_params=sampling_params)
......@@ -423,7 +459,7 @@ class VllmRunner:
req_sample_output_strs: List[str] = []
for sample in req_output.outputs:
output_str = sample.text
output_ids = sample.token_ids
output_ids = list(sample.token_ids)
req_sample_output_ids.append(prompt_ids + output_ids)
req_sample_output_strs.append(prompt_str + output_str)
outputs.append((req_sample_output_ids, req_sample_output_strs))
......@@ -433,10 +469,19 @@ class VllmRunner:
self,
prompts: List[str],
sampling_params: SamplingParams,
images: Optional[List[Image.Image]] = None,
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
assert sampling_params.logprobs is not None
req_outputs = self.model.generate(prompts,
if images is not None:
assert len(prompts) == len(images)
inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
if images is not None:
for i, image in enumerate(images):
inputs[i]["multi_modal_data"] = {"image": image}
req_outputs = self.model.generate(inputs,
sampling_params=sampling_params)
outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = []
for req_output in req_outputs:
......@@ -451,7 +496,7 @@ class VllmRunner:
self,
prompts: List[str],
max_tokens: int,
images: Optional[List[MultiModalData]] = None,
images: Optional[List[Image.Image]] = None,
) -> List[Tuple[List[int], str]]:
greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
outputs = self.generate(prompts, greedy_params, images=images)
......@@ -463,11 +508,17 @@ class VllmRunner:
prompts: List[str],
max_tokens: int,
num_logprobs: int,
images: Optional[Union[List[Image.Image],
List[List[Image.Image]]]] = None,
stop_token_ids: Optional[List[int]] = None,
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
greedy_logprobs_params = SamplingParams(temperature=0.0,
max_tokens=max_tokens,
logprobs=num_logprobs)
outputs = self.generate_w_logprobs(prompts, greedy_logprobs_params)
logprobs=num_logprobs,
stop_token_ids=stop_token_ids)
outputs = self.generate_w_logprobs(prompts,
greedy_logprobs_params,
images=images)
return [(output_ids, output_str, output_logprobs)
for output_ids, output_str, output_logprobs in outputs]
......@@ -513,6 +564,10 @@ def get_tokenizer_pool_config(tokenizer_group_type):
return TokenizerPoolConfig(pool_size=1,
pool_type="ray",
extra_config={})
if isinstance(tokenizer_group_type, type):
return TokenizerPoolConfig(pool_size=1,
pool_type=tokenizer_group_type,
extra_config={})
raise ValueError(f"Unknown tokenizer_group_type: {tokenizer_group_type}")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment