Commit af7f4372 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.5.5' into v0.5.5-dtk24.04.1

parents 5e19cdef 09c77926
[build-system]
# Should be mirrored in requirements-build.txt
requires = [
"cmake>=3.21",
"cmake>=3.26",
"ninja",
"packaging",
"setuptools >= 49.4.0",
"torch == 2.4.0",
"wheel",
"jinja2",
]
build-backend = "setuptools.build_meta"
......@@ -56,6 +57,7 @@ files = [
"vllm/*.py",
"vllm/adapter_commons",
"vllm/assets",
"vllm/entrypoints",
"vllm/inputs",
"vllm/logging",
"vllm/multimodal",
......
# Should be mirrored in pyproject.toml
cmake>=3.21
cmake>=3.26
ninja
packaging
setuptools>=49.4.0
torch==2.4.0
wheel
jinja2
cmake >= 3.21
ninja # For faster builds.
psutil
sentencepiece # Required for LLaMA tokenizer.
numpy < 2.0.0
......@@ -8,17 +6,23 @@ tqdm
py-cpuinfo
transformers >= 4.43.2 # Required for Chameleon and Llama 3.1 hotfox.
tokenizers >= 0.19.1 # Required for Llama 3.
protobuf # Required by LlamaTokenizer.
fastapi
aiohttp
openai
openai >= 1.0 # Ensure modern openai package (ensure types module present)
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
pydantic >= 2.8 # Required for OpenAI server.
pillow # Required for image processing
prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.10.3
lm-format-enforcer == 0.10.6
outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
typing_extensions
typing_extensions >= 4.10
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
pyzmq
msgspec
librosa # Required for audio processing
soundfile # Required for audio processing
gguf == 0.9.1
importlib_metadata
......@@ -7,5 +7,5 @@ nvidia-ml-py # for pynvml package
torch == 2.4.0
# These must be updated alongside torch
torchvision == 0.19 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
xformers == 0.0.27.post2 # Requires PyTorch 2.4.0
vllm-flash-attn == 2.6.1 # Requires PyTorch 2.4.0
xformers == 0.0.27.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.4.0
vllm-flash-attn == 2.6.1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.4.0
......@@ -8,7 +8,7 @@ isort==5.13.2
clang-format==18.1.5
# type checking
mypy==1.9.0
mypy==1.11.1
types-PyYAML
types-requests
types-setuptools
# Common dependencies
# -r requirements-common.txt
# TODO: remove temporary copy of all common dependencies once Optimum Intel will support Transformers >= 4.43.2
cmake >= 3.21
ninja # For faster builds.
psutil
sentencepiece # Required for LLaMA tokenizer.
numpy < 2.0.0
requests
tqdm
py-cpuinfo
transformers < 4.43
tokenizers >= 0.19.1 # Required for Llama 3.
fastapi
aiohttp
openai
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
pillow # Required for image processing
prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.10.3
outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
typing_extensions
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
pyzmq
-r requirements-common.txt
# OpenVINO dependencies
torch >= 2.1.2
openvino ~= 2024.3.0.dev
openvino-tokenizers[transformers] ~= 2024.3.0.0.dev
optimum-intel[openvino] >= 1.18.1
openvino ~= 2024.3.0
optimum-intel[openvino] >= 1.18.2
......@@ -11,7 +11,7 @@ pytest-shard
# testing utils
awscli
einops # required for MPT
einops # required for MPT and qwen-vl
httpx
peft
requests
......@@ -19,9 +19,15 @@ ray
sentence-transformers # required for embedding
compressed-tensors==0.4.0 # required for compressed-tensors
timm # required for internvl test
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
# TODO: Add this after fully implementing llava(mantis)
# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test
# Benchmarking
aiohttp
# quantization
bitsandbytes==0.42.0
buildkite-test-collector==0.1.8
\ No newline at end of file
......@@ -68,9 +68,12 @@ envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
# vLLM only supports Linux platform
assert sys.platform.startswith(
"linux"), "vLLM only supports Linux platform (including WSL)."
if not sys.platform.startswith("linux"):
logger.warning(
"vLLM only supports Linux platform (including WSL). "
"Building on %s, "
"so vLLM may not be able to run correctly", sys.platform)
VLLM_TARGET_DEVICE = "empty"
MAIN_CUDA_VERSION = "12.1"
......@@ -188,6 +191,10 @@ class cmake_build_ext(build_ext):
# match.
cmake_args += ['-DVLLM_PYTHON_EXECUTABLE={}'.format(sys.executable)]
# Pass the python path to cmake so it can reuse the build dependencies
# on subsequent calls to python.
cmake_args += ['-DVLLM_PYTHON_PATH={}'.format(":".join(sys.path))]
#
# Setup parallelism and build tool
#
......@@ -238,6 +245,10 @@ class cmake_build_ext(build_ext):
subprocess.check_call(["cmake", *build_args], cwd=self.build_temp)
def _no_device() -> bool:
return VLLM_TARGET_DEVICE == "empty"
def _is_cuda() -> bool:
has_cuda = torch.version.cuda is not None
return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
......@@ -279,7 +290,7 @@ def _build_custom_ops() -> bool:
def _build_core_ext() -> bool:
return not _is_neuron() and not _is_tpu()
return not (_is_neuron() or _is_tpu() or _is_openvino() or _is_xpu())
def get_hipcc_rocm_version():
......@@ -398,13 +409,13 @@ try:
import vllm.commit_id
__commit__ = vllm.commit_id.__commit__
except Exception as e:
warnings.warn(f"Failed to read commit hash:\\n + str(e)",
warnings.warn(f"Failed to read commit hash:\n{e}",
RuntimeWarning,
stacklevel=2)
__commit__ = "COMMIT_HASH_PLACEHOLDER"
__version__ = "0.5.4"
__dcu_version__ = f'0.5.4+{version}'
__version__ = "0.5.5"
__dcu_version__ = f'0.5.5+{version}'
"""
......@@ -424,7 +435,9 @@ def get_version():
def get_vllm_version() -> str:
# version = find_version(get_path("vllm", "version.py"))
if _is_cuda():
if _no_device():
version += "+empty"
elif _is_cuda():
cuda_version = str(get_nvcc_cuda_version())
if cuda_version != MAIN_CUDA_VERSION:
cuda_version_str = cuda_version.replace(".", "")[:3]
......@@ -479,7 +492,9 @@ def get_requirements() -> List[str]:
resolved_requirements.append(line)
return resolved_requirements
if _is_cuda():
if _no_device():
requirements = _read_requirements("requirements-cuda.txt")
elif _is_cuda():
requirements = _read_requirements("requirements-cuda.txt")
cuda_major, cuda_minor = torch.version.cuda.split(".")
modified_requirements = []
......@@ -528,6 +543,9 @@ if envs.VLLM_USE_PRECOMPILED:
ext_modules = []
package_data["vllm"].append("*.so")
if _no_device():
ext_modules = []
setup(
name="vllm",
version=get_vllm_version(),
......
"""vllm.entrypoints.api_server with some extra logging for testing."""
from typing import Any, Dict
from typing import Any, Dict, Iterable
import uvicorn
from fastapi.responses import JSONResponse, Response
......@@ -18,9 +18,10 @@ class AsyncLLMEngineWithStats(AsyncLLMEngine):
super().__init__(*args, **kwargs)
self._num_aborts = 0
async def abort(self, request_id: str) -> None:
await super().abort(request_id)
self._num_aborts += 1
async def _engine_abort(self, request_ids: Iterable[str]):
ids = list(request_ids)
self._num_aborts += len(ids)
await super()._engine_abort(ids)
def testing_stats(self) -> Dict[str, Any]:
return {"num_aborted_requests": self._num_aborts}
......
import os
import subprocess
import sys
import time
......@@ -35,11 +36,17 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
"127.0.0.1", "--tokenizer-pool-size",
str(tokenizer_pool_size)
]
# Copy the environment variables and append `VLLM_ALLOW_ENGINE_USE_RAY=1`
# to prevent `--engine-use-ray` raises an exception due to it deprecation
env_vars = os.environ.copy()
env_vars["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
if engine_use_ray:
commands.append("--engine-use-ray")
if worker_use_ray:
commands.append("--worker-use-ray")
uvicorn_process = subprocess.Popen(commands)
uvicorn_process = subprocess.Popen(commands, env=env_vars)
yield
uvicorn_process.terminate()
......
import asyncio
import os
from asyncio import CancelledError
from dataclasses import dataclass
from typing import Optional
import pytest
import pytest_asyncio
import torch
from vllm import SamplingParams
from vllm.config import ParallelConfig
from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
from vllm.outputs import RequestOutput as RealRequestOutput
from ..conftest import cleanup
from ..utils import wait_for_gpu_memory_to_clear
......@@ -106,21 +112,49 @@ async def test_new_requests_event():
assert engine.engine.add_request_calls == 3
assert engine.engine.step_calls == old_step_calls + 1
# Allow deprecated engine_use_ray to not raise exception
os.environ["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True)
assert engine.get_model_config() is not None
assert engine.get_tokenizer() is not None
assert engine.get_decoding_config() is not None
os.environ.pop("VLLM_ALLOW_ENGINE_USE_RAY")
def test_asyncio_run():
def start_engine():
wait_for_gpu_memory_to_clear(
devices=list(range(torch.cuda.device_count())),
threshold_bytes=2 * 2**30,
timeout_s=60,
)
engine = AsyncLLMEngine.from_engine_args(
AsyncEngineArgs(model="facebook/opt-125m"))
return AsyncLLMEngine.from_engine_args(
AsyncEngineArgs(model="facebook/opt-125m", enforce_eager=True))
@pytest_asyncio.fixture(scope="module")
async def async_engine():
engine = await asyncio.get_event_loop().run_in_executor(executor=None,
func=start_engine)
try:
yield engine
finally:
engine.shutdown_background_loop()
del engine
await asyncio.sleep(0.1)
cleanup()
@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
# So we can share the async engine fixture between these tests
return False
@pytest.mark.asyncio(scope="module")
async def test_asyncio_run(async_engine):
async def run(prompt: str):
sampling_params = SamplingParams(
......@@ -128,17 +162,64 @@ def test_asyncio_run():
max_tokens=32,
)
async for output in engine.generate(prompt,
async for output in async_engine.generate(prompt,
sampling_params,
request_id=prompt):
final_output = output
return final_output
async def generate():
return await asyncio.gather(
results = await asyncio.gather(
run("test0"),
run("test1"),
)
results = asyncio.run(generate())
assert len(results) == 2
@pytest.mark.asyncio(scope="module")
async def test_cancellation(async_engine):
sampling_params = SamplingParams(
temperature=0,
min_tokens=10,
max_tokens=10,
)
i = 0
with pytest.raises(CancelledError):
async for output in async_engine.generate("test2",
sampling_params,
request_id="test2"):
assert not output.finished
i += 1
if i == 5:
await async_engine.abort("test2")
assert i == 5
@pytest.mark.asyncio(scope="module")
async def test_delayed_generator(async_engine):
sampling_params = SamplingParams(
temperature=0,
min_tokens=10,
max_tokens=10,
)
stream = async_engine.generate("test3",
sampling_params,
request_id="test3")
i = 0
final_output: Optional[RealRequestOutput] = None
async for output in stream:
final_output = output
if i == 0:
# wait for generation to complete before consuming
# the remaining messages
await asyncio.sleep(1)
if i < 9:
assert not output.finished
i += 1
assert i == 10
assert final_output is not None
assert len(final_output.outputs[0].token_ids) == 10
assert final_output.finished
import os
import pathlib
import pytest
from vllm.entrypoints.chat_utils import load_chat_template
from vllm.entrypoints.chat_utils import apply_chat_template, load_chat_template
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.transformers_utils.tokenizer import get_tokenizer
chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath(
__file__))).parent.parent / "examples/template_chatml.jinja"
from ..utils import VLLM_PATH
chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()
# Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATON_OUTPUT = [
("facebook/opt-125m", None, True,
"Hello</s>Hi there!</s>What is the capital of</s>"),
("facebook/opt-125m", None, False,
"Hello</s>Hi there!</s>What is the capital of</s>"),
("facebook/opt-125m", chatml_jinja_path, True, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
......@@ -93,11 +87,12 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
add_generation_prompt=add_generation_prompt)
# Call the function and get the result
result = tokenizer.apply_chat_template(
result = apply_chat_template(
tokenizer,
conversation=mock_request.messages,
tokenize=False,
chat_template=mock_request.chat_template or template_content,
add_generation_prompt=mock_request.add_generation_prompt,
chat_template=mock_request.chat_template or template_content)
)
# Test assertion
assert result == expected_output, (
......
import openai # use the official client for correctness check
import pytest
from ..utils import RemoteOpenAIServer
from ..utils import VLLM_PATH, RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "facebook/opt-125m"
chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()
@pytest.fixture(scope="module")
......@@ -16,10 +18,16 @@ def server():
"--max-model-len",
"2048",
"--enforce-eager",
"--engine-use-ray"
"--engine-use-ray",
"--chat-template",
str(chatml_jinja_path),
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
# Allow `--engine-use-ray`, otherwise the launch of the server throw
# an error due to try to use a deprecated feature
env_dict = {"VLLM_ALLOW_ENGINE_USE_RAY": "1"}
with RemoteOpenAIServer(MODEL_NAME, args,
env_dict=env_dict) as remote_server:
yield remote_server
......@@ -83,7 +91,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI):
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=13, total_tokens=23)
completion_tokens=10, prompt_tokens=55, total_tokens=65)
message = choice.message
assert message.content is not None and len(message.content) >= 10
......
......@@ -10,23 +10,23 @@ async def test_request_tracker():
stream_1 = tracker.add_request("1")
assert tracker.new_requests_event.is_set()
await tracker.wait_for_new_requests()
new, finished = tracker.get_new_and_finished_requests()
new, aborted = tracker.get_new_and_aborted_requests()
assert not tracker.new_requests_event.is_set()
assert len(new) == 1
assert new[0]["request_id"] == "1"
assert not finished
assert not aborted
assert not stream_1.finished
stream_2 = tracker.add_request("2")
stream_3 = tracker.add_request("3")
assert tracker.new_requests_event.is_set()
await tracker.wait_for_new_requests()
new, finished = tracker.get_new_and_finished_requests()
new, aborted = tracker.get_new_and_aborted_requests()
assert not tracker.new_requests_event.is_set()
assert len(new) == 2
assert new[0]["request_id"] == "2"
assert new[1]["request_id"] == "3"
assert not finished
assert not aborted
assert not stream_2.finished
assert not stream_3.finished
......@@ -36,9 +36,9 @@ async def test_request_tracker():
assert not tracker.new_requests_event.is_set()
tracker.abort_request("1")
new, finished = tracker.get_new_and_finished_requests()
assert len(finished) == 1
assert "1" in finished
new, aborted = tracker.get_new_and_aborted_requests()
assert len(aborted) == 1
assert "1" in aborted
assert not new
assert stream_1.finished
......@@ -46,9 +46,11 @@ async def test_request_tracker():
tracker.abort_request("4")
assert tracker.new_requests_event.is_set()
await tracker.wait_for_new_requests()
new, finished = tracker.get_new_and_finished_requests()
assert len(finished) == 1
assert "4" in finished
new, aborted = tracker.get_new_and_aborted_requests()
# aborted new requests will cancel each other out -
# there's no need for them to propagate into the
# engine
assert not aborted
assert not new
assert stream_4.finished
......@@ -57,10 +59,9 @@ async def test_request_tracker():
tracker.process_request_output(
RequestOutput("2", "output", [], [], [], finished=True))
await tracker.wait_for_new_requests()
new, finished = tracker.get_new_and_finished_requests()
new, aborted = tracker.get_new_and_aborted_requests()
assert not tracker.new_requests_event.is_set()
assert len(finished) == 1
assert "2" in finished
assert not aborted
assert len(new) == 1
assert new[0]["request_id"] == "5"
assert stream_2.finished
......
......@@ -6,14 +6,27 @@ prefill requests are chunked.
Run `pytest tests/models/test_chunked_prefill.py`.
"""
import pytest
from ..models.utils import check_outputs_equal
from ..models.utils import check_logprobs_close, check_outputs_equal
MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
]
E5M2_KV_MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-chat-hf",
]
E4M3_KV_MODELS = [
"meta-llama/Llama-2-7b-chat-hf", "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
]
KV_CACHE_QUANTIZATION_PATHS = {
"meta-llama/Llama-2-7b-chat-hf":
"./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
}
@pytest.mark.parametrize("model", MODELS)
......@@ -35,11 +48,11 @@ def test_models(
enforce_eager: bool,
tensor_parallel_size: int,
) -> None:
max_num_seqs = min(chunked_prefill_token_size, 256)
enable_chunked_prefill = False
max_num_batched_tokens = None
if chunked_prefill_token_size != -1:
enable_chunked_prefill = True
"""
Checks exact match decode between huggingface model and vllm runner with
chunked prefill.
"""
max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size
with hf_runner(model, dtype=dtype) as hf_model:
......@@ -49,7 +62,7 @@ def test_models(
model,
dtype=dtype,
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=enable_chunked_prefill,
enable_chunked_prefill=True,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
......@@ -62,3 +75,78 @@ def test_models(
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize("kv_cache_dtype,model",
[("fp8_e5m2", m)
for m in E5M2_KV_MODELS] + [("fp8_e4m3", m)
for m in E4M3_KV_MODELS])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("chunked_prefill_token_size", [4, 16])
@pytest.mark.parametrize("enforce_eager", [False, True])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
def test_models_with_fp8_kv_cache(
vllm_runner,
example_prompts,
kv_cache_dtype: str,
model: str,
max_tokens: int,
chunked_prefill_token_size: int,
enforce_eager: bool,
tensor_parallel_size: int,
) -> None:
"""
Only checks log probs match between chunked-prefill and
non-chunked-prefill version of vLLM model runner.
This test is used when there is discrepancy in kernels
/ numerics (e.g. when using lower-precision types like FP8).
"""
NUM_LOG_PROBS = 8
if model == "facebook/opt-125m":
pytest.skip(
"#7378: CUDA illegal memory access (undiagnosed) facebook/opt-125m"
)
max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size
extra_kwargs = {}
if model in KV_CACHE_QUANTIZATION_PATHS:
extra_kwargs["quantization_param_path"] = KV_CACHE_QUANTIZATION_PATHS[
model]
with vllm_runner(
model,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
kv_cache_dtype=kv_cache_dtype,
**extra_kwargs,
) as vllm_model:
no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
with vllm_runner(
model,
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=True,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
kv_cache_dtype=kv_cache_dtype,
**extra_kwargs,
) as vllm_model:
chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
check_logprobs_close(
outputs_0_lst=no_chunked_prefill_outputs,
outputs_1_lst=chunked_prefill_outputs,
name_0="no_chunked_prefill",
name_1="chunked_prefill",
)
import pytest
from tests.quantization.utils import is_quant_method_supported
from ..utils import compare_two_settings
def test_cpu_offload():
compare_two_settings("meta-llama/Llama-2-7b-hf", [],
["--cpu-offload-gb", "4"])
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.")
def test_cpu_offload_fp8():
# Test quantization of an unquantized checkpoint
compare_two_settings("meta-llama/Meta-Llama-3-8B-Instruct",
["--quantization", "fp8"],
["--quantization", "fp8", "--cpu-offload-gb", "2"])
# Test loading a quantized checkpoint
compare_two_settings("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", [],
["--cpu-offload-gb", "2"])
@pytest.mark.skipif(not is_quant_method_supported("awq"),
reason="awq is not supported on this GPU type.")
def test_cpu_offload_awq():
compare_two_settings("casperhansen/llama-3-8b-instruct-awq", [],
["--cpu-offload-gb", "2"])
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_compressed_tensors():
# Test wNa16
compare_two_settings("nm-testing/tinyllama-oneshot-w4a16-channel-v2", [],
["--cpu-offload-gb", "1"])
# Test w4a16_marlin24
compare_two_settings("nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
[], ["--cpu-offload-gb", "1"])
# Test w8a8
compare_two_settings(
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", [],
["--cpu-offload-gb", "1"])
......@@ -8,6 +8,7 @@ pytest tests/basic_correctness/test_preemption.py`.
import pytest
from prometheus_client import REGISTRY
import vllm.envs as envs
from vllm import SamplingParams
from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
ENABLE_ARTIFICIAL_PREEMPT)
......@@ -24,6 +25,13 @@ assert ENABLE_ARTIFICIAL_PREEMPT is True, (
"tests/basic_correctness/test_preemption.py`")
@pytest.fixture
def worker_use_ray() -> bool:
# When SPMD worker is used, use ray_use_worker=True
# to test delta input optimization works with preemption.
return envs.VLLM_USE_RAY_SPMD_WORKER
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [96])
......@@ -36,6 +44,7 @@ def test_chunked_prefill_recompute(
dtype: str,
max_tokens: int,
chunked_prefill_token_size: int,
worker_use_ray: bool,
) -> None:
"""Ensure that chunked prefill works with preemption."""
max_num_seqs = min(chunked_prefill_token_size, 256)
......@@ -54,6 +63,7 @@ def test_chunked_prefill_recompute(
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=enable_chunked_prefill,
max_num_seqs=max_num_seqs,
worker_use_ray=worker_use_ray,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
......@@ -80,6 +90,7 @@ def test_preemption(
model: str,
dtype: str,
max_tokens: int,
worker_use_ray: bool,
) -> None:
"""By default, recompute preemption is enabled"""
......@@ -90,6 +101,7 @@ def test_preemption(
model,
dtype=dtype,
disable_log_stats=False,
worker_use_ray=worker_use_ray,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
......@@ -134,6 +146,7 @@ def test_swap(
dtype: str,
max_tokens: int,
beam_width: int,
worker_use_ray: bool,
) -> None:
"""Use beam search enables swapping."""
example_prompts = example_prompts[:1]
......@@ -146,6 +159,7 @@ def test_swap(
dtype=dtype,
swap_space=10,
disable_log_stats=False,
worker_use_ray=worker_use_ray,
) as vllm_model:
vllm_outputs = vllm_model.generate_beam_search(example_prompts,
beam_width, max_tokens)
......@@ -191,6 +205,7 @@ def test_swap_infeasible(
dtype: str,
max_tokens: int,
beam_width: int,
worker_use_ray: bool,
) -> None:
"""Verify infeasible swap request will be ignored."""
BLOCK_SIZE = 16
......@@ -207,6 +222,7 @@ def test_swap_infeasible(
# decode blocks are not enough to finish.
num_gpu_blocks_override=prefill_blocks + decode_blocks,
max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE,
worker_use_ray=worker_use_ray,
) as vllm_model:
sampling_params = SamplingParams(n=beam_width,
use_beam_search=True,
......@@ -234,6 +250,7 @@ def test_preemption_infeasible(
model: str,
dtype: str,
max_tokens: int,
worker_use_ray: bool,
) -> None:
"""Verify infeasible preemption request will be ignored."""
BLOCK_SIZE = 16
......@@ -248,6 +265,7 @@ def test_preemption_infeasible(
# ignored instead of hanging forever.
num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
worker_use_ray=worker_use_ray,
) as vllm_model:
sampling_params = SamplingParams(max_tokens=max_tokens,
ignore_eos=True)
......
import os
import pytest
@pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"])
def test_full_graph(model):
# make sure these models can be captured in full graph mode
os.environ["VLLM_TEST_DYNAMO_GRAPH_CAPTURE"] = "1"
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0)
llm = LLM(model="meta-llama/Meta-Llama-3-8B")
llm.generate(prompts, sampling_params)
import contextlib
import gc
import json
import os
import sys
import tempfile
from collections import UserList
from typing import Any, Dict, List, Optional, Tuple, TypedDict, TypeVar, Union
from enum import Enum
from typing import (Any, Callable, Dict, List, Optional, Tuple, TypedDict,
TypeVar, Union)
import numpy as np
import pytest
import torch
import torch.nn as nn
import torch.nn.functional as F
from huggingface_hub import snapshot_download
from PIL import Image
from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq,
AutoTokenizer, BatchEncoding, BatchFeature)
from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
BatchFeature)
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.config import TokenizerPoolConfig
from vllm.connections import global_http_connection
from vllm.distributed import (destroy_distributed_environment,
destroy_model_parallel)
from vllm.inputs import TextPrompt
destroy_model_parallel,
init_distributed_environment,
initialize_model_parallel)
from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
to_enc_dec_tuple_list, zip_enc_dec_prompts)
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
from vllm.sequence import SampleLogprobs
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
is_cpu)
identity, is_cpu)
logger = init_logger(__name__)
......@@ -82,6 +92,21 @@ def init_test_http_connection():
global_http_connection.reuse_client = False
@pytest.fixture
def dist_init():
temp_file = tempfile.mkstemp()[1]
init_distributed_environment(
world_size=1,
rank=0,
distributed_init_method=f"file://{temp_file}",
local_rank=0,
backend="nccl",
)
initialize_model_parallel(1, 1)
yield
cleanup()
def cleanup():
destroy_model_parallel()
destroy_distributed_environment()
......@@ -120,6 +145,46 @@ def example_prompts() -> List[str]:
return prompts
class DecoderPromptType(Enum):
"""For encoder/decoder models only."""
CUSTOM = 1
NONE = 2
EMPTY_STR = 3
@pytest.fixture
def example_encoder_decoder_prompts(
) -> Dict[DecoderPromptType, List[ExplicitEncoderDecoderPrompt]]:
'''
Returns an encoder prompt list and a decoder prompt list, wherein each pair
of same-index entries in both lists corresponds to an (encoder prompt,
decoder prompt) tuple.
Returns:
* Encoder prompt list
* Decoder prompt list (reverse of encoder prompt list)
'''
encoder_prompts = []
for filename in _TEST_PROMPTS:
encoder_prompts += _read_prompts(filename)
custom_decoder_prompts = encoder_prompts[::-1]
empty_str_decoder_prompts = [""] * len(encoder_prompts)
none_decoder_prompts = [None] * len(encoder_prompts)
# NONE decoder prompt type
return {
DecoderPromptType.NONE:
zip_enc_dec_prompts(encoder_prompts, none_decoder_prompts),
DecoderPromptType.EMPTY_STR:
zip_enc_dec_prompts(encoder_prompts, empty_str_decoder_prompts),
DecoderPromptType.CUSTOM:
zip_enc_dec_prompts(encoder_prompts, custom_decoder_prompts),
}
@pytest.fixture
def example_long_prompts() -> List[str]:
prompts = []
......@@ -151,7 +216,9 @@ class HfRunner:
*,
model_kwargs: Optional[Dict[str, Any]] = None,
is_embedding_model: bool = False,
is_vision_model: bool = False,
auto_cls=AutoModelForCausalLM,
postprocess_inputs: Callable[[BatchEncoding],
BatchEncoding] = identity,
) -> None:
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
......@@ -166,11 +233,6 @@ class HfRunner:
device="cpu",
).to(dtype=torch_dtype))
else:
if is_vision_model:
auto_cls = AutoModelForVision2Seq
else:
auto_cls = AutoModelForCausalLM
model_kwargs = model_kwargs if model_kwargs is not None else {}
self.model = self.wrap_device(
auto_cls.from_pretrained(
......@@ -195,12 +257,14 @@ class HfRunner:
torch_dtype=torch_dtype,
trust_remote_code=True,
)
except Exception:
except Exception as exc:
logger.warning(
"Unable to auto-load processor from HuggingFace for "
"model %s. Using tokenizer instead.", model_name)
"Unable to auto-load HuggingFace processor for model (%s). "
"Using tokenizer instead. Reason: %s", model_name, exc)
self.processor = self.tokenizer
self.postprocess_inputs = postprocess_inputs
def generate(
self,
prompts: List[str],
......@@ -220,6 +284,7 @@ class HfRunner:
processor_kwargs["images"] = images[i]
inputs = self.processor(**processor_kwargs)
inputs = self.postprocess_inputs(inputs)
output_ids = self.model.generate(
**self.wrap_device(inputs),
......@@ -289,6 +354,7 @@ class HfRunner:
processor_kwargs["images"] = images[i]
inputs = self.processor(**processor_kwargs)
inputs = self.postprocess_inputs(inputs)
output = self.model.generate(
**self.wrap_device(inputs),
......@@ -314,12 +380,51 @@ class HfRunner:
all_logprobs.append(seq_logprobs)
return all_logprobs
def _hidden_states_to_logprobs(
self,
hidden_states,
num_logprobs,
) -> Tuple[List[Dict[int, float]], int]:
seq_logprobs: List[torch.Tensor] = []
output_len = len(hidden_states)
for _, hidden_state in enumerate(hidden_states):
last_hidden_states = hidden_state[-1][0]
logits = torch.matmul(
last_hidden_states,
self.model.get_output_embeddings().weight.t(),
)
if getattr(self.model.get_output_embeddings(), "bias",
None) is not None:
logits += self.model.get_output_embeddings().bias.unsqueeze(0)
logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
seq_logprobs.append(logprobs)
# convert to dict
seq_logprobs_lst: List[Dict[int, float]] = []
for tok_idx, tok_logprobs in enumerate(seq_logprobs):
# drop prompt logprobs
if tok_idx == 0:
tok_logprobs = tok_logprobs[-1, :].reshape(1, -1)
topk = tok_logprobs.topk(num_logprobs)
tok_logprobs_dct = {}
for token_id, logprob in zip(topk.indices[0], topk.values[0]):
tok_logprobs_dct[token_id.item()] = logprob.item()
seq_logprobs_lst.append(tok_logprobs_dct)
return (
seq_logprobs_lst,
output_len,
)
def generate_greedy_logprobs_limit(
self,
prompts: List[str],
max_tokens: int,
num_logprobs: int,
images: Optional[List[Image.Image]] = None,
audios: Optional[List[Tuple[np.ndarray, int]]] = None,
**kwargs: Any,
) -> List[Tuple[List[int], str, List[Dict[int, float]]]]:
all_logprobs: List[List[Dict[int, float]]] = []
......@@ -334,7 +439,13 @@ class HfRunner:
if images is not None and images[i] is not None:
processor_kwargs["images"] = images[i]
if audios is not None:
audio, sr = audios[i]
processor_kwargs["audio"] = audio
processor_kwargs["sampling_rate"] = sr
inputs = self.processor(**processor_kwargs)
inputs = self.postprocess_inputs(inputs)
output = self.model.generate(
**self.wrap_device(inputs),
......@@ -346,37 +457,66 @@ class HfRunner:
**kwargs,
)
seq_logprobs: List[torch.Tensor] = []
for _, hidden_states in enumerate(output.hidden_states):
last_hidden_states = hidden_states[-1][0]
logits = torch.matmul(
last_hidden_states,
self.model.get_output_embeddings().weight.t(),
)
if getattr(self.model.get_output_embeddings(), "bias",
None) is not None:
logits += self.model.get_output_embeddings(
).bias.unsqueeze(0)
logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
seq_logprobs.append(logprobs)
(
seq_logprobs_lst,
output_len,
) = self._hidden_states_to_logprobs(output.hidden_states,
num_logprobs)
# convert to dict
seq_logprobs_lst: List[Dict[int, float]] = []
for tok_idx, tok_logprobs in enumerate(seq_logprobs):
# drop prompt logprobs
if tok_idx == 0:
tok_logprobs = tok_logprobs[-1, :].reshape(1, -1)
topk = tok_logprobs.topk(num_logprobs)
all_logprobs.append(seq_logprobs_lst)
seq_ids = output.sequences[0]
output_len = len(seq_logprobs_lst)
output_ids = seq_ids[-output_len:]
all_output_ids.append(output_ids.tolist())
all_output_strs.append(self.tokenizer.decode(output_ids))
tok_logprobs_dct = {}
for token_id, logprob in zip(topk.indices[0], topk.values[0]):
tok_logprobs_dct[token_id.item()] = logprob.item()
outputs = zip(all_output_ids, all_output_strs, all_logprobs)
return [(output_ids, output_str, output_logprobs)
for output_ids, output_str, output_logprobs in outputs]
seq_logprobs_lst.append(tok_logprobs_dct)
def generate_encoder_decoder_greedy_logprobs_limit(
self,
encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
max_tokens: int,
num_logprobs: int,
**kwargs: Any,
) -> List[Tuple[List[int], str, List[Dict[int, float]]]]:
'''
Greedy logprobs generation for vLLM encoder/decoder models
'''
all_logprobs: List[List[Dict[int, float]]] = []
all_output_ids: List[List[int]] = []
all_output_strs: List[str] = []
for (encoder_prompt,
decoder_prompt) in to_enc_dec_tuple_list(encoder_decoder_prompts):
encoder_input_ids = self.wrap_device(
self.tokenizer(encoder_prompt, return_tensors="pt").input_ids)
decoder_input_ids = (
None if decoder_prompt is None else self.wrap_device(
self.tokenizer(decoder_prompt,
return_tensors="pt").input_ids))
output = self.model.generate(
encoder_input_ids,
decoder_input_ids=decoder_input_ids,
use_cache=True,
do_sample=False,
max_new_tokens=max_tokens,
output_hidden_states=True,
return_dict_in_generate=True,
**kwargs,
)
(
seq_logprobs_lst,
output_len,
) = self._hidden_states_to_logprobs(output.decoder_hidden_states,
num_logprobs)
all_logprobs.append(seq_logprobs_lst)
seq_ids = output.sequences[0]
output_len = len(seq_logprobs_lst)
output_ids = seq_ids[-output_len:]
all_output_ids.append(output_ids.tolist())
all_output_strs.append(self.tokenizer.decode(output_ids))
......@@ -416,7 +556,7 @@ class VllmRunner:
block_size: int = 16,
enable_chunked_prefill: bool = False,
swap_space: int = 4,
enforce_eager: bool = False,
enforce_eager: Optional[bool] = False,
**kwargs,
) -> None:
self.model = LLM(
......@@ -438,7 +578,8 @@ class VllmRunner:
self,
prompts: List[str],
sampling_params: SamplingParams,
images: Optional[List[Image.Image]] = None,
images: Optional[Union[List[Image.Image],
List[List[Image.Image]]]] = None,
) -> List[Tuple[List[List[int]], List[str]]]:
if images is not None:
assert len(prompts) == len(images)
......@@ -465,11 +606,27 @@ class VllmRunner:
outputs.append((req_sample_output_ids, req_sample_output_strs))
return outputs
def _final_steps_generate_w_logprobs(
self,
req_outputs: List[RequestOutput],
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = []
for req_output in req_outputs:
for sample in req_output.outputs:
output_str = sample.text
output_ids = list(sample.token_ids)
output_logprobs = sample.logprobs
outputs.append((output_ids, output_str, output_logprobs))
return outputs
def generate_w_logprobs(
self,
prompts: List[str],
sampling_params: SamplingParams,
images: Optional[List[Image.Image]] = None,
images: Optional[Union[List[Image.Image],
List[List[Image.Image]]]] = None,
audios: Optional[Union[List[Tuple[np.ndarray, int]],
List[List[Tuple[np.ndarray, int]]]]] = None
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
assert sampling_params.logprobs is not None
......@@ -481,16 +638,27 @@ class VllmRunner:
for i, image in enumerate(images):
inputs[i]["multi_modal_data"] = {"image": image}
if audios is not None:
for i, audio in enumerate(audios):
inputs[i]["multi_modal_data"] = {"audio": audio}
req_outputs = self.model.generate(inputs,
sampling_params=sampling_params)
outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = []
for req_output in req_outputs:
for sample in req_output.outputs:
output_str = sample.text
output_ids = sample.token_ids
output_logprobs = sample.logprobs
outputs.append((output_ids, output_str, output_logprobs))
return outputs
return self._final_steps_generate_w_logprobs(req_outputs)
def generate_encoder_decoder_w_logprobs(
self,
encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
sampling_params: SamplingParams,
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
'''
Logprobs generation for vLLM encoder/decoder models
'''
assert sampling_params.logprobs is not None
req_outputs = self.model.generate(encoder_decoder_prompts,
sampling_params=sampling_params)
return self._final_steps_generate_w_logprobs(req_outputs)
def generate_greedy(
self,
......@@ -510,6 +678,8 @@ class VllmRunner:
num_logprobs: int,
images: Optional[Union[List[Image.Image],
List[List[Image.Image]]]] = None,
audios: Optional[Union[List[Tuple[np.ndarray, int]],
List[List[Tuple[np.ndarray, int]]]]] = None,
stop_token_ids: Optional[List[int]] = None,
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
greedy_logprobs_params = SamplingParams(temperature=0.0,
......@@ -518,7 +688,28 @@ class VllmRunner:
stop_token_ids=stop_token_ids)
outputs = self.generate_w_logprobs(prompts,
greedy_logprobs_params,
images=images)
images=images,
audios=audios)
return [(output_ids, output_str, output_logprobs)
for output_ids, output_str, output_logprobs in outputs]
def generate_encoder_decoder_greedy_logprobs(
self,
encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
max_tokens: int,
num_logprobs: int,
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
greedy_logprobs_params = SamplingParams(temperature=0.0,
use_beam_search=False,
max_tokens=max_tokens,
logprobs=num_logprobs)
'''
Greedy logprobs generation for vLLM encoder/decoder models
'''
outputs = self.generate_encoder_decoder_w_logprobs(
encoder_decoder_prompts, greedy_logprobs_params)
return [(output_ids, output_str, output_logprobs)
for output_ids, output_str, output_logprobs in outputs]
......@@ -593,3 +784,26 @@ def num_gpus_available():
in current process."""
return cuda_device_count_stateless()
temp_dir = tempfile.gettempdir()
_dummy_path = os.path.join(temp_dir, "dummy_opt")
@pytest.fixture
def dummy_opt_path():
json_path = os.path.join(_dummy_path, "config.json")
if not os.path.exists(_dummy_path):
snapshot_download(repo_id="facebook/opt-125m",
local_dir=_dummy_path,
ignore_patterns=[
"*.bin", "*.bin.index.json", "*.pt", "*.h5",
"*.msgpack"
])
assert os.path.exists(json_path)
with open(json_path, "r") as f:
config = json.load(f)
config["architectures"] = ["MyOPTForCausalLM"]
with open(json_path, "w") as f:
json.dump(config, f)
return _dummy_path
......@@ -261,11 +261,22 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
# skip cuda graph creation for fast test.
"enforce_eager": True,
"enable_chunked_prefill": True,
"max_num_batched_tokens": 2,
"max_num_seqs": 2,
},
])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("per_test_common_llm_kwargs",
[{
"block_size": 8,
"max_num_batched_tokens": 2,
"max_num_seqs": 2,
}, {
"block_size": 8,
"max_num_batched_tokens": 3,
"max_num_seqs": 2,
}, {
"block_size": 8,
"max_num_batched_tokens": 256,
"max_num_seqs": 10,
}])
@pytest.mark.parametrize("baseline_llm_kwargs", [
{
"use_v2_block_manager": False,
......@@ -294,6 +305,7 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
prompts = [
"Hello, my name is",
"The president of the United States is",
("1 + " * 50) + " 1 = ", # Longer prompt.
"The capital of France is",
"The future of AI is",
]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment