Commit ad385667 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge branch 'v0.6.3.post1-dev'

parents be0967c1 903593d3
...@@ -96,19 +96,7 @@ echo 'vLLM yapf: Done' ...@@ -96,19 +96,7 @@ echo 'vLLM yapf: Done'
# Run mypy # Run mypy
echo 'vLLM mypy:' echo 'vLLM mypy:'
mypy --follow-imports skip # Note that this is less strict than CI tools/mypy.sh
mypy tests --follow-imports skip
mypy vllm/attention --follow-imports skip
mypy vllm/core --follow-imports skip
mypy vllm/distributed --follow-imports skip
mypy vllm/engine --follow-imports skip
mypy vllm/entrypoints --follow-imports skip
mypy vllm/executor --follow-imports skip
mypy vllm/lora --follow-imports skip
mypy vllm/model_executor --follow-imports skip
mypy vllm/prompt_adapter --follow-imports skip
mypy vllm/spec_decode --follow-imports skip
mypy vllm/worker --follow-imports skip
echo 'vLLM mypy: Done' echo 'vLLM mypy: Done'
...@@ -161,7 +149,7 @@ echo 'vLLM codespell: Done' ...@@ -161,7 +149,7 @@ echo 'vLLM codespell: Done'
# Lint specified files # Lint specified files
lint() { lint() {
ruff "$@" ruff check "$@"
} }
# Lint files that differ from main branch. Ignores dirs that are not slated # Lint files that differ from main branch. Ignores dirs that are not slated
...@@ -177,7 +165,7 @@ lint_changed() { ...@@ -177,7 +165,7 @@ lint_changed() {
if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \
ruff ruff check
fi fi
} }
...@@ -242,6 +230,11 @@ echo 'vLLM isort: Done' ...@@ -242,6 +230,11 @@ echo 'vLLM isort: Done'
# NOTE: Keep up to date with .github/workflows/clang-format.yml # NOTE: Keep up to date with .github/workflows/clang-format.yml
CLANG_FORMAT_EXCLUDES=( CLANG_FORMAT_EXCLUDES=(
'csrc/moe/topk_softmax_kernels.cu' 'csrc/moe/topk_softmax_kernels.cu'
'csrc/quantization/gguf/ggml-common.h'
'csrc/quantization/gguf/dequantize.cuh'
'csrc/quantization/gguf/vecdotq.cuh'
'csrc/quantization/gguf/mmq.cuh'
'csrc/quantization/gguf/mmvq.cuh'
) )
# Format specified files with clang-format # Format specified files with clang-format
...@@ -260,7 +253,7 @@ clang_format_changed() { ...@@ -260,7 +253,7 @@ clang_format_changed() {
MERGEBASE="$(git merge-base origin/main HEAD)" MERGEBASE="$(git merge-base origin/main HEAD)"
# Get the list of changed files, excluding the specified ones # Get the list of changed files, excluding the specified ones
changed_files=$(git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.h' '*.cpp' '*.cu' '*.cuh' | grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}")) changed_files=$(git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.h' '*.cpp' '*.cu' '*.cuh' | (grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}") || echo -e))
if [ -n "$changed_files" ]; then if [ -n "$changed_files" ]; then
echo "$changed_files" | xargs -P 5 clang-format -i echo "$changed_files" | xargs -P 5 clang-format -i
fi fi
...@@ -283,6 +276,9 @@ else ...@@ -283,6 +276,9 @@ else
fi fi
echo 'vLLM clang-format: Done' echo 'vLLM clang-format: Done'
echo 'vLLM actionlint:'
tools/actionlint.sh -color
echo 'vLLM actionlint: Done'
if ! git diff --quiet &>/dev/null; then if ! git diff --quiet &>/dev/null; then
echo 'Reformatted files. Please review and stage the changes.' echo 'Reformatted files. Please review and stage the changes.'
......
[build-system] [build-system]
# Should be mirrored in requirements-build.txt # Should be mirrored in requirements-build.txt
requires = [ requires = [
"cmake>=3.21", "cmake>=3.26",
"ninja", "ninja",
"packaging", "packaging",
"setuptools >= 49.4.0", "setuptools>=61",
"setuptools-scm>=8.0",
"torch == 2.4.0", "torch == 2.4.0",
"wheel", "wheel",
"jinja2",
] ]
build-backend = "setuptools.build_meta" build-backend = "setuptools.build_meta"
[tool.setuptools_scm]
# version_file = "vllm/_version.py" # currently handled by `setup.py:get_version()`
[tool.ruff] [tool.ruff]
# Allow lines to be as long as 80. # Allow lines to be as long as 80.
line-length = 80 line-length = 80
...@@ -18,6 +23,10 @@ exclude = [ ...@@ -18,6 +23,10 @@ exclude = [
"examples/fp8/quantizer/quantize.py" "examples/fp8/quantizer/quantize.py"
] ]
[tool.ruff.lint.per-file-ignores]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
[tool.ruff.lint] [tool.ruff.lint]
select = [ select = [
# pycodestyle # pycodestyle
...@@ -41,6 +50,8 @@ ignore = [ ...@@ -41,6 +50,8 @@ ignore = [
"E731", "E731",
# Loop control variable not used within loop body # Loop control variable not used within loop body
"B007", "B007",
# f-string format
"UP032",
] ]
[tool.mypy] [tool.mypy]
...@@ -56,6 +67,8 @@ files = [ ...@@ -56,6 +67,8 @@ files = [
"vllm/*.py", "vllm/*.py",
"vllm/adapter_commons", "vllm/adapter_commons",
"vllm/assets", "vllm/assets",
"vllm/entrypoints",
"vllm/core",
"vllm/inputs", "vllm/inputs",
"vllm/logging", "vllm/logging",
"vllm/multimodal", "vllm/multimodal",
...@@ -73,7 +86,7 @@ exclude = [ ...@@ -73,7 +86,7 @@ exclude = [
[tool.codespell] [tool.codespell]
ignore-words-list = "dout, te, indicies, subtile" ignore-words-list = "dout, te, indicies, subtile"
skip = "./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build" skip = "./tests/models/fixtures,./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build"
[tool.isort] [tool.isort]
use_parentheses = true use_parentheses = true
...@@ -82,5 +95,6 @@ skip_gitignore = true ...@@ -82,5 +95,6 @@ skip_gitignore = true
[tool.pytest.ini_options] [tool.pytest.ini_options]
markers = [ markers = [
"skip_global_cleanup", "skip_global_cleanup",
"vlm: run tests for vision language models only", "core_model: run this model test in each PR instead of just daily",
"distributed_2_gpus: run this test only in distributed tests for 2 GPUs",
] ]
# enable python only development
# copy compiled files to the current directory directly
import argparse
import os
import shutil
import subprocess
import sys
import warnings
parser = argparse.ArgumentParser(
    description="Development mode for python-only code")
parser.add_argument('-q',
                    '--quit-dev',
                    action='store_true',
                    help='Set the flag to quit development mode')
args = parser.parse_args()


def _require(condition, message):
    """Raise ``AssertionError(message)`` unless ``condition`` is true.

    Used instead of a bare ``assert`` so the checks that guard the
    rename/symlink steps below still run under ``python -O``.
    """
    if not condition:
        raise AssertionError(message)


# Cannot directly `import vllm`, because it would be imported from the
# current directory; ask pip where the installed package lives instead.
output = subprocess.run([sys.executable, "-m", "pip", "show", "vllm"],
                        capture_output=True)
_require(output.returncode == 0, "vllm is not installed")

text = output.stdout.decode("utf-8")

# Find the "Location: <path>" line. `splitlines()` drops a trailing "\r"
# on CRLF output, and slicing off the prefix keeps the whole path even
# when the path itself contains ": ".
package_path = None
location_prefix = "Location: "
for line in text.splitlines():
    if line.startswith(location_prefix):
        package_path = line[len(location_prefix):]
        break

_require(package_path is not None, "could not find package path")

cwd = os.getcwd()
_require(cwd != package_path, "should not import from the current directory")

# Files taken from the installed package and copied into the source tree.
files_to_copy = [
    "vllm/_C.abi3.so",
    "vllm/_core_C.abi3.so",
    "vllm/_moe_C.abi3.so",
    "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so",
    "vllm/vllm_flash_attn/flash_attn_interface.py",
    "vllm/vllm_flash_attn/__init__.py",
    # "vllm/_version.py", # not available in nightly wheels yet
]

# Try to create _version.py to avoid version related warning
# Refer to https://github.com/vllm-project/vllm/pull/8771
try:
    from setuptools_scm import get_version
    get_version(write_to="vllm/_version.py")
except ImportError:
    warnings.warn(
        "To avoid warnings related to vllm._version, "
        "you should install setuptools-scm by `pip install setuptools-scm`",
        stacklevel=2)

if not args.quit_dev:
    # Enter dev mode: copy the listed files into the checkout, back up the
    # installed package directory, and symlink the checkout in its place.
    for file in files_to_copy:
        src = os.path.join(package_path, file)
        dst = file
        print(f"Copying {src} to {dst}")
        shutil.copyfile(src, dst)

    pre_built_vllm_path = os.path.join(package_path, "vllm")
    tmp_path = os.path.join(package_path, "vllm_pre_built")
    current_vllm_path = os.path.join(cwd, "vllm")

    print(f"Renaming {pre_built_vllm_path} to {tmp_path} for backup")
    os.rename(pre_built_vllm_path, tmp_path)

    print(f"Linking {current_vllm_path} to {pre_built_vllm_path}")
    os.symlink(current_vllm_path, pre_built_vllm_path)
else:
    # Quit dev mode: remove the symlink and restore the backed-up package.
    vllm_symlink_path = os.path.join(package_path, "vllm")
    vllm_backup_path = os.path.join(package_path, "vllm_pre_built")
    current_vllm_path = os.path.join(cwd, "vllm")

    print(f"Unlinking {current_vllm_path} to {vllm_symlink_path}")
    _require(
        os.path.islink(vllm_symlink_path),
        f"not in dev mode: {vllm_symlink_path} is not a symbolic link")
    # Only undo a link that points at this checkout.
    _require(current_vllm_path == os.readlink(vllm_symlink_path),
             "current directory is not the source code of package")
    os.unlink(vllm_symlink_path)

    print(f"Recovering backup from {vllm_backup_path} to {vllm_symlink_path}")
    os.rename(vllm_backup_path, vllm_symlink_path)
# Dependencies for Ray accelerated DAG
cupy-cuda12x
ray >= 2.32
\ No newline at end of file
# Should be mirrored in pyproject.toml # Should be mirrored in pyproject.toml
cmake>=3.21 cmake>=3.26
ninja ninja
packaging packaging
setuptools>=49.4.0 setuptools>=61
setuptools-scm>=8
torch==2.4.0 torch==2.4.0
wheel wheel
jinja2
cmake >= 3.21
ninja # For faster builds.
psutil psutil
sentencepiece # Required for LLaMA tokenizer. sentencepiece # Required for LLaMA tokenizer.
numpy < 2.0.0 numpy < 2.0.0
requests requests >= 2.26.0
tqdm tqdm
py-cpuinfo py-cpuinfo
transformers >= 4.43.2 # Required for Chameleon and Llama 3.1 hotfix. transformers >= 4.45.2 # Required for Llama 3.2 and Qwen2-VL.
tokenizers >= 0.19.1 # Required for Llama 3. tokenizers >= 0.19.1 # Required for Llama 3.
fastapi protobuf # Required by LlamaTokenizer.
fastapi >= 0.107.0, < 0.113.0; python_version < '3.9'
fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9'
aiohttp aiohttp
openai openai >= 1.40.0 # Ensure modern openai package (ensure types module present)
uvicorn[standard] uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server. pydantic >= 2.9 # Required for fastapi >= 0.113.0
pillow # Required for image processing pillow # Required for image processing
prometheus_client >= 0.18.0 prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0 prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.10.3 lm-format-enforcer == 0.10.6
outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0 outlines >= 0.0.43, < 0.1
typing_extensions typing_extensions >= 4.10
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
partial-json-parser # used for parsing partial JSON outputs
pyzmq pyzmq
msgspec
gguf == 0.10.0
importlib_metadata
mistral_common[opencv] >= 1.4.4
pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL.
compressed-tensors == 0.6.0 # required for compressed-tensors
...@@ -7,5 +7,4 @@ nvidia-ml-py # for pynvml package ...@@ -7,5 +7,4 @@ nvidia-ml-py # for pynvml package
torch == 2.4.0 torch == 2.4.0
# These must be updated alongside torch # These must be updated alongside torch
torchvision == 0.19 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version torchvision == 0.19 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
xformers == 0.0.27.post2 # Requires PyTorch 2.4.0 xformers == 0.0.27.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.4.0
vllm-flash-attn == 2.6.1 # Requires PyTorch 2.4.0
...@@ -2,13 +2,13 @@ ...@@ -2,13 +2,13 @@
yapf==0.32.0 yapf==0.32.0
toml==0.10.2 toml==0.10.2
tomli==2.0.1 tomli==2.0.1
ruff==0.1.5 ruff==0.6.5
codespell==2.3.0 codespell==2.3.0
isort==5.13.2 isort==5.13.2
clang-format==18.1.5 clang-format==18.1.5
# type checking # type checking
mypy==1.9.0 mypy==1.11.1
types-PyYAML types-PyYAML
types-requests types-requests
types-setuptools types-setuptools
# Mamba dependencies
mamba-ssm>=1.2.2
causal-conv1d>=1.2.0
...@@ -2,6 +2,6 @@ ...@@ -2,6 +2,6 @@
-r requirements-common.txt -r requirements-common.txt
# Dependencies for Neuron devices # Dependencies for Neuron devices
transformers-neuronx >= 0.9.0 transformers-neuronx >= 0.12.0
torch-neuronx >= 2.1.0 torch-neuronx >= 2.1.2
neuronx-cc neuronx-cc
# Common dependencies # Common dependencies
# -r requirements-common.txt -r requirements-common.txt
# TODO: remove temporary copy of all common dependencies once Optimum Intel supports Transformers >= 4.43.2
cmake >= 3.21
ninja # For faster builds.
psutil
sentencepiece # Required for LLaMA tokenizer.
numpy < 2.0.0
requests
tqdm
py-cpuinfo
transformers < 4.43
tokenizers >= 0.19.1 # Required for Llama 3.
fastapi
aiohttp
openai
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
pillow # Required for image processing
prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.10.3
outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
typing_extensions
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
pyzmq
# OpenVINO dependencies torch == 2.4.0 # should be aligned with "common" vLLM torch version
torch >= 2.1.2 openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention
openvino ~= 2024.3.0.dev
openvino-tokenizers[transformers] ~= 2024.3.0.0.dev optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version
optimum-intel[openvino] >= 1.18.1 optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git@main # latest optimum-intel is used to support latest transformers version
...@@ -8,3 +8,11 @@ botocore ...@@ -8,3 +8,11 @@ botocore
ray >= 2.10.0 ray >= 2.10.0
peft peft
pytest-asyncio pytest-asyncio
tensorizer>=2.9.0
setuptools_scm>=8
torch == 2.3.0
triton == 2.1.0
flash_attn == 2.6.1
xformers == 0.0.25
lmslim == 0.1.2
\ No newline at end of file
# Needed for Ray accelerated DAG tests
-r requirements-adag.txt
# testing # testing
pytest pytest
tensorizer>=2.9.0 tensorizer>=2.9.0
...@@ -11,17 +8,27 @@ pytest-shard ...@@ -11,17 +8,27 @@ pytest-shard
# testing utils # testing utils
awscli awscli
einops # required for MPT einops # required for MPT, qwen-vl and Mamba
httpx httpx
librosa # required for audio tests
opencv-python # required for video tests
peft peft
requests requests
ray ray[adag]==2.35
sentence-transformers # required for embedding sentence-transformers # required for embedding
compressed-tensors==0.4.0 # required for compressed-tensors soundfile # required for audio test
timm # required for internvl test timm # required for internvl test
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.4 # required for model evaluation test
# TODO: Add this after fully implementing llava(mantis)
# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test
# Benchmarking # Benchmarking
aiohttp aiohttp
# quantization # quantization
bitsandbytes==0.42.0 bitsandbytes>=0.44.0
\ No newline at end of file buildkite-test-collector==0.1.8
...@@ -4,4 +4,4 @@ ...@@ -4,4 +4,4 @@
# Dependencies for TPU # Dependencies for TPU
# Currently, the TPU backend uses a nightly version of PyTorch XLA. # Currently, the TPU backend uses a nightly version of PyTorch XLA.
# You can install the dependencies in Dockerfile.tpu. # You can install the dependencies in Dockerfile.tpu.
ray ray[default]
# Common dependencies # Common dependencies
-r requirements-common.txt -r requirements-common.txt
setuptools < 70.0.0 # IPEX's torch have some dependency. to be removed. ray >= 2.9
cmake>=3.26
torch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/torch-2.1.0.post1%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl ninja
intel_extension_for_pytorch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl packaging
oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.1.200%2Bxpu-cp310-cp310-linux_x86_64.whl setuptools-scm>=8
wheel
triton @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl jinja2
# Following pkgs retrieved from https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
torch == 2.3.1+cxx11.abi
intel-extension-for-pytorch == 2.3.110+xpu
oneccl_bind_pt == 2.3.100+xpu
triton-xpu == 3.0.0b2
...@@ -5,7 +5,7 @@ import os ...@@ -5,7 +5,7 @@ import os
import re import re
import subprocess import subprocess
import sys import sys
import warnings from pathlib import Path
from shutil import which from shutil import which
from typing import Dict, List from typing import Dict, List
...@@ -13,6 +13,7 @@ import torch ...@@ -13,6 +13,7 @@ import torch
from packaging.version import Version, parse from packaging.version import Version, parse
from setuptools import Extension, find_packages, setup from setuptools import Extension, find_packages, setup
from setuptools.command.build_ext import build_ext from setuptools.command.build_ext import build_ext
from setuptools_scm import get_version
from torch.utils.cpp_extension import CUDA_HOME from torch.utils.cpp_extension import CUDA_HOME
from typing import Optional, Union from typing import Optional, Union
...@@ -34,43 +35,18 @@ def load_module_from_path(module_name, path): ...@@ -34,43 +35,18 @@ def load_module_from_path(module_name, path):
ROOT_DIR = os.path.dirname(__file__) ROOT_DIR = os.path.dirname(__file__)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def embed_commit_hash():
    """Write the current commit hash to ``vllm/commit_id.py`` (best effort).

    The hash comes from the ``BUILDKITE_COMMIT`` environment variable when
    it is set (CI builds), otherwise from ``git rev-parse HEAD``. Any
    failure is reported as a ``RuntimeWarning`` instead of being raised.
    """
    try:
        sha = os.environ.get("BUILDKITE_COMMIT")
        if sha is None:
            # Not a CI build: ask git for the checked-out commit.
            sha = subprocess.check_output(["git", "rev-parse", "HEAD"],
                                          encoding="utf-8").strip()

        target = os.path.join(ROOT_DIR, "vllm", "commit_id.py")
        with open(target, "w", encoding="utf-8") as out:
            out.write(f'__commit__ = "{sha}"\n')
    except subprocess.CalledProcessError as err:
        warnings.warn(f"Failed to get commit hash:\n{err}",
                      RuntimeWarning,
                      stacklevel=2)
    except Exception as err:
        warnings.warn(f"Failed to embed commit hash:\n{err}",
                      RuntimeWarning,
                      stacklevel=2)
embed_commit_hash()
# cannot import envs directly because it depends on vllm, # cannot import envs directly because it depends on vllm,
# which is not installed yet # which is not installed yet
envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py')) envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
# vLLM only supports Linux platform if not sys.platform.startswith("linux"):
assert sys.platform.startswith( logger.warning(
"linux"), "vLLM only supports Linux platform (including WSL)." "vLLM only supports Linux platform (including WSL). "
"Building on %s, "
"so vLLM may not be able to run correctly", sys.platform)
VLLM_TARGET_DEVICE = "empty"
MAIN_CUDA_VERSION = "12.1" MAIN_CUDA_VERSION = "12.1"
...@@ -156,15 +132,8 @@ class cmake_build_ext(build_ext): ...@@ -156,15 +132,8 @@ class cmake_build_ext(build_ext):
default_cfg = "Debug" if self.debug else "RelWithDebInfo" default_cfg = "Debug" if self.debug else "RelWithDebInfo"
cfg = envs.CMAKE_BUILD_TYPE or default_cfg cfg = envs.CMAKE_BUILD_TYPE or default_cfg
# where .so files will be written, should be the same for all extensions
# that use the same CMakeLists.txt.
outdir = os.path.abspath(
os.path.dirname(self.get_ext_fullpath(ext.name)))
cmake_args = [ cmake_args = [
'-DCMAKE_BUILD_TYPE={}'.format(cfg), '-DCMAKE_BUILD_TYPE={}'.format(cfg),
'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={}'.format(outdir),
'-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY={}'.format(self.build_temp),
'-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE), '-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE),
] ]
...@@ -174,20 +143,27 @@ class cmake_build_ext(build_ext): ...@@ -174,20 +143,27 @@ class cmake_build_ext(build_ext):
if is_sccache_available(): if is_sccache_available():
cmake_args += [ cmake_args += [
'-DCMAKE_C_COMPILER_LAUNCHER=sccache',
'-DCMAKE_CXX_COMPILER_LAUNCHER=sccache', '-DCMAKE_CXX_COMPILER_LAUNCHER=sccache',
'-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache', '-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache',
'-DCMAKE_C_COMPILER_LAUNCHER=sccache', '-DCMAKE_HIP_COMPILER_LAUNCHER=sccache',
] ]
elif is_ccache_available(): elif is_ccache_available():
cmake_args += [ cmake_args += [
'-DCMAKE_C_COMPILER_LAUNCHER=ccache',
'-DCMAKE_CXX_COMPILER_LAUNCHER=ccache', '-DCMAKE_CXX_COMPILER_LAUNCHER=ccache',
'-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache', '-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache',
'-DCMAKE_HIP_COMPILER_LAUNCHER=ccache',
] ]
# Pass the python executable to cmake so it can find an exact # Pass the python executable to cmake so it can find an exact
# match. # match.
cmake_args += ['-DVLLM_PYTHON_EXECUTABLE={}'.format(sys.executable)] cmake_args += ['-DVLLM_PYTHON_EXECUTABLE={}'.format(sys.executable)]
# Pass the python path to cmake so it can reuse the build dependencies
# on subsequent calls to python.
cmake_args += ['-DVLLM_PYTHON_PATH={}'.format(":".join(sys.path))]
# #
# Setup parallelism and build tool # Setup parallelism and build tool
# #
...@@ -221,10 +197,12 @@ class cmake_build_ext(build_ext): ...@@ -221,10 +197,12 @@ class cmake_build_ext(build_ext):
os.makedirs(self.build_temp) os.makedirs(self.build_temp)
targets = [] targets = []
target_name = lambda s: remove_prefix(remove_prefix(s, "vllm."),
"vllm_flash_attn.")
# Build all the extensions # Build all the extensions
for ext in self.extensions: for ext in self.extensions:
self.configure(ext) self.configure(ext)
targets.append(remove_prefix(ext.name, "vllm.")) targets.append(target_name(ext.name))
num_jobs, _ = self.compute_num_jobs() num_jobs, _ = self.compute_num_jobs()
...@@ -237,6 +215,47 @@ class cmake_build_ext(build_ext): ...@@ -237,6 +215,47 @@ class cmake_build_ext(build_ext):
subprocess.check_call(["cmake", *build_args], cwd=self.build_temp) subprocess.check_call(["cmake", *build_args], cwd=self.build_temp)
# Install the libraries
for ext in self.extensions:
# Install the extension into the proper location
outdir = Path(self.get_ext_fullpath(ext.name)).parent.absolute()
# Skip if the install directory is the same as the build directory
if outdir == self.build_temp:
continue
# CMake appends the extension prefix to the install path,
# and outdir already contains that prefix, so we need to remove it.
prefix = outdir
for i in range(ext.name.count('.')):
prefix = prefix.parent
# prefix here should actually be the same for all components
install_args = [
"cmake", "--install", ".", "--prefix", prefix, "--component",
target_name(ext.name)
]
subprocess.check_call(install_args, cwd=self.build_temp)
def run(self):
    """Build the extensions, then copy flash-attn Python files locally.

    After the standard ``build_ext`` run, every
    ``vllm/vllm_flash_attn/*.py`` file found under ``self.build_lib`` is
    copied into ``vllm/vllm_flash_attn`` in the current directory so that
    it can be included in the editable build.
    """
    super().run()

    import glob
    pattern = os.path.join(self.build_lib, "vllm", "vllm_flash_attn", "*.py")
    for built_py in glob.glob(pattern):
        target = os.path.join("vllm/vllm_flash_attn",
                              os.path.basename(built_py))
        print(f"Copying {built_py} to {target}")
        self.copy_file(built_py, target)
def _no_device() -> bool:
    """Return True when ``VLLM_TARGET_DEVICE`` is the string ``"empty"``."""
    target = VLLM_TARGET_DEVICE
    return target == "empty"
def _is_cuda() -> bool: def _is_cuda() -> bool:
has_cuda = torch.version.cuda is not None has_cuda = torch.version.cuda is not None
...@@ -279,7 +298,7 @@ def _build_custom_ops() -> bool: ...@@ -279,7 +298,7 @@ def _build_custom_ops() -> bool:
def _build_core_ext() -> bool: def _build_core_ext() -> bool:
return not _is_neuron() and not _is_tpu() return not (_is_neuron() or _is_tpu() or _is_openvino() or _is_xpu())
def get_hipcc_rocm_version(): def get_hipcc_rocm_version():
...@@ -320,7 +339,7 @@ def get_neuronxcc_version(): ...@@ -320,7 +339,7 @@ def get_neuronxcc_version():
# Return the version string # Return the version string
return match.group(1) return match.group(1)
else: else:
raise RuntimeError("Could not find HIP version in the output") raise RuntimeError("Could not find Neuron version in the output")
def get_nvcc_cuda_version() -> Version: def get_nvcc_cuda_version() -> Version:
...@@ -341,19 +360,6 @@ def get_path(*filepath) -> str: ...@@ -341,19 +360,6 @@ def get_path(*filepath) -> str:
return os.path.join(ROOT_DIR, *filepath) return os.path.join(ROOT_DIR, *filepath)
def find_version(filepath: str) -> str:
    """Extract version information from the given filepath.

    Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py

    Args:
        filepath: Path to a Python source file containing a line of the
            form ``__version__ = "<version>"``.

    Returns:
        The quoted version string from the first matching line.

    Raises:
        RuntimeError: If no ``__version__`` assignment is found.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's locale encoding.
    with open(filepath, encoding="utf-8") as fp:
        version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
                                  fp.read(), re.M)
    if version_match:
        return version_match.group(1)
    raise RuntimeError("Unable to find version string.")
def get_sha(root: Union[str, Path]) -> str: def get_sha(root: Union[str, Path]) -> str:
try: try:
return subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=root).decode('ascii').strip() return subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=root).decode('ascii').strip()
...@@ -364,13 +370,21 @@ def get_sha(root: Union[str, Path]) -> str: ...@@ -364,13 +370,21 @@ def get_sha(root: Union[str, Path]) -> str:
def get_version_add(sha: Optional[str] = None) -> str: def get_version_add(sha: Optional[str] = None) -> str:
vllm_root = os.path.dirname(os.path.abspath(__file__)) vllm_root = os.path.dirname(os.path.abspath(__file__))
add_version_path = os.path.join(os.path.join(vllm_root, "vllm"), "version.py") add_version_path = os.path.join(os.path.join(vllm_root, "vllm"), "version.py")
major, minor, _ = torch.__version__.split('.')
if add_git_version: if add_git_version:
if sha != 'Unknown': if sha != 'Unknown':
if sha is None: if sha is None:
sha = get_sha(vllm_root) sha = get_sha(vllm_root)
version = 'das.opt1' + sha[:7] # if (major, minor) == ('2', '1'):
# version = 'das.opt1.' + sha[:7]
if (major, minor) == ('2', '3'):
version = 'das.opt1.' + sha[:7]
else: else:
version = 'das.opt1' # if (major, minor) == ('2', '1'):
# version = 'das.opt1'
if (major, minor) == ('2', '3'):
version = 'das.opt1'
# dtk version # dtk version
if os.getenv("ROCM_PATH"): if os.getenv("ROCM_PATH"):
...@@ -382,20 +396,20 @@ def get_version_add(sha: Optional[str] = None) -> str: ...@@ -382,20 +396,20 @@ def get_version_add(sha: Optional[str] = None) -> str:
version += ".dtk" + rocm_version version += ".dtk" + rocm_version
new_version_content = f""" new_version_content = f"""
import warnings
try: try:
import vllm.commit_id __version__ = "0.6.3.post1"
__commit__ = vllm.commit_id.__commit__ __version_tuple__ = (0, 6, 3)
__dcu_version__ = f'0.6.3.post1+{version}'
from vllm.version import __version__, __version_tuple__, __dcu_version__
except Exception as e: except Exception as e:
import warnings
warnings.warn(f"Failed to read commit hash:\\n + str(e)", warnings.warn(f"Failed to read commit hash:\\n + str(e)",
RuntimeWarning, RuntimeWarning,
stacklevel=2) stacklevel=2)
__commit__ = "COMMIT_HASH_PLACEHOLDER" __version__ = "dev"
__version_tuple__ = (0, 0, __version__)
__version__ = "0.5.4"
__dcu_version__ = f'0.5.4+{version}'
""" """
with open(add_version_path, encoding="utf-8",mode="w") as file: with open(add_version_path, encoding="utf-8",mode="w") as file:
...@@ -412,34 +426,44 @@ def get_version(): ...@@ -412,34 +426,44 @@ def get_version():
def get_vllm_version() -> str: def get_vllm_version() -> str:
# version = find_version(get_path("vllm", "version.py")) if not _is_hip():
version = get_version(
write_to="vllm/_version.py", # TODO: move this to pyproject.toml
)
if _is_cuda(): sep = "+" if "+" not in version else "." # dev versions might contain +
if _no_device():
if envs.VLLM_TARGET_DEVICE == "empty":
version += f"{sep}empty"
elif _is_cuda():
cuda_version = str(get_nvcc_cuda_version()) cuda_version = str(get_nvcc_cuda_version())
if cuda_version != MAIN_CUDA_VERSION: if cuda_version != MAIN_CUDA_VERSION:
cuda_version_str = cuda_version.replace(".", "")[:3] cuda_version_str = cuda_version.replace(".", "")[:3]
version += f"+cu{cuda_version_str}" # skip this for source tarball, required for pypi
if "sdist" not in sys.argv:
version += f"{sep}cu{cuda_version_str}"
elif _is_hip(): elif _is_hip():
# Get the HIP version # Get the HIP version
# hipcc_version = get_hipcc_rocm_version() # hipcc_version = get_hipcc_rocm_version()
# if hipcc_version != MAIN_CUDA_VERSION: # if hipcc_version != MAIN_CUDA_VERSION:
# rocm_version_str = hipcc_version.replace(".", "")[:3] # rocm_version_str = hipcc_version.replace(".", "")[:3]
# version += f"+rocm{rocm_version_str}" # version += f"{sep}rocm{rocm_version_str}"
version = get_version() version = get_version()
elif _is_neuron(): elif _is_neuron():
# Get the Neuron version # Get the Neuron version
neuron_version = str(get_neuronxcc_version()) neuron_version = str(get_neuronxcc_version())
if neuron_version != MAIN_CUDA_VERSION: if neuron_version != MAIN_CUDA_VERSION:
neuron_version_str = neuron_version.replace(".", "")[:3] neuron_version_str = neuron_version.replace(".", "")[:3]
version += f"+neuron{neuron_version_str}" version += f"{sep}neuron{neuron_version_str}"
elif _is_openvino(): elif _is_openvino():
version += "+openvino" version += f"{sep}openvino"
elif _is_tpu(): elif _is_tpu():
version += "+tpu" version += f"{sep}tpu"
elif _is_cpu(): elif _is_cpu():
version += "+cpu" version += f"{sep}cpu"
elif _is_xpu(): elif _is_xpu():
version += "+xpu" version += f"{sep}xpu"
else: else:
raise RuntimeError("Unknown runtime environment") raise RuntimeError("Unknown runtime environment")
...@@ -465,11 +489,15 @@ def get_requirements() -> List[str]: ...@@ -465,11 +489,15 @@ def get_requirements() -> List[str]:
for line in requirements: for line in requirements:
if line.startswith("-r "): if line.startswith("-r "):
resolved_requirements += _read_requirements(line.split()[1]) resolved_requirements += _read_requirements(line.split()[1])
elif line.startswith("--"):
continue
else: else:
resolved_requirements.append(line) resolved_requirements.append(line)
return resolved_requirements return resolved_requirements
if _is_cuda(): if _no_device():
requirements = _read_requirements("requirements-cuda.txt")
elif _is_cuda():
requirements = _read_requirements("requirements-cuda.txt") requirements = _read_requirements("requirements-cuda.txt")
cuda_major, cuda_minor = torch.version.cuda.split(".") cuda_major, cuda_minor = torch.version.cuda.split(".")
modified_requirements = [] modified_requirements = []
...@@ -508,16 +536,26 @@ if _build_core_ext(): ...@@ -508,16 +536,26 @@ if _build_core_ext():
if _is_cuda() or _is_hip(): if _is_cuda() or _is_hip():
ext_modules.append(CMakeExtension(name="vllm._moe_C")) ext_modules.append(CMakeExtension(name="vllm._moe_C"))
# if _is_hip():
# ext_modules.append(CMakeExtension(name="vllm._rocm_C"))
if _is_cuda():
ext_modules.append(
CMakeExtension(name="vllm.vllm_flash_attn.vllm_flash_attn_c"))
if _build_custom_ops(): if _build_custom_ops():
ext_modules.append(CMakeExtension(name="vllm._C")) ext_modules.append(CMakeExtension(name="vllm._C"))
package_data = { package_data = {
"vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"] "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json", "benchmarks/*.py"]
} }
if envs.VLLM_USE_PRECOMPILED: if envs.VLLM_USE_PRECOMPILED:
ext_modules = [] ext_modules = []
package_data["vllm"].append("*.so") package_data["vllm"].append("*.so")
if _no_device():
ext_modules = []
setup( setup(
name="vllm", name="vllm",
version=get_vllm_version(), version=get_vllm_version(),
...@@ -539,7 +577,11 @@ setup( ...@@ -539,7 +577,11 @@ setup(
"Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.12",
"License :: OSI Approved :: Apache Software License", "License :: OSI Approved :: Apache Software License",
"Intended Audience :: Developers",
"Intended Audience :: Information Technology",
"Intended Audience :: Science/Research",
"Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Scientific/Engineering :: Information Analysis",
], ],
packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples", packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples",
"tests*")), "tests*")),
...@@ -548,6 +590,7 @@ setup( ...@@ -548,6 +590,7 @@ setup(
ext_modules=ext_modules, ext_modules=ext_modules,
extras_require={ extras_require={
"tensorizer": ["tensorizer>=2.9.0"], "tensorizer": ["tensorizer>=2.9.0"],
"audio": ["librosa", "soundfile"] # Required for audio processing
}, },
cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {}, cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {},
package_data=package_data, package_data=package_data,
......
"""vllm.entrypoints.api_server with some extra logging for testing.""" """vllm.entrypoints.api_server with some extra logging for testing."""
from typing import Any, Dict from typing import Any, Dict, Iterable
import uvicorn import uvicorn
from fastapi.responses import JSONResponse, Response from fastapi.responses import JSONResponse, Response
...@@ -18,9 +18,10 @@ class AsyncLLMEngineWithStats(AsyncLLMEngine): ...@@ -18,9 +18,10 @@ class AsyncLLMEngineWithStats(AsyncLLMEngine):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self._num_aborts = 0 self._num_aborts = 0
async def abort(self, request_id: str) -> None: async def _engine_abort(self, request_ids: Iterable[str]):
await super().abort(request_id) ids = list(request_ids)
self._num_aborts += 1 self._num_aborts += len(ids)
await super()._engine_abort(ids)
def testing_stats(self) -> Dict[str, Any]: def testing_stats(self) -> Dict[str, Any]:
return {"num_aborted_requests": self._num_aborts} return {"num_aborted_requests": self._num_aborts}
......
...@@ -25,8 +25,7 @@ def _query_server_long(prompt: str) -> dict: ...@@ -25,8 +25,7 @@ def _query_server_long(prompt: str) -> dict:
@pytest.fixture @pytest.fixture
def api_server(tokenizer_pool_size: int, engine_use_ray: bool, def api_server(tokenizer_pool_size: int, worker_use_ray: bool):
worker_use_ray: bool):
script_path = Path(__file__).parent.joinpath( script_path = Path(__file__).parent.joinpath(
"api_server_async_engine.py").absolute() "api_server_async_engine.py").absolute()
commands = [ commands = [
...@@ -35,8 +34,7 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool, ...@@ -35,8 +34,7 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
"127.0.0.1", "--tokenizer-pool-size", "127.0.0.1", "--tokenizer-pool-size",
str(tokenizer_pool_size) str(tokenizer_pool_size)
] ]
if engine_use_ray:
commands.append("--engine-use-ray")
if worker_use_ray: if worker_use_ray:
commands.append("--worker-use-ray") commands.append("--worker-use-ray")
uvicorn_process = subprocess.Popen(commands) uvicorn_process = subprocess.Popen(commands)
...@@ -46,9 +44,8 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool, ...@@ -46,9 +44,8 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
@pytest.mark.parametrize("tokenizer_pool_size", [0, 2]) @pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
@pytest.mark.parametrize("worker_use_ray", [False, True]) @pytest.mark.parametrize("worker_use_ray", [False, True])
@pytest.mark.parametrize("engine_use_ray", [False, True]) def test_api_server(api_server, tokenizer_pool_size: int,
def test_api_server(api_server, tokenizer_pool_size: int, worker_use_ray: bool, worker_use_ray: bool):
engine_use_ray: bool):
""" """
Run the API server and test it. Run the API server and test it.
......
import asyncio import asyncio
import os
import uuid
from asyncio import CancelledError
from copy import copy
from dataclasses import dataclass from dataclasses import dataclass
from typing import List, Optional
import pytest import pytest
import pytest_asyncio
import torch import torch
from vllm import SamplingParams from vllm import SamplingParams
from vllm.config import ParallelConfig from vllm.config import ParallelConfig
from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
from vllm.outputs import RequestOutput as RealRequestOutput
from vllm.sampling_params import RequestOutputKind
from ..conftest import cleanup
from ..utils import wait_for_gpu_memory_to_clear from ..utils import wait_for_gpu_memory_to_clear
...@@ -17,6 +26,11 @@ class RequestOutput: ...@@ -17,6 +26,11 @@ class RequestOutput:
finished: bool = False finished: bool = False
@dataclass
class MockModelConfig:
    """Minimal stand-in for a model config used by the mock engine.

    Only exposes ``use_async_output_proc``.
    """

    # Deliberately left unannotated: this is a plain class attribute,
    # not a dataclass field.
    use_async_output_proc = True
class MockEngine: class MockEngine:
def __init__(self): def __init__(self):
...@@ -26,6 +40,7 @@ class MockEngine: ...@@ -26,6 +40,7 @@ class MockEngine:
self.request_id = None self.request_id = None
# Ugly, remove dependency when possible # Ugly, remove dependency when possible
self.parallel_config = ParallelConfig(1, 1, False) self.parallel_config = ParallelConfig(1, 1, False)
self.model_config = MockModelConfig()
async def step_async(self, virtual_engine): async def step_async(self, virtual_engine):
# PP size is 1, ignore virtual engine # PP size is 1, ignore virtual engine
...@@ -66,24 +81,24 @@ class MockEngine: ...@@ -66,24 +81,24 @@ class MockEngine:
class MockAsyncLLMEngine(AsyncLLMEngine): class MockAsyncLLMEngine(AsyncLLMEngine):
_engine_class = MockEngine
def _init_engine(self, *args, **kwargs):
return MockEngine()
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_new_requests_event(): async def test_new_requests_event():
engine = MockAsyncLLMEngine(worker_use_ray=False, engine_use_ray=False) params = SamplingParams()
engine = MockAsyncLLMEngine()
engine.start_background_loop() engine.start_background_loop()
await asyncio.sleep(0.01) await asyncio.sleep(0.01)
assert engine.engine.step_calls == 0 assert engine.engine.step_calls == 0
await engine.add_request("1", "", None) await engine.add_request("1", "", params)
await asyncio.sleep(0.01) await asyncio.sleep(0.01)
assert engine.engine.add_request_calls == 1 assert engine.engine.add_request_calls == 1
assert engine.engine.step_calls == 1 assert engine.engine.step_calls == 1
await engine.add_request("2", "", None) await engine.add_request("2", "", params)
engine.engine.generate("2") engine.engine.generate("2")
await asyncio.sleep(0) await asyncio.sleep(0)
await asyncio.sleep(0) await asyncio.sleep(0)
...@@ -98,7 +113,7 @@ async def test_new_requests_event(): ...@@ -98,7 +113,7 @@ async def test_new_requests_event():
await asyncio.sleep(0.001) await asyncio.sleep(0.001)
assert engine.engine.step_calls == old_step_calls assert engine.engine.step_calls == old_step_calls
await engine.add_request("3", "", None) await engine.add_request("3", "", params)
await asyncio.sleep(0.01) await asyncio.sleep(0.01)
assert engine.engine.add_request_calls == 3 assert engine.engine.add_request_calls == 3
assert engine.engine.step_calls == old_step_calls + 1 assert engine.engine.step_calls == old_step_calls + 1
...@@ -106,39 +121,254 @@ async def test_new_requests_event(): ...@@ -106,39 +121,254 @@ async def test_new_requests_event():
assert engine.engine.add_request_calls == 3 assert engine.engine.add_request_calls == 3
assert engine.engine.step_calls == old_step_calls + 1 assert engine.engine.step_calls == old_step_calls + 1
engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True) engine = MockAsyncLLMEngine()
assert engine.get_model_config() is not None assert engine.get_model_config() is not None
assert engine.get_tokenizer() is not None assert engine.get_tokenizer() is not None
assert engine.get_decoding_config() is not None assert engine.get_decoding_config() is not None
def test_asyncio_run(): def start_engine():
wait_for_gpu_memory_to_clear( wait_for_gpu_memory_to_clear(
devices=list(range(torch.cuda.device_count())), devices=list(range(torch.cuda.device_count())),
threshold_bytes=2 * 2**30, threshold_bytes=2 * 2**30,
timeout_s=60, timeout_s=60,
) )
engine = AsyncLLMEngine.from_engine_args( num_scheduler_steps = int(os.getenv("NUM_SCHEDULER_STEPS", "1"))
AsyncEngineArgs(model="facebook/opt-125m")) print(f"Starting engine with num_scheduler_steps={num_scheduler_steps}")
return AsyncLLMEngine.from_engine_args(
AsyncEngineArgs(model="facebook/opt-125m",
enforce_eager=True,
num_scheduler_steps=num_scheduler_steps))
def uid() -> str:
    """Return a freshly generated random UUID (version 4) as a string."""
    fresh_id = uuid.uuid4()
    return str(fresh_id)
@pytest_asyncio.fixture(scope="module")
async def async_engine():
    """Module-scoped fixture that yields the engine built by
    ``start_engine``.

    ``start_engine`` is dispatched through the event loop's default
    executor.  On teardown the engine's background loop is shut down, the
    local reference is dropped, and ``cleanup()`` runs after a short pause.
    """
    loop = asyncio.get_event_loop()
    engine = await loop.run_in_executor(None, start_engine)
    try:
        yield engine
    finally:
        engine.shutdown_background_loop()
        del engine
        # Short pause before the final cleanup call.
        await asyncio.sleep(0.1)
        cleanup()
@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
    """Always return ``False``.

    This lets the async engine fixture be shared between the tests in this
    module.
    """
    do_cleanup = False
    return do_cleanup
@pytest.mark.asyncio(scope="module")
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
async def test_asyncio_run(async_engine, stop):
scheduler_config = await async_engine.get_scheduler_config()
num_scheduler_steps = scheduler_config.num_scheduler_steps
async def run(prompt: str): async def run(prompt: str):
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=0, temperature=0,
max_tokens=32, max_tokens=32,
min_tokens=32,
stop=stop,
) )
async for output in engine.generate(prompt, output_count = 0
sampling_params, final_output = None
request_id=prompt): async for output in async_engine.generate(prompt,
sampling_params,
request_id=uid()):
output_count += 1
final_output = output final_output = output
return final_output return final_output, output_count
async def generate(): results = await asyncio.gather(
return await asyncio.gather( run("test0"),
run("test0"), run("test0"),
run("test1"), )
)
results = asyncio.run(generate())
assert len(results) == 2 assert len(results) == 2
first, second = results
# remove nondeterministic fields for comparison
first[0].metrics = None
second[0].metrics = None
first[0].request_id = None
second[0].request_id = None
assert str(first) == str(second)
output_count = results[0][1]
if num_scheduler_steps == 1:
assert output_count == 32
else:
assert 1 < output_count < 32
@pytest.mark.asyncio(scope="module")
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
async def test_output_kinds(async_engine, stop):
    """Check that each ``output_kind`` behaves as expected and that the
    results are equivalent across the different kinds."""
    scheduler_config = await async_engine.get_scheduler_config()
    num_scheduler_steps = scheduler_config.num_scheduler_steps

    base_params = SamplingParams(
        temperature=0,
        max_tokens=32,
        min_tokens=32,
        stop=stop,
    )

    async def run(prompt: str, kind: RequestOutputKind):
        """Generate with the given output kind.

        Returns the last output's prompt ids, token ids and text, plus the
        number of outputs received.
        """
        params = copy(base_params)
        params.output_kind = kind

        seen = 0
        last = None
        stream = async_engine.generate(prompt, params, request_id=uid())
        async for last in stream:
            seen += 1

        assert last is not None
        assert last.finished

        return (last.prompt_token_ids, last.outputs[0].token_ids,
                last.outputs[0].text, seen)

    async def run_deltas(prompt: str):
        """Generate in DELTA mode and stitch the increments back together.

        Returns the prompt ids, accumulated token ids, accumulated text and
        the number of outputs received.
        """
        params = copy(base_params)
        params.output_kind = RequestOutputKind.DELTA

        prompt_tokens = None
        output_tokens: List[int] = []
        output_text = ""
        seen = 0
        last = None
        stream = async_engine.generate(prompt, params, request_id=uid())
        async for output in stream:
            delta = output.outputs[0]
            token_ids = delta.token_ids
            text = delta.text
            last = output

            # Prompt ids must be present iff no output tokens have been
            # received so far.
            if not output_tokens:
                assert output.prompt_token_ids
                prompt_tokens = output.prompt_token_ids
            else:
                assert 1 <= len(token_ids) <= num_scheduler_steps
                assert stop or text
                assert not output.prompt_token_ids

            output_tokens.extend(token_ids)
            output_text += text
            seen += 1

        assert last is not None
        assert last.finished

        return prompt_tokens, output_tokens, output_text, seen

    common_prompt = "common input prompt"
    results = await asyncio.gather(
        run(common_prompt, RequestOutputKind.CUMULATIVE),
        run(common_prompt, RequestOutputKind.FINAL_ONLY),
        run_deltas(common_prompt),
    )

    # All three runs must agree on prompt ids, text and token ids.
    prompt_set = {tuple(prompt_ids) for prompt_ids, _, _, _ in results}
    assert len(prompt_set) == 1

    text_set = {text for _, _, text, _ in results}
    assert len(text_set) == 1

    tokens_set = {tuple(ids) for _, ids, _, _ in results}
    assert len(tokens_set) == 1

    cumulative, final, deltas = results
    cumulative_count = cumulative[3]
    final_count = final[3]
    delta_count = deltas[3]

    # Output message counts.
    assert cumulative_count == delta_count

    if num_scheduler_steps == 1:
        assert cumulative_count == 32
    else:
        assert 1 < cumulative_count < 32

    assert final_count == 1
@pytest.mark.asyncio(scope="module")
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
async def test_cancellation(async_engine, stop):
    """Abort a request part-way through streaming and expect the generator
    to raise ``CancelledError`` right at the abort point."""
    scheduler_config = await async_engine.get_scheduler_config()
    num_scheduler_steps = scheduler_config.num_scheduler_steps

    sampling_params = SamplingParams(
        temperature=0,
        min_tokens=13,
        max_tokens=13,
        stop=stop,
    )

    # Abort after the fifth output when single-stepping, otherwise after
    # the very first one.
    if num_scheduler_steps == 1:
        stop_at = 5
    else:
        stop_at = 1

    request_id = uid()
    received = 0
    with pytest.raises(CancelledError):
        stream = async_engine.generate("test2",
                                       sampling_params,
                                       request_id=request_id)
        async for output in stream:
            assert not output.finished
            received += 1
            if received == stop_at:
                await async_engine.abort(request_id)

    # No further outputs may have been delivered after the abort.
    assert received == stop_at
@pytest.mark.asyncio(scope="module")
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
async def test_delayed_generator(async_engine, stop):
    """Consume the output stream late and check all 10 outputs still
    arrive, with only the last one marked finished."""
    scheduler_config = await async_engine.get_scheduler_config()

    if scheduler_config.num_scheduler_steps != 1:
        pytest.skip("no need to test this one with multistep")

    sampling_params = SamplingParams(
        temperature=0,
        min_tokens=10,
        max_tokens=10,
        stop=stop,
    )

    stream = async_engine.generate("test3", sampling_params, request_id=uid())
    received = 0
    final_output: Optional[RealRequestOutput] = None
    async for output in stream:
        final_output = output
        if received == 0:
            # Give generation time to complete before the remaining
            # messages are consumed.
            await asyncio.sleep(1)

        if received < 9:
            assert not output.finished
        received += 1

    assert received == 10
    assert final_output is not None
    assert len(final_output.outputs[0].token_ids) == 10
    assert final_output.finished
import openai # use the official client for correctness check
import pytest
from ..utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "facebook/opt-125m"
@pytest.fixture(scope="module")
def server():
    """Module-scoped fixture yielding a ``RemoteOpenAIServer`` for
    ``MODEL_NAME`` started with ``--engine-use-ray``."""
    cli_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype", "float16",
        "--max-model-len", "2048",
        "--enforce-eager",
        "--engine-use-ray",
    ]

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
@pytest.fixture(scope="module")
def client(server):
    """Module-scoped fixture returning the async client obtained from the
    ``server`` fixture."""
    async_client = server.get_async_client()
    return async_client
@pytest.mark.asyncio
async def test_check_models(client: openai.AsyncOpenAI):
    """The model listing must start with ``MODEL_NAME`` and every listed
    model must have ``MODEL_NAME`` as its root."""
    listing = await client.models.list()
    entries = listing.data
    first_entry = entries[0]
    assert first_entry.id == MODEL_NAME
    assert all(entry.root == MODEL_NAME for entry in entries)
@pytest.mark.asyncio
async def test_single_completion(client: openai.AsyncOpenAI):
    """Request one completion from a text prompt and one from a token-id
    prompt, checking the shape of each response."""
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt="Hello, my name is",
        max_tokens=5,
        temperature=0.0,
    )
    assert completion.id is not None
    assert len(completion.choices) == 1
    only_choice = completion.choices[0]
    assert len(only_choice.text) >= 5
    assert only_choice.finish_reason == "length"
    assert completion.usage == openai.types.CompletionUsage(
        completion_tokens=5, prompt_tokens=6, total_tokens=11)

    # test using token IDs
    token_prompt = [0, 0, 0, 0, 0]
    completion = await client.completions.create(model=MODEL_NAME,
                                                 prompt=token_prompt,
                                                 max_tokens=5,
                                                 temperature=0.0)
    assert len(completion.choices[0].text) >= 5
@pytest.mark.asyncio
async def test_single_chat_session(client: openai.AsyncOpenAI):
    """Run a two-turn chat: one completion with logprobs, then a follow-up
    turn that includes the assistant's first reply."""
    system_turn = {"role": "system", "content": "you are a helpful assistant"}
    user_turn = {"role": "user", "content": "what is 1+1?"}
    messages = [system_turn, user_turn]

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=10,
        logprobs=True,
        top_logprobs=5,
    )
    assert chat_completion.id is not None
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=13, total_tokens=23)

    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(model=MODEL_NAME,
                                                           messages=messages,
                                                           max_tokens=10)
    message = chat_completion.choices[0].message
    # NOTE(review): `len(...) >= 0` is always true, so only the
    # `is not None` half of this assertion can fail.
    assert message.content is not None and len(message.content) >= 0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment