Commit 539aa992 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.2' into v0.6.2-dev

parents 93872128 7193774b
......@@ -39,6 +39,33 @@ outputs = llm.chat(conversation,
use_tqdm=False)
print_outputs(outputs)
# You can run batch inference with llm.chat API
conversation = [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": "Hello"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "Write an essay about the importance of higher education.",
},
]
conversations = [conversation for _ in range(10)]
# We turn on tqdm progress bar to verify it's indeed running batch inference
outputs = llm.chat(messages=conversations,
sampling_params=sampling_params,
use_tqdm=True)
print_outputs(outputs)
# A chat template can be optionally supplied.
# If not, the model will use its default chat template.
......
......@@ -14,7 +14,8 @@ from vllm.utils import FlexibleArgumentParser
# LLaVA-1.5
def run_llava(question):
def run_llava(question, modality):
assert modality == "image"
prompt = f"USER: <image>\n{question}\nASSISTANT:"
......@@ -24,7 +25,8 @@ def run_llava(question):
# LLaVA-1.6/LLaVA-NeXT
def run_llava_next(question):
def run_llava_next(question, modality):
assert modality == "image"
prompt = f"[INST] <image>\n{question} [/INST]"
llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192)
......@@ -34,15 +36,35 @@ def run_llava_next(question):
# LlaVA-NeXT-Video
# Currently only support for video input
def run_llava_next_video(question):
def run_llava_next_video(question, modality):
assert modality == "video"
prompt = f"USER: <video>\n{question} ASSISTANT:"
llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192)
stop_token_ids = None
return llm, prompt, stop_token_ids
# LLaVA-OneVision
def run_llava_onevision(question, modality):
if modality == "video":
prompt = f"<|im_start|>user <video>\n{question}<|im_end|> \
<|im_start|>assistant\n"
elif modality == "image":
prompt = f"<|im_start|>user <image>\n{question}<|im_end|> \
<|im_start|>assistant\n"
llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
max_model_len=32768)
stop_token_ids = None
return llm, prompt, stop_token_ids
# Fuyu
def run_fuyu(question):
def run_fuyu(question, modality):
assert modality == "image"
prompt = f"{question}\n"
llm = LLM(model="adept/fuyu-8b")
......@@ -51,7 +73,8 @@ def run_fuyu(question):
# Phi-3-Vision
def run_phi3v(question):
def run_phi3v(question, modality):
assert modality == "image"
prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n" # noqa: E501
# Note: The default setting of max_num_seqs (256) and
......@@ -60,17 +83,32 @@ def run_phi3v(question):
# In this example, we override max_num_seqs to 5 while
# keeping the original context length of 128k.
# num_crops is an override kwarg to the multimodal image processor;
# For some models, e.g., Phi-3.5-vision-instruct, it is recommended
# to use 16 for single frame scenarios, and 4 for multi-frame.
#
# Generally speaking, a larger value for num_crops results in more
# tokens per image instance, because it may scale the image more in
# the image preprocessing. Some references in the model docs and the
# formula for image tokens after the preprocessing
# transform can be found below.
#
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
llm = LLM(
model="microsoft/Phi-3-vision-128k-instruct",
trust_remote_code=True,
max_num_seqs=5,
mm_processor_kwargs={"num_crops": 16},
)
stop_token_ids = None
return llm, prompt, stop_token_ids
# PaliGemma
def run_paligemma(question):
def run_paligemma(question, modality):
assert modality == "image"
# PaliGemma has special prompt format for VQA
prompt = "caption en"
......@@ -80,7 +118,8 @@ def run_paligemma(question):
# Chameleon
def run_chameleon(question):
def run_chameleon(question, modality):
assert modality == "image"
prompt = f"{question}<image>"
llm = LLM(model="facebook/chameleon-7b")
......@@ -89,7 +128,8 @@ def run_chameleon(question):
# MiniCPM-V
def run_minicpmv(question):
def run_minicpmv(question, modality):
assert modality == "image"
# 2.0
# The official repo doesn't work yet, so we need to use a fork for now
......@@ -129,7 +169,9 @@ def run_minicpmv(question):
# InternVL
def run_internvl(question):
def run_internvl(question, modality):
assert modality == "image"
model_name = "OpenGVLab/InternVL2-2B"
llm = LLM(
......@@ -155,7 +197,8 @@ def run_internvl(question):
# BLIP-2
def run_blip2(question):
def run_blip2(question, modality):
assert modality == "image"
# BLIP-2 prompt format is inaccurate on HuggingFace model repository.
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
......@@ -166,7 +209,8 @@ def run_blip2(question):
# Qwen
def run_qwen_vl(question):
def run_qwen_vl(question, modality):
assert modality == "image"
llm = LLM(
model="Qwen/Qwen-VL",
......@@ -180,7 +224,9 @@ def run_qwen_vl(question):
# Qwen2-VL
def run_qwen2_vl(question):
def run_qwen2_vl(question, modality):
assert modality == "image"
model_name = "Qwen/Qwen2-VL-7B-Instruct"
llm = LLM(
......@@ -196,10 +242,34 @@ def run_qwen2_vl(question):
return llm, prompt, stop_token_ids
# LLama
def run_mllama(question, modality):
assert modality == "image"
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
# Note: The default setting of max_num_seqs (256) and
# max_model_len (131072) for this model may cause OOM.
# You may lower either to run this example on lower-end GPUs.
# The configuration below has been confirmed to launch on a
# single H100 GPU.
llm = LLM(
model=model_name,
max_num_seqs=16,
enforce_eager=True,
)
prompt = f"<|image|><|begin_of_text|>{question}"
stop_token_ids = None
return llm, prompt, stop_token_ids
model_example_map = {
"llava": run_llava,
"llava-next": run_llava_next,
"llava-next-video": run_llava_next_video,
"llava-onevision": run_llava_onevision,
"fuyu": run_fuyu,
"phi3_v": run_phi3v,
"paligemma": run_paligemma,
......@@ -209,6 +279,7 @@ model_example_map = {
"internvl_chat": run_internvl,
"qwen_vl": run_qwen_vl,
"qwen2_vl": run_qwen2_vl,
"mllama": run_mllama,
}
......@@ -255,7 +326,7 @@ def main(args):
data = mm_input["data"]
question = mm_input["question"]
llm, prompt, stop_token_ids = model_example_map[model](question)
llm, prompt, stop_token_ids = model_example_map[model](question, modality)
# We set temperature to 0.2 so that outputs can be different
# even when all prompts are identical when running batch inference.
......@@ -306,6 +377,7 @@ if __name__ == "__main__":
parser.add_argument('--modality',
type=str,
default="image",
choices=['image', 'video'],
help='Modality of the input.')
parser.add_argument('--num-frames',
type=int,
......
......@@ -4,8 +4,9 @@ multi-image input on vision language models, using the chat template defined
by the model.
"""
from argparse import Namespace
from typing import List
from typing import List, NamedTuple, Optional
from PIL.Image import Image
from transformers import AutoProcessor, AutoTokenizer
from vllm import LLM, SamplingParams
......@@ -19,7 +20,15 @@ IMAGE_URLS = [
]
def load_qwenvl_chat(question: str, image_urls: List[str]):
class ModelRequestData(NamedTuple):
llm: LLM
prompt: str
stop_token_ids: Optional[List[str]]
image_data: List[Image]
chat_template: Optional[str]
def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData:
model_name = "Qwen/Qwen-VL-Chat"
llm = LLM(
model=model_name,
......@@ -48,24 +57,50 @@ def load_qwenvl_chat(question: str, image_urls: List[str]):
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
return llm, prompt, stop_token_ids, None, chat_template
return ModelRequestData(
llm=llm,
prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls],
chat_template=chat_template,
)
def load_phi3v(question: str, image_urls: List[str]):
def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
# num_crops is an override kwarg to the multimodal image processor;
# For some models, e.g., Phi-3.5-vision-instruct, it is recommended
# to use 16 for single frame scenarios, and 4 for multi-frame.
#
# Generally speaking, a larger value for num_crops results in more
# tokens per image instance, because it may scale the image more in
# the image preprocessing. Some references in the model docs and the
# formula for image tokens after the preprocessing
# transform can be found below.
#
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
llm = LLM(
model="microsoft/Phi-3.5-vision-instruct",
trust_remote_code=True,
max_model_len=4096,
limit_mm_per_prompt={"image": len(image_urls)},
mm_processor_kwargs={"num_crops": 4},
)
placeholders = "\n".join(f"<|image_{i}|>"
for i, _ in enumerate(image_urls, start=1))
prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
stop_token_ids = None
return llm, prompt, stop_token_ids, None, None
return ModelRequestData(
llm=llm,
prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
)
def load_internvl(question: str, image_urls: List[str]):
def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
model_name = "OpenGVLab/InternVL2-2B"
llm = LLM(
......@@ -93,10 +128,16 @@ def load_internvl(question: str, image_urls: List[str]):
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
return llm, prompt, stop_token_ids, None, None
return ModelRequestData(
llm=llm,
prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
)
def load_qwen2_vl(question, image_urls: List[str]):
def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
try:
from qwen_vl_utils import process_vision_info
except ModuleNotFoundError:
......@@ -143,7 +184,13 @@ def load_qwen2_vl(question, image_urls: List[str]):
else:
image_data, _ = process_vision_info(messages)
return llm, prompt, stop_token_ids, image_data, None
return ModelRequestData(
llm=llm,
prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=image_data,
chat_template=None,
)
model_example_map = {
......@@ -155,20 +202,17 @@ model_example_map = {
def run_generate(model, question: str, image_urls: List[str]):
llm, prompt, stop_token_ids, image_data, _ = model_example_map[model](
question, image_urls)
if image_data is None:
image_data = [fetch_image(url) for url in image_urls]
req_data = model_example_map[model](question, image_urls)
sampling_params = SamplingParams(temperature=0.0,
max_tokens=128,
stop_token_ids=stop_token_ids)
stop_token_ids=req_data.stop_token_ids)
outputs = llm.generate(
outputs = req_data.llm.generate(
{
"prompt": prompt,
"prompt": req_data.prompt,
"multi_modal_data": {
"image": image_data
"image": req_data.image_data
},
},
sampling_params=sampling_params)
......@@ -179,13 +223,12 @@ def run_generate(model, question: str, image_urls: List[str]):
def run_chat(model: str, question: str, image_urls: List[str]):
llm, _, stop_token_ids, _, chat_template = model_example_map[model](
question, image_urls)
req_data = model_example_map[model](question, image_urls)
sampling_params = SamplingParams(temperature=0.0,
max_tokens=128,
stop_token_ids=stop_token_ids)
outputs = llm.chat(
stop_token_ids=req_data.stop_token_ids)
outputs = req_data.llm.chat(
[{
"role":
"user",
......@@ -203,7 +246,7 @@ def run_chat(model: str, question: str, image_urls: List[str]):
],
}],
sampling_params=sampling_params,
chat_template=chat_template,
chat_template=req_data.chat_template,
)
for o in outputs:
......
......@@ -38,7 +38,7 @@ chat_completion_from_url = client.chat.completions.create(
"content": [
{
"type": "text",
"text": "Whats in this image?"
"text": "What's in this image?"
},
{
"type": "image_url",
......@@ -75,7 +75,7 @@ chat_completion_from_base64 = client.chat.completions.create(
"content": [
{
"type": "text",
"text": "Whats in this image?"
"text": "What's in this image?"
},
{
"type": "image_url",
......
......@@ -159,7 +159,7 @@ echo 'vLLM codespell: Done'
# Lint specified files
lint() {
ruff "$@"
ruff check "$@"
}
# Lint files that differ from main branch. Ignores dirs that are not slated
......@@ -175,7 +175,7 @@ lint_changed() {
if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \
ruff
ruff check
fi
}
......
......@@ -4,7 +4,8 @@ requires = [
"cmake>=3.26",
"ninja",
"packaging",
"setuptools >= 49.4.0",
"setuptools>=61",
"setuptools-scm>=8.0",
"torch == 2.4.0",
"wheel",
"jinja2",
......@@ -19,6 +20,10 @@ exclude = [
"examples/fp8/quantizer/quantize.py"
]
[tool.ruff.lint.per-file-ignores]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
[tool.ruff.lint]
select = [
# pycodestyle
......@@ -42,6 +47,8 @@ ignore = [
"E731",
# Loop control variable not used within loop body
"B007",
# f-string format
"UP032",
]
[tool.mypy]
......
......@@ -2,7 +2,8 @@
cmake>=3.26
ninja
packaging
setuptools>=49.4.0
setuptools>=61
setuptools-scm>=8
torch==2.4.0
wheel
jinja2
......@@ -4,7 +4,7 @@ numpy < 2.0.0
requests
tqdm
py-cpuinfo
transformers >= 4.43.2 # Required for Chameleon and Llama 3.1 hotfox.
transformers >= 4.45.0 # Required for Llama 3.2.
tokenizers >= 0.19.1 # Required for Llama 3.
protobuf # Required by LlamaTokenizer.
fastapi < 0.113.0; python_version < '3.9'
......@@ -18,15 +18,16 @@ prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.10.6
outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
outlines >= 0.0.43, < 0.1
typing_extensions >= 4.10
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
partial-json-parser # used for parsing partial JSON outputs
pyzmq
msgspec
gguf == 0.9.1
gguf == 0.10.0
importlib_metadata
mistral_common >= 1.4.0
mistral_common >= 1.4.3
pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL.
......@@ -8,4 +8,3 @@ torch == 2.4.0
# These must be updated alongside torch
torchvision == 0.19 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
xformers == 0.0.27.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.4.0
vllm-flash-attn == 2.6.1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.4.0
......@@ -2,7 +2,7 @@
yapf==0.32.0
toml==0.10.2
tomli==2.0.1
ruff==0.1.5
ruff==0.6.5
codespell==2.3.0
isort==5.13.2
clang-format==18.1.5
......
......@@ -2,6 +2,6 @@
-r requirements-common.txt
# Dependencies for Neuron devices
transformers-neuronx >= 0.9.0
torch-neuronx >= 2.1.0
transformers-neuronx >= 0.12.0
torch-neuronx >= 2.1.2
neuronx-cc
......@@ -14,13 +14,14 @@ librosa # required for audio test
opencv-python # required for video test
peft
requests
ray[adag]>=2.35
ray[adag]==2.35
sentence-transformers # required for embedding
soundfile # required for audio test
compressed-tensors==0.4.0 # required for compressed-tensors
timm # required for internvl test
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
datamodel_code_generator # required for minicpm3 test
# TODO: Add this after fully implementing llava(mantis)
# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test
......@@ -29,5 +30,5 @@ matplotlib # required for qwen-vl test
aiohttp
# quantization
bitsandbytes==0.42.0
bitsandbytes>=0.44.0
buildkite-test-collector==0.1.8
......@@ -3,9 +3,10 @@
setuptools < 70.0.0 # IPEX's torch have some dependency. to be removed.
torch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/torch-2.1.0.post1%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl
intel_extension_for_pytorch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl
oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.1.200%2Bxpu-cp310-cp310-linux_x86_64.whl
triton @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
ray >= 2.9
# Following pkgs retrieved from https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
torch == 2.3.1+cxx11.abi
intel-extension-for-pytorch == 2.3.110+xpu
oneccl_bind_pt == 2.3.100+xpu
triton-xpu == 3.0.0b2
......@@ -5,7 +5,7 @@ import os
import re
import subprocess
import sys
import warnings
from pathlib import Path
from shutil import which
from typing import Dict, List
......@@ -13,6 +13,7 @@ import torch
from packaging.version import Version, parse
from setuptools import Extension, find_packages, setup
from setuptools.command.build_ext import build_ext
from setuptools_scm import get_version
from torch.utils.cpp_extension import CUDA_HOME
from typing import Optional, Union
......@@ -34,34 +35,6 @@ def load_module_from_path(module_name, path):
ROOT_DIR = os.path.dirname(__file__)
logger = logging.getLogger(__name__)
def embed_commit_hash():
try:
if "BUILDKITE_COMMIT" in os.environ:
# ci build
commit_id = os.environ["BUILDKITE_COMMIT"]
else:
commit_id = subprocess.check_output(["git", "rev-parse", "HEAD"],
encoding="utf-8").strip()
commit_contents = f'__commit__ = "{commit_id}"\n'
version_file = os.path.join(ROOT_DIR, "vllm", "commit_id.py")
with open(version_file, "w", encoding="utf-8") as f:
f.write(commit_contents)
except subprocess.CalledProcessError as e:
warnings.warn(f"Failed to get commit hash:\n{e}",
RuntimeWarning,
stacklevel=2)
except Exception as e:
warnings.warn(f"Failed to embed commit hash:\n{e}",
RuntimeWarning,
stacklevel=2)
embed_commit_hash()
# cannot import envs directly because it depends on vllm,
# which is not installed yet
envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
......@@ -159,15 +132,8 @@ class cmake_build_ext(build_ext):
default_cfg = "Debug" if self.debug else "RelWithDebInfo"
cfg = envs.CMAKE_BUILD_TYPE or default_cfg
# where .so files will be written, should be the same for all extensions
# that use the same CMakeLists.txt.
outdir = os.path.abspath(
os.path.dirname(self.get_ext_fullpath(ext.name)))
cmake_args = [
'-DCMAKE_BUILD_TYPE={}'.format(cfg),
'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={}'.format(outdir),
'-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY={}'.format(self.build_temp),
'-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE),
]
......@@ -231,10 +197,12 @@ class cmake_build_ext(build_ext):
os.makedirs(self.build_temp)
targets = []
target_name = lambda s: remove_prefix(remove_prefix(s, "vllm."),
"vllm_flash_attn.")
# Build all the extensions
for ext in self.extensions:
self.configure(ext)
targets.append(remove_prefix(ext.name, "vllm."))
targets.append(target_name(ext.name))
num_jobs, _ = self.compute_num_jobs()
......@@ -247,6 +215,43 @@ class cmake_build_ext(build_ext):
subprocess.check_call(["cmake", *build_args], cwd=self.build_temp)
# Install the libraries
for ext in self.extensions:
# Install the extension into the proper location
outdir = Path(self.get_ext_fullpath(ext.name)).parent.absolute()
# Skip if the install directory is the same as the build directory
if outdir == self.build_temp:
continue
# CMake appends the extension prefix to the install path,
# and outdir already contains that prefix, so we need to remove it.
prefix = outdir
for i in range(ext.name.count('.')):
prefix = prefix.parent
# prefix here should actually be the same for all components
install_args = [
"cmake", "--install", ".", "--prefix", prefix, "--component",
target_name(ext.name)
]
subprocess.check_call(install_args, cwd=self.build_temp)
def run(self):
# First, run the standard build_ext command to compile the extensions
super().run()
# copy vllm/vllm_flash_attn/*.py from self.build_lib to current
# directory so that they can be included in the editable build
import glob
files = glob.glob(
os.path.join(self.build_lib, "vllm", "vllm_flash_attn", "*.py"))
for file in files:
dst_file = os.path.join("vllm/vllm_flash_attn",
os.path.basename(file))
print(f"Copying {file} to {dst_file}")
self.copy_file(file, dst_file)
def _no_device() -> bool:
return VLLM_TARGET_DEVICE == "empty"
......@@ -355,19 +360,6 @@ def get_path(*filepath) -> str:
return os.path.join(ROOT_DIR, *filepath)
def find_version(filepath: str) -> str:
"""Extract version information from the given filepath.
Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py
"""
with open(filepath) as fp:
version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
fp.read(), re.M)
if version_match:
return version_match.group(1)
raise RuntimeError("Unable to find version string.")
def get_sha(root: Union[str, Path]) -> str:
try:
return subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=root).decode('ascii').strip()
......@@ -378,7 +370,7 @@ def get_sha(root: Union[str, Path]) -> str:
def get_version_add(sha: Optional[str] = None) -> str:
vllm_root = os.path.dirname(os.path.abspath(__file__))
add_version_path = os.path.join(os.path.join(vllm_root, "vllm"), "version.py")
major, minor, *res = torch.__version__.split('.')
major, minor, _ = torch.__version__.split('.')
if add_git_version:
if sha != 'Unknown':
if sha is None:
......@@ -404,20 +396,18 @@ def get_version_add(sha: Optional[str] = None) -> str:
version += ".dtk" + rocm_version
new_version_content = f"""
import warnings
try:
import vllm.commit_id
__commit__ = vllm.commit_id.__commit__
__version__ = "0.6.2"
__version_tuple__ = (0, 6, 2)
__dcu_version__ = f'0.6.2+{version}
from vllm.version import __version__, __version_tuple__, __dcu_version__
except Exception as e:
import warnings
warnings.warn(f"Failed to read commit hash:\\n + str(e)",
RuntimeWarning,
stacklevel=2)
__commit__ = "COMMIT_HASH_PLACEHOLDER"
__version__ = "0.6.1.post2"
__dcu_version__ = f'0.6.1.post2+{version}'
"""
with open(add_version_path, encoding="utf-8",mode="w") as file:
......@@ -434,37 +424,44 @@ def get_version():
def get_vllm_version() -> str:
# version = find_version(get_path("vllm", "version.py"))
if not _is_hip():
version = get_version(
write_to="vllm/_version.py", # TODO: move this to pyproject.toml
)
sep = "+" if "+" not in version else "." # dev versions might contain +
if _no_device():
if envs.VLLM_TARGET_DEVICE == "empty":
version += "+empty"
version += f"{sep}empty"
elif _is_cuda():
cuda_version = str(get_nvcc_cuda_version())
if cuda_version != MAIN_CUDA_VERSION:
cuda_version_str = cuda_version.replace(".", "")[:3]
version += f"+cu{cuda_version_str}"
# skip this for source tarball, required for pypi
if "sdist" not in sys.argv:
version += f"{sep}cu{cuda_version_str}"
elif _is_hip():
# Get the HIP version
# hipcc_version = get_hipcc_rocm_version()
# if hipcc_version != MAIN_CUDA_VERSION:
# rocm_version_str = hipcc_version.replace(".", "")[:3]
# version += f"+rocm{rocm_version_str}"
# version += f"{sep}rocm{rocm_version_str}"
version = get_version()
elif _is_neuron():
# Get the Neuron version
neuron_version = str(get_neuronxcc_version())
if neuron_version != MAIN_CUDA_VERSION:
neuron_version_str = neuron_version.replace(".", "")[:3]
version += f"+neuron{neuron_version_str}"
version += f"{sep}neuron{neuron_version_str}"
elif _is_openvino():
version += "+openvino"
version += f"{sep}openvino"
elif _is_tpu():
version += "+tpu"
version += f"{sep}tpu"
elif _is_cpu():
version += "+cpu"
version += f"{sep}cpu"
elif _is_xpu():
version += "+xpu"
version += f"{sep}xpu"
else:
raise RuntimeError("Unknown runtime environment")
......@@ -535,6 +532,13 @@ if _build_core_ext():
if _is_cuda() or _is_hip():
ext_modules.append(CMakeExtension(name="vllm._moe_C"))
if _is_hip():
ext_modules.append(CMakeExtension(name="vllm._rocm_C"))
if _is_cuda():
ext_modules.append(
CMakeExtension(name="vllm.vllm_flash_attn.vllm_flash_attn_c"))
if _build_custom_ops():
ext_modules.append(CMakeExtension(name="vllm._C"))
......
......@@ -26,6 +26,11 @@ class RequestOutput:
finished: bool = False
@dataclass
class MockModelConfig:
use_async_output_proc = True
class MockEngine:
def __init__(self):
......@@ -35,6 +40,7 @@ class MockEngine:
self.request_id = None
# Ugly, remove dependency when possible
self.parallel_config = ParallelConfig(1, 1, False)
self.model_config = MockModelConfig()
async def step_async(self, virtual_engine):
# PP size is 1, ignore virtual engine
......@@ -80,7 +86,7 @@ class MockAsyncLLMEngine(AsyncLLMEngine):
@pytest.mark.asyncio
async def test_new_requests_event():
engine = MockAsyncLLMEngine(worker_use_ray=False)
engine = MockAsyncLLMEngine()
engine.start_background_loop()
await asyncio.sleep(0.01)
assert engine.engine.step_calls == 0
......@@ -113,7 +119,7 @@ async def test_new_requests_event():
assert engine.engine.add_request_calls == 3
assert engine.engine.step_calls == old_step_calls + 1
engine = MockAsyncLLMEngine(worker_use_ray=True)
engine = MockAsyncLLMEngine()
assert engine.get_model_config() is not None
assert engine.get_tokenizer() is not None
assert engine.get_decoding_config() is not None
......
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from ..utils import VLLM_PATH, RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "facebook/opt-125m"
chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()
@pytest.fixture(scope="module")
def server():
args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"float16",
"--max-model-len",
"2048",
"--enforce-eager",
"--chat-template",
str(chatml_jinja_path),
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio
async def test_check_models(client: openai.AsyncOpenAI):
models = await client.models.list()
models = models.data
served_model = models[0]
assert served_model.id == MODEL_NAME
assert all(model.root == MODEL_NAME for model in models)
@pytest.mark.asyncio
async def test_single_completion(client: openai.AsyncOpenAI):
completion = await client.completions.create(model=MODEL_NAME,
prompt="Hello, my name is",
max_tokens=5,
temperature=0.0)
assert completion.id is not None
assert len(completion.choices) == 1
assert len(completion.choices[0].text) >= 5
assert completion.choices[0].finish_reason == "length"
assert completion.usage == openai.types.CompletionUsage(
completion_tokens=5, prompt_tokens=6, total_tokens=11)
# test using token IDs
completion = await client.completions.create(
model=MODEL_NAME,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
)
assert len(completion.choices[0].text) >= 5
@pytest.mark.asyncio
async def test_single_chat_session(client: openai.AsyncOpenAI):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role": "user",
"content": "what is 1+1?"
}]
# test single completion
chat_completion = await client.chat.completions.create(model=MODEL_NAME,
messages=messages,
max_tokens=10,
logprobs=True,
top_logprobs=5)
assert chat_completion.id is not None
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=55, total_tokens=65)
message = choice.message
assert message.content is not None and len(message.content) >= 10
assert message.role == "assistant"
messages.append({"role": "assistant", "content": message.content})
# test multi-turn dialogue
messages.append({"role": "user", "content": "express your result in json"})
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
import os
import pytest
from vllm.compilation.backends import vllm_backend
from .utils import TEST_MODELS, check_full_graph_support
@pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"])
def test_full_graph(model):
# make sure these models can be captured in full graph mode
os.environ["VLLM_TEST_DYNAMO_GRAPH_CAPTURE"] = "1"
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0)
llm = LLM(model="meta-llama/Meta-Llama-3-8B",
enforce_eager=True,
load_format="dummy")
llm.generate(prompts, sampling_params)
@pytest.mark.parametrize("model_info", TEST_MODELS)
@pytest.mark.parametrize("backend", ["eager", vllm_backend])
def test_full_graph(model_info, backend):
model = model_info[0]
model_kwargs = model_info[1]
check_full_graph_support(model, model_kwargs, backend, tp_size=1)
import pytest
from vllm.compilation.backends import vllm_backend
from vllm.utils import cuda_device_count_stateless
from ..utils import fork_new_process_for_each_test
from .utils import TEST_MODELS_SMOKE, check_full_graph_support
@pytest.mark.parametrize("model_info", TEST_MODELS_SMOKE)
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("backend", ["eager", vllm_backend])
@fork_new_process_for_each_test
def test_full_graph_multi_gpu(model_info, tp_size, backend):
model = model_info[0]
model_kwargs = model_info[1]
# Skip the test if there are not enough CUDA devices.
if cuda_device_count_stateless() < tp_size:
pytest.skip("Not enough CUDA devices for the test.")
check_full_graph_support(model, model_kwargs, backend, tp_size=tp_size)
import pytest
from vllm.compilation.backends import vllm_backend
from .utils import TEST_MODELS_SMOKE, check_full_graph_support
@pytest.mark.parametrize("model_info", TEST_MODELS_SMOKE)
@pytest.mark.parametrize("backend", ["eager", vllm_backend])
def test_full_graph(model_info, backend):
model = model_info[0]
model_kwargs = model_info[1]
check_full_graph_support(model, model_kwargs, backend, tp_size=1)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment