Merge tag 'v0.6.2' into v0.6.2-dev

539aa992 · zhuwenwen · 93872128 · 7193774b · 539aa992 · 539aa992
Commit 539aa992 authored Sep 27, 2024 by zhuwenwen
20 changed files
--- a/examples/offline_inference_chat.py
+++ b/examples/offline_inference_chat.py
@@ -39,6 +39,33 @@ outputs = llm.chat(conversation,
                   use_tqdm=False)
 print_outputs(outputs)

+# You can run batch inference with the llm.chat API
+conversation = [
+    {
+        "role": "system",
+        "content": "You are a helpful assistant"
+    },
+    {
+        "role": "user",
+        "content": "Hello"
+    },
+    {
+        "role": "assistant",
+        "content": "Hello! How can I assist you today?"
+    },
+    {
+        "role": "user",
+        "content": "Write an essay about the importance of higher education.",
+    },
+]
+conversations = [conversation for _ in range(10)]
+
+# We turn on tqdm progress bar to verify it's indeed running batch inference
+outputs = llm.chat(messages=conversations,
+                   sampling_params=sampling_params,
+                   use_tqdm=True)
+print_outputs(outputs)
+
 # A chat template can be optionally supplied.
 # If not, the model will use its default chat template.


--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -14,7 +14,8 @@ from vllm.utils import FlexibleArgumentParser


 # LLaVA-1.5
-def run_llava(question):
+def run_llava(question, modality):
+    assert modality == "image"

    prompt = f"USER: <image>\n{question}\nASSISTANT:"

@@ -24,7 +25,8 @@ def run_llava(question):


 # LLaVA-1.6/LLaVA-NeXT
-def run_llava_next(question):
+def run_llava_next(question, modality):
+    assert modality == "image"

    prompt = f"[INST] <image>\n{question} [/INST]"
    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192)
@@ -34,15 +36,35 @@ def run_llava_next(question):

 # LLaVA-NeXT-Video
 # Currently only supports video input
-def run_llava_next_video(question):
+def run_llava_next_video(question, modality):
+    assert modality == "video"
+
    prompt = f"USER: <video>\n{question} ASSISTANT:"
    llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192)
    stop_token_ids = None
    return llm, prompt, stop_token_ids


+# LLaVA-OneVision
+def run_llava_onevision(question, modality):
+
+    if modality == "video":
+        prompt = f"<|im_start|>user <video>\n{question}<|im_end|> \
+        <|im_start|>assistant\n"
+
+    elif modality == "image":
+        prompt = f"<|im_start|>user <image>\n{question}<|im_end|> \
+        <|im_start|>assistant\n"
+
+    llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
+              max_model_len=32768)
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
 # Fuyu
-def run_fuyu(question):
+def run_fuyu(question, modality):
+    assert modality == "image"

    prompt = f"{question}\n"
    llm = LLM(model="adept/fuyu-8b")
@@ -51,7 +73,8 @@ def run_fuyu(question):


 # Phi-3-Vision
-def run_phi3v(question):
+def run_phi3v(question, modality):
+    assert modality == "image"

    prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n"  # noqa: E501
    # Note: The default setting of max_num_seqs (256) and
@@ -60,17 +83,32 @@ def run_phi3v(question):

    # In this example, we override max_num_seqs to 5 while
    # keeping the original context length of 128k.
+
+    # num_crops is an override kwarg to the multimodal image processor;
+    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
+    # to use 16 for single frame scenarios, and 4 for multi-frame.
+    #
+    # Generally speaking, a larger value for num_crops results in more
+    # tokens per image instance, because it may scale the image more in
+    # the image preprocessing. Some references in the model docs and the
+    # formula for image tokens after the preprocessing
+    # transform can be found below.
+    #
+    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
+    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    llm = LLM(
        model="microsoft/Phi-3-vision-128k-instruct",
        trust_remote_code=True,
        max_num_seqs=5,
+        mm_processor_kwargs={"num_crops": 16},
    )
    stop_token_ids = None
    return llm, prompt, stop_token_ids


 # PaliGemma
-def run_paligemma(question):
+def run_paligemma(question, modality):
+    assert modality == "image"

    # PaliGemma has special prompt format for VQA
    prompt = "caption en"
@@ -80,7 +118,8 @@ def run_paligemma(question):


 # Chameleon
-def run_chameleon(question):
+def run_chameleon(question, modality):
+    assert modality == "image"

    prompt = f"{question}<image>"
    llm = LLM(model="facebook/chameleon-7b")
@@ -89,7 +128,8 @@ def run_chameleon(question):


 # MiniCPM-V
-def run_minicpmv(question):
+def run_minicpmv(question, modality):
+    assert modality == "image"

    # 2.0
    # The official repo doesn't work yet, so we need to use a fork for now
@@ -129,7 +169,9 @@ def run_minicpmv(question):


 # InternVL
-def run_internvl(question):
+def run_internvl(question, modality):
+    assert modality == "image"
+
    model_name = "OpenGVLab/InternVL2-2B"

    llm = LLM(
@@ -155,7 +197,8 @@ def run_internvl(question):


 # BLIP-2
-def run_blip2(question):
+def run_blip2(question, modality):
+    assert modality == "image"

    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
@@ -166,7 +209,8 @@ def run_blip2(question):


 # Qwen
-def run_qwen_vl(question):
+def run_qwen_vl(question, modality):
+    assert modality == "image"

    llm = LLM(
        model="Qwen/Qwen-VL",
@@ -180,7 +224,9 @@ def run_qwen_vl(question):


 # Qwen2-VL
-def run_qwen2_vl(question):
+def run_qwen2_vl(question, modality):
+    assert modality == "image"
+
    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    llm = LLM(
@@ -196,10 +242,34 @@ def run_qwen2_vl(question):
    return llm, prompt, stop_token_ids


+# Llama 3.2 Vision
+def run_mllama(question, modality):
+    assert modality == "image"
+
+    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+
+    # Note: The default setting of max_num_seqs (256) and
+    # max_model_len (131072) for this model may cause OOM.
+    # You may lower either to run this example on lower-end GPUs.
+
+    # The configuration below has been confirmed to launch on a
+    # single H100 GPU.
+    llm = LLM(
+        model=model_name,
+        max_num_seqs=16,
+        enforce_eager=True,
+    )
+
+    prompt = f"<|image|><|begin_of_text|>{question}"
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
 model_example_map = {
    "llava": run_llava,
    "llava-next": run_llava_next,
    "llava-next-video": run_llava_next_video,
+    "llava-onevision": run_llava_onevision,
    "fuyu": run_fuyu,
    "phi3_v": run_phi3v,
    "paligemma": run_paligemma,
@@ -209,6 +279,7 @@ model_example_map = {
    "internvl_chat": run_internvl,
    "qwen_vl": run_qwen_vl,
    "qwen2_vl": run_qwen2_vl,
+    "mllama": run_mllama,
 }


@@ -255,7 +326,7 @@ def main(args):
    data = mm_input["data"]
    question = mm_input["question"]

-    llm, prompt, stop_token_ids = model_example_map[model](question)
+    llm, prompt, stop_token_ids = model_example_map[model](question, modality)

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
@@ -306,6 +377,7 @@ if __name__ == "__main__":
    parser.add_argument('--modality',
                        type=str,
                        default="image",
+                        choices=['image', 'video'],
                        help='Modality of the input.')
    parser.add_argument('--num-frames',
                        type=int,

--- a/examples/offline_inference_vision_language_multi_image.py
+++ b/examples/offline_inference_vision_language_multi_image.py
@@ -4,8 +4,9 @@ multi-image input on vision language models, using the chat template defined
 by the model.
 """
 from argparse import Namespace
-from typing import List
+from typing import List, NamedTuple, Optional

+from PIL.Image import Image
 from transformers import AutoProcessor, AutoTokenizer

 from vllm import LLM, SamplingParams
@@ -19,7 +20,15 @@ IMAGE_URLS = [
 ]


-def load_qwenvl_chat(question: str, image_urls: List[str]):
+class ModelRequestData(NamedTuple):
+    llm: LLM
+    prompt: str
+    stop_token_ids: Optional[List[str]]
+    image_data: List[Image]
+    chat_template: Optional[str]
+
+
+def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData:
    model_name = "Qwen/Qwen-VL-Chat"
    llm = LLM(
        model=model_name,
@@ -48,24 +57,50 @@ def load_qwenvl_chat(question: str, image_urls: List[str]):

    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
-    return llm, prompt, stop_token_ids, None, chat_template
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        stop_token_ids=stop_token_ids,
+        image_data=[fetch_image(url) for url in image_urls],
+        chat_template=chat_template,
+    )


-def load_phi3v(question: str, image_urls: List[str]):
+def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
+    # num_crops is an override kwarg to the multimodal image processor;
+    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
+    # to use 16 for single frame scenarios, and 4 for multi-frame.
+    #
+    # Generally speaking, a larger value for num_crops results in more
+    # tokens per image instance, because it may scale the image more in
+    # the image preprocessing. Some references in the model docs and the
+    # formula for image tokens after the preprocessing
+    # transform can be found below.
+    #
+    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
+    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    llm = LLM(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": len(image_urls)},
+        mm_processor_kwargs={"num_crops": 4},
    )
    placeholders = "\n".join(f"<|image_{i}|>"
                             for i, _ in enumerate(image_urls, start=1))
    prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
    stop_token_ids = None
-    return llm, prompt, stop_token_ids, None, None
+
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        stop_token_ids=stop_token_ids,
+        image_data=[fetch_image(url) for url in image_urls],
+        chat_template=None,
+    )


-def load_internvl(question: str, image_urls: List[str]):
+def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
    model_name = "OpenGVLab/InternVL2-2B"

    llm = LLM(
@@ -93,10 +128,16 @@ def load_internvl(question: str, image_urls: List[str]):
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

-    return llm, prompt, stop_token_ids, None, None
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        stop_token_ids=stop_token_ids,
+        image_data=[fetch_image(url) for url in image_urls],
+        chat_template=None,
+    )


-def load_qwen2_vl(question, image_urls: List[str]):
+def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
    try:
        from qwen_vl_utils import process_vision_info
    except ModuleNotFoundError:
@@ -143,7 +184,13 @@ def load_qwen2_vl(question, image_urls: List[str]):
    else:
        image_data, _ = process_vision_info(messages)

-    return llm, prompt, stop_token_ids, image_data, None
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        stop_token_ids=stop_token_ids,
+        image_data=image_data,
+        chat_template=None,
+    )


 model_example_map = {
@@ -155,20 +202,17 @@ model_example_map = {


 def run_generate(model, question: str, image_urls: List[str]):
-    llm, prompt, stop_token_ids, image_data, _ = model_example_map[model](
-        question, image_urls)
-    if image_data is None:
-        image_data = [fetch_image(url) for url in image_urls]
+    req_data = model_example_map[model](question, image_urls)

    sampling_params = SamplingParams(temperature=0.0,
                                     max_tokens=128,
-                                     stop_token_ids=stop_token_ids)
+                                     stop_token_ids=req_data.stop_token_ids)

-    outputs = llm.generate(
+    outputs = req_data.llm.generate(
        {
-            "prompt": prompt,
+            "prompt": req_data.prompt,
            "multi_modal_data": {
-                "image": image_data
+                "image": req_data.image_data
            },
        },
        sampling_params=sampling_params)
@@ -179,13 +223,12 @@ def run_generate(model, question: str, image_urls: List[str]):


 def run_chat(model: str, question: str, image_urls: List[str]):
-    llm, _, stop_token_ids, _, chat_template = model_example_map[model](
-        question, image_urls)
+    req_data = model_example_map[model](question, image_urls)

    sampling_params = SamplingParams(temperature=0.0,
                                     max_tokens=128,
-                                     stop_token_ids=stop_token_ids)
-    outputs = llm.chat(
+                                     stop_token_ids=req_data.stop_token_ids)
+    outputs = req_data.llm.chat(
        [{
            "role":
            "user",
@@ -203,7 +246,7 @@ def run_chat(model: str, question: str, image_urls: List[str]):
            ],
        }],
        sampling_params=sampling_params,
-        chat_template=chat_template,
+        chat_template=req_data.chat_template,
    )

    for o in outputs:

--- a/examples/openai_vision_api_client.py
+++ b/examples/openai_vision_api_client.py
@@ -38,7 +38,7 @@ chat_completion_from_url = client.chat.completions.create(
        "content": [
            {
                "type": "text",
-                "text": "What’s in this image?"
+                "text": "What's in this image?"
            },
            {
                "type": "image_url",
@@ -75,7 +75,7 @@ chat_completion_from_base64 = client.chat.completions.create(
        "content": [
            {
                "type": "text",
-                "text": "What’s in this image?"
+                "text": "What's in this image?"
            },
            {
                "type": "image_url",

--- a/format.sh
+++ b/format.sh
@@ -159,7 +159,7 @@ echo 'vLLM codespell: Done'

 # Lint specified files
 lint() {
-    ruff "$@"
+    ruff check "$@"
 }

 # Lint files that differ from main branch. Ignores dirs that are not slated
@@ -175,7 +175,7 @@ lint_changed() {

    if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
        git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \
-             ruff
+             ruff check
    fi

 }

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,8 @@ requires = [
    "cmake>=3.26",
    "ninja",
    "packaging",
-    "setuptools >= 49.4.0",
+    "setuptools>=61",
+    "setuptools-scm>=8.0",
    "torch == 2.4.0",
    "wheel",
    "jinja2",
@@ -19,6 +20,10 @@ exclude = [
    "examples/fp8/quantizer/quantize.py"
 ]

+[tool.ruff.lint.per-file-ignores]
+"vllm/version.py" = ["F401"]
+"vllm/_version.py" = ["ALL"]
+
 [tool.ruff.lint]
 select = [
    # pycodestyle
@@ -42,6 +47,8 @@ ignore = [
    "E731",
    # Loop control variable not used within loop body
    "B007",
+    # f-string format
+    "UP032",
 ]

 [tool.mypy]

--- a/requirements-build.txt
+++ b/requirements-build.txt
@@ -2,7 +2,8 @@
 cmake>=3.26
 ninja
 packaging
-setuptools>=49.4.0
+setuptools>=61
+setuptools-scm>=8
 torch==2.4.0
 wheel
 jinja2
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -4,7 +4,7 @@ numpy < 2.0.0
 requests
 tqdm
 py-cpuinfo
-transformers >= 4.43.2  # Required for Chameleon and Llama 3.1 hotfox.
+transformers >= 4.45.0  # Required for Llama 3.2.
 tokenizers >= 0.19.1  # Required for Llama 3.
 protobuf # Required by LlamaTokenizer.
 fastapi < 0.113.0; python_version < '3.9'
@@ -18,15 +18,16 @@ prometheus_client >= 0.18.0
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
 lm-format-enforcer == 0.10.6
-outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
+outlines >= 0.0.43, < 0.1
 typing_extensions >= 4.10
 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
 partial-json-parser # used for parsing partial JSON outputs
 pyzmq
 msgspec
-gguf == 0.9.1
+gguf == 0.10.0
 importlib_metadata
-mistral_common >= 1.4.0
+mistral_common >= 1.4.3
 pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
+setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -8,4 +8,3 @@ torch == 2.4.0
 # These must be updated alongside torch
 torchvision == 0.19   # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 xformers == 0.0.27.post2; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.4.0
-vllm-flash-attn == 2.6.1; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.4.0
--- a/requirements-lint.txt
+++ b/requirements-lint.txt
@@ -2,7 +2,7 @@
 yapf==0.32.0
 toml==0.10.2
 tomli==2.0.1
-ruff==0.1.5
+ruff==0.6.5
 codespell==2.3.0
 isort==5.13.2
 clang-format==18.1.5

--- a/requirements-neuron.txt
+++ b/requirements-neuron.txt
@@ -2,6 +2,6 @@
 -r requirements-common.txt

 # Dependencies for Neuron devices
-transformers-neuronx >= 0.9.0
-torch-neuronx >= 2.1.0
+transformers-neuronx >= 0.12.0
+torch-neuronx >= 2.1.2
 neuronx-cc
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -14,13 +14,14 @@ librosa # required for audio test
 opencv-python # required for video test
 peft
 requests
-ray[adag]>=2.35
+ray[adag]==2.35
 sentence-transformers # required for embedding
 soundfile # required for audio test
 compressed-tensors==0.4.0 # required for compressed-tensors
 timm # required for internvl test
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
+datamodel_code_generator # required for minicpm3 test

 # TODO: Add this after fully implementing llava(mantis)
 # git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test
@@ -29,5 +30,5 @@ matplotlib # required for qwen-vl test
 aiohttp

 # quantization
-bitsandbytes==0.42.0
+bitsandbytes>=0.44.0
 buildkite-test-collector==0.1.8
--- a/requirements-xpu.txt
+++ b/requirements-xpu.txt
@@ -3,9 +3,10 @@

 setuptools < 70.0.0 # IPEX's torch have some dependency. to be removed.

-torch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/torch-2.1.0.post1%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl
-intel_extension_for_pytorch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl
-oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.1.200%2Bxpu-cp310-cp310-linux_x86_64.whl
-
-triton @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+ray >= 2.9
+# Following pkgs retrieved from https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+torch == 2.3.1+cxx11.abi
+intel-extension-for-pytorch == 2.3.110+xpu
+oneccl_bind_pt == 2.3.100+xpu

+triton-xpu == 3.0.0b2
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@ import os
 import re
 import subprocess
 import sys
-import warnings
+from pathlib import Path
 from shutil import which
 from typing import Dict, List

@@ -13,6 +13,7 @@ import torch
 from packaging.version import Version, parse
 from setuptools import Extension, find_packages, setup
 from setuptools.command.build_ext import build_ext
+from setuptools_scm import get_version
 from torch.utils.cpp_extension import CUDA_HOME

 from typing import Optional, Union
@@ -34,34 +35,6 @@ def load_module_from_path(module_name, path):
 ROOT_DIR = os.path.dirname(__file__)
 logger = logging.getLogger(__name__)

-
-def embed_commit_hash():
-    try:
-        if "BUILDKITE_COMMIT" in os.environ:
-            # ci build
-            commit_id = os.environ["BUILDKITE_COMMIT"]
-        else:
-            commit_id = subprocess.check_output(["git", "rev-parse", "HEAD"],
-                                                encoding="utf-8").strip()
-
-        commit_contents = f'__commit__ = "{commit_id}"\n'
-
-        version_file = os.path.join(ROOT_DIR, "vllm", "commit_id.py")
-        with open(version_file, "w", encoding="utf-8") as f:
-            f.write(commit_contents)
-
-    except subprocess.CalledProcessError as e:
-        warnings.warn(f"Failed to get commit hash:\n{e}",
-                      RuntimeWarning,
-                      stacklevel=2)
-    except Exception as e:
-        warnings.warn(f"Failed to embed commit hash:\n{e}",
-                      RuntimeWarning,
-                      stacklevel=2)
-
-
-embed_commit_hash()
-
 # cannot import envs directly because it depends on vllm,
 #  which is not installed yet
 envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
@@ -159,15 +132,8 @@ class cmake_build_ext(build_ext):
        default_cfg = "Debug" if self.debug else "RelWithDebInfo"
        cfg = envs.CMAKE_BUILD_TYPE or default_cfg

-        # where .so files will be written, should be the same for all extensions
-        # that use the same CMakeLists.txt.
-        outdir = os.path.abspath(
-            os.path.dirname(self.get_ext_fullpath(ext.name)))
-
        cmake_args = [
            '-DCMAKE_BUILD_TYPE={}'.format(cfg),
-            '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={}'.format(outdir),
-            '-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY={}'.format(self.build_temp),
            '-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE),
        ]

@@ -231,10 +197,12 @@ class cmake_build_ext(build_ext):
            os.makedirs(self.build_temp)

        targets = []
+        target_name = lambda s: remove_prefix(remove_prefix(s, "vllm."),
+                                              "vllm_flash_attn.")
        # Build all the extensions
        for ext in self.extensions:
            self.configure(ext)
-            targets.append(remove_prefix(ext.name, "vllm."))
+            targets.append(target_name(ext.name))

        num_jobs, _ = self.compute_num_jobs()

@@ -247,6 +215,43 @@ class cmake_build_ext(build_ext):

        subprocess.check_call(["cmake", *build_args], cwd=self.build_temp)

+        # Install the libraries
+        for ext in self.extensions:
+            # Install the extension into the proper location
+            outdir = Path(self.get_ext_fullpath(ext.name)).parent.absolute()
+
+            # Skip if the install directory is the same as the build directory
+            if outdir == self.build_temp:
+                continue
+
+            # CMake appends the extension prefix to the install path,
+            # and outdir already contains that prefix, so we need to remove it.
+            prefix = outdir
+            for i in range(ext.name.count('.')):
+                prefix = prefix.parent
+
+            # prefix here should actually be the same for all components
+            install_args = [
+                "cmake", "--install", ".", "--prefix", prefix, "--component",
+                target_name(ext.name)
+            ]
+            subprocess.check_call(install_args, cwd=self.build_temp)
+
+    def run(self):
+        # First, run the standard build_ext command to compile the extensions
+        super().run()
+
+        # copy vllm/vllm_flash_attn/*.py from self.build_lib to current
+        # directory so that they can be included in the editable build
+        import glob
+        files = glob.glob(
+            os.path.join(self.build_lib, "vllm", "vllm_flash_attn", "*.py"))
+        for file in files:
+            dst_file = os.path.join("vllm/vllm_flash_attn",
+                                    os.path.basename(file))
+            print(f"Copying {file} to {dst_file}")
+            self.copy_file(file, dst_file)
+

 def _no_device() -> bool:
    return VLLM_TARGET_DEVICE == "empty"
@@ -355,19 +360,6 @@ def get_path(*filepath) -> str:
    return os.path.join(ROOT_DIR, *filepath)


-def find_version(filepath: str) -> str:
-    """Extract version information from the given filepath.
-
-    Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py
-    """
-    with open(filepath) as fp:
-        version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
-                                  fp.read(), re.M)
-        if version_match:
-            return version_match.group(1)
-        raise RuntimeError("Unable to find version string.")
-
-
 def get_sha(root: Union[str, Path]) -> str:
    try:
        return subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=root).decode('ascii').strip()
@@ -378,7 +370,7 @@ def get_sha(root: Union[str, Path]) -> str:
 def get_version_add(sha: Optional[str] = None) -> str:
    vllm_root = os.path.dirname(os.path.abspath(__file__))
    add_version_path = os.path.join(os.path.join(vllm_root, "vllm"), "version.py")
-    major, minor, *res = torch.__version__.split('.')
+    major, minor, _ = torch.__version__.split('.')
    if add_git_version:
        if sha != 'Unknown':
            if sha is None:
@@ -404,20 +396,18 @@ def get_version_add(sha: Optional[str] = None) -> str:
        version += ".dtk" + rocm_version
    
    new_version_content = f"""
-import warnings
-
 try:
-    import vllm.commit_id
-    __commit__ = vllm.commit_id.__commit__
+    __version__ = "0.6.2"
+    __version_tuple__ = (0, 6, 2)
+    __dcu_version__ = f'0.6.2+{version}
+    
+    from vllm.version import __version__, __version_tuple__, __dcu_version__
 except Exception as e:
+    import warnings
+
    warnings.warn(f"Failed to read commit hash:\\n + str(e)",
                  RuntimeWarning,
                  stacklevel=2)
-    __commit__ = "COMMIT_HASH_PLACEHOLDER"
-
-__version__ = "0.6.1.post2"
-__dcu_version__ = f'0.6.1.post2+{version}' 
-
 """
    
    with open(add_version_path, encoding="utf-8",mode="w") as file:
@@ -434,37 +424,44 @@ def get_version():


 def get_vllm_version() -> str:
-    # version = find_version(get_path("vllm", "version.py"))
+    if not _is_hip():
+        version = get_version(
+            write_to="vllm/_version.py",  # TODO: move this to pyproject.toml
+        )
+
+        sep = "+" if "+" not in version else "."  # dev versions might contain +

    if _no_device():
        if envs.VLLM_TARGET_DEVICE == "empty":
-            version += "+empty"
+            version += f"{sep}empty"
    elif _is_cuda():
        cuda_version = str(get_nvcc_cuda_version())
        if cuda_version != MAIN_CUDA_VERSION:
            cuda_version_str = cuda_version.replace(".", "")[:3]
-            version += f"+cu{cuda_version_str}"
+            # skip this for source tarball, required for pypi
+            if "sdist" not in sys.argv:
+                version += f"{sep}cu{cuda_version_str}"
    elif _is_hip():
        # Get the HIP version
        # hipcc_version = get_hipcc_rocm_version()
        # if hipcc_version != MAIN_CUDA_VERSION:
        #     rocm_version_str = hipcc_version.replace(".", "")[:3]
-        #     version += f"+rocm{rocm_version_str}"
+        #     version += f"{sep}rocm{rocm_version_str}"
        version = get_version()
    elif _is_neuron():
        # Get the Neuron version
        neuron_version = str(get_neuronxcc_version())
        if neuron_version != MAIN_CUDA_VERSION:
            neuron_version_str = neuron_version.replace(".", "")[:3]
-            version += f"+neuron{neuron_version_str}"
+            version += f"{sep}neuron{neuron_version_str}"
    elif _is_openvino():
-        version += "+openvino"
+        version += f"{sep}openvino"
    elif _is_tpu():
-        version += "+tpu"
+        version += f"{sep}tpu"
    elif _is_cpu():
-        version += "+cpu"
+        version += f"{sep}cpu"
    elif _is_xpu():
-        version += "+xpu"
+        version += f"{sep}xpu"
    else:
        raise RuntimeError("Unknown runtime environment")

@@ -535,6 +532,13 @@ if _build_core_ext():
 if _is_cuda() or _is_hip():
    ext_modules.append(CMakeExtension(name="vllm._moe_C"))

+if _is_hip():
+    ext_modules.append(CMakeExtension(name="vllm._rocm_C"))
+
+if _is_cuda():
+    ext_modules.append(
+        CMakeExtension(name="vllm.vllm_flash_attn.vllm_flash_attn_c"))
+
 if _build_custom_ops():
    ext_modules.append(CMakeExtension(name="vllm._C"))


--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@@ -26,6 +26,11 @@ class RequestOutput:
    finished: bool = False


+@dataclass
+class MockModelConfig:
+    """Minimal model-config stand-in assigned to MockEngine.model_config."""
+    # Plain class attribute (no type annotation), so @dataclass does not
+    # turn it into an __init__ field; every instance simply reads True.
+    use_async_output_proc = True
+
+
 class MockEngine:

    def __init__(self):
@@ -35,6 +40,7 @@ class MockEngine:
        self.request_id = None
        # Ugly, remove dependency when possible
        self.parallel_config = ParallelConfig(1, 1, False)
+        self.model_config = MockModelConfig()

    async def step_async(self, virtual_engine):
        # PP size is 1, ignore virtual engine
@@ -80,7 +86,7 @@ class MockAsyncLLMEngine(AsyncLLMEngine):

 @pytest.mark.asyncio
 async def test_new_requests_event():
-    engine = MockAsyncLLMEngine(worker_use_ray=False)
+    engine = MockAsyncLLMEngine()
    engine.start_background_loop()
    await asyncio.sleep(0.01)
    assert engine.engine.step_calls == 0
@@ -113,7 +119,7 @@ async def test_new_requests_event():
    assert engine.engine.add_request_calls == 3
    assert engine.engine.step_calls == old_step_calls + 1

-    engine = MockAsyncLLMEngine(worker_use_ray=True)
+    engine = MockAsyncLLMEngine()
    assert engine.get_model_config() is not None
    assert engine.get_tokenizer() is not None
    assert engine.get_decoding_config() is not None

--- a/tests/async_engine/test_openapi_server.py
+++ b/tests/async_engine/test_openapi_server.py
-import openai  # use the official client for correctness check
-import pytest
-import pytest_asyncio
-
-from ..utils import VLLM_PATH, RemoteOpenAIServer
-
-# any model with a chat template should work here
-MODEL_NAME = "facebook/opt-125m"
-chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
-assert chatml_jinja_path.exists()
-
-
-@pytest.fixture(scope="module")
-def server():
-    args = [
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "float16",
-        "--max-model-len",
-        "2048",
-        "--enforce-eager",
-        "--chat-template",
-        str(chatml_jinja_path),
-    ]
-
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-        yield remote_server
-
-
-@pytest_asyncio.fixture
-async def client(server):
-    async with server.get_async_client() as async_client:
-        yield async_client
-
-
-@pytest.mark.asyncio
-async def test_check_models(client: openai.AsyncOpenAI):
-    models = await client.models.list()
-    models = models.data
-    served_model = models[0]
-    assert served_model.id == MODEL_NAME
-    assert all(model.root == MODEL_NAME for model in models)
-
-
-@pytest.mark.asyncio
-async def test_single_completion(client: openai.AsyncOpenAI):
-    completion = await client.completions.create(model=MODEL_NAME,
-                                                 prompt="Hello, my name is",
-                                                 max_tokens=5,
-                                                 temperature=0.0)
-
-    assert completion.id is not None
-    assert len(completion.choices) == 1
-    assert len(completion.choices[0].text) >= 5
-    assert completion.choices[0].finish_reason == "length"
-    assert completion.usage == openai.types.CompletionUsage(
-        completion_tokens=5, prompt_tokens=6, total_tokens=11)
-
-    # test using token IDs
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-    )
-    assert len(completion.choices[0].text) >= 5
-
-
-@pytest.mark.asyncio
-async def test_single_chat_session(client: openai.AsyncOpenAI):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role": "user",
-        "content": "what is 1+1?"
-    }]
-
-    # test single completion
-    chat_completion = await client.chat.completions.create(model=MODEL_NAME,
-                                                           messages=messages,
-                                                           max_tokens=10,
-                                                           logprobs=True,
-                                                           top_logprobs=5)
-    assert chat_completion.id is not None
-    assert len(chat_completion.choices) == 1
-
-    choice = chat_completion.choices[0]
-    assert choice.finish_reason == "length"
-    assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=55, total_tokens=65)
-
-    message = choice.message
-    assert message.content is not None and len(message.content) >= 10
-    assert message.role == "assistant"
-    messages.append({"role": "assistant", "content": message.content})
-
-    # test multi-turn dialogue
-    messages.append({"role": "user", "content": "express your result in json"})
-    chat_completion = await client.chat.completions.create(
-        model=MODEL_NAME,
-        messages=messages,
-        max_tokens=10,
-    )
-    message = chat_completion.choices[0].message
-    assert message.content is not None and len(message.content) >= 0
--- a/tests/entrypoints/openai/rpc/__init__.py
+++ b/tests/entrypoints/openai/rpc/__init__.py
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
-import os
-
 import pytest

+from vllm.compilation.backends import vllm_backend
+
+from .utils import TEST_MODELS, check_full_graph_support

-@pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"])
-def test_full_graph(model):
-    # make sure these models can be captured in full graph mode
-    os.environ["VLLM_TEST_DYNAMO_GRAPH_CAPTURE"] = "1"

-    from vllm import LLM, SamplingParams
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-    sampling_params = SamplingParams(temperature=0)
-    llm = LLM(model="meta-llama/Meta-Llama-3-8B",
-              enforce_eager=True,
-              load_format="dummy")
-    llm.generate(prompts, sampling_params)
+@pytest.mark.parametrize("model_info", TEST_MODELS)
+@pytest.mark.parametrize("backend", ["eager", vllm_backend])
+def test_full_graph(model_info, backend):
+    model = model_info[0]
+    model_kwargs = model_info[1]
+    check_full_graph_support(model, model_kwargs, backend, tp_size=1)
--- a/tests/compile/test_full_graph_multi_gpu.py
+++ b/tests/compile/test_full_graph_multi_gpu.py
+import pytest
+
+from vllm.compilation.backends import vllm_backend
+from vllm.utils import cuda_device_count_stateless
+
+from ..utils import fork_new_process_for_each_test
+from .utils import TEST_MODELS_SMOKE, check_full_graph_support
+
+
+@pytest.mark.parametrize("model_info", TEST_MODELS_SMOKE)
+@pytest.mark.parametrize("tp_size", [2])
+@pytest.mark.parametrize("backend", ["eager", vllm_backend])
+@fork_new_process_for_each_test
+def test_full_graph_multi_gpu(model_info, tp_size, backend):
+    model = model_info[0]
+    model_kwargs = model_info[1]
+
+    # Skip the test if there are not enough CUDA devices.
+    if cuda_device_count_stateless() < tp_size:
+        pytest.skip("Not enough CUDA devices for the test.")
+
+    check_full_graph_support(model, model_kwargs, backend, tp_size=tp_size)
--- a/tests/compile/test_full_graph_smoke.py
+++ b/tests/compile/test_full_graph_smoke.py
+import pytest
+
+from vllm.compilation.backends import vllm_backend
+
+from .utils import TEST_MODELS_SMOKE, check_full_graph_support
+
+
+@pytest.mark.parametrize("model_info", TEST_MODELS_SMOKE)
+@pytest.mark.parametrize("backend", ["eager", vllm_backend])
+def test_full_graph(model_info, backend):
+    model = model_info[0]
+    model_kwargs = model_info[1]
+    check_full_graph_support(model, model_kwargs, backend, tp_size=1)